In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Codeup Blog Articles

## Visit Codeup's Blog and record the urls for at least 5 distinct blog posts. For each post, you should scrape at least the post's title and content.

- https://codeup.edu/featured/women-in-tech-panelist-spotlight/
- https://codeup.edu/featured/women-in-tech-rachel-robbins-mayhill/
- https://codeup.edu/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/
- https://codeup.edu/events/women-in-tech-madeleine/
- https://codeup.edu/codeup-news/panelist-spotlight-4/

## Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article. The shape of each dictionary should look like this:
```

{
    'title': 'the title of the article',
    'content': 'the full text content of the article'
}
```
### Plus any additional properties you think might be helpful.

In [2]:
def get_blog_articles(urls):
    articles_data = []
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    
    for url in urls:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        title = soup.find('h1').text.strip()
        content = ' '.join([p.text.strip() for p in soup.find_all('p')])
        
        article = {
            'title': title,
            'content': content
        }
        articles_data.append(article)
    
    return articles_data

urls_to_scrape = [
    "https://codeup.edu/alumni-stories/how-i-paid-43-for-my-codeup-tuition/",
    "https://codeup.edu/featured/women-in-tech-panelist-spotlight/",
    "https://codeup.edu/featured/women-in-tech-rachel-robbins-mayhill/",
    "https://codeup.edu/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/",
    "https://codeup.edu/events/women-in-tech-madeleine/",
    "https://codeup.edu/codeup-news/panelist-spotlight-4/",
]


articles_data = get_blog_articles(urls_to_scrape)

for idx, article in enumerate(articles_data, start=1):
    print(f"Article {idx}")
    print(article)
    print()


Article 1

Article 2
{'title': 'Women in tech: Panelist Spotlight – Magdalena Rahn', 'content': 'Mar 28, 2023 | Events, Featured Codeup is hosting a Women in Tech Panel in honor of Women’s History Month on March 29th, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as women in the tech industry!  Meet Magdalena! Magdalena Rahn is a current Codeup student in a Data Science cohort in San Antonio, Texas. She has a professional background in cross-cultural communications, international business development, the wine industry and journalism. After serving in the US Navy, she decided to complement her professional skill set by attending the Data Science program at Codeup; she is set to graduate in March 2023. Magdalena is fluent in French, Bulgarian, Chinese-Mandarin, Spanish and Italian. We asked Magdalena how Codeup impacted her career, and she replied “Codeup has provided a solid founda

# News Articles

## We will now be scraping text data from inshorts, a website that provides a brief overview of many different topics.

## Write a function that scrapes the news articles for the following topics:
```
Business
Sports
Technology
Entertainment
```

In [3]:
def scrape_articles(urls):
    articles_data = []
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    
    for url in urls:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        
        
        title_element = soup.find('span', itemprop='headline')
        if title_element:
            title = title_element.text.strip()
        else:
            title = "Title not found"
        content = soup.find(itemprop="articleBody")
        if content:
            content = content.text.strip()
        else:
            content = "Title not found"
        
        category = url.split('/')[-1]
        
        article = {
            'title': title,
            'content': content,
            'category': category
            
        }
        articles_data.append(article)
    
    return articles_data

urls_to_scrape = [
    "https://inshorts.com/en/read/business",
    "https://inshorts.com/en/read/sports",
    "https://inshorts.com/en/read/technology",
    "https://inshorts.com/en/read/entertainment",
]


articles_data = scrape_articles(urls_to_scrape)

for idx, article in enumerate(articles_data, start=1):
    print(f"Article {idx}")
    print(article)
    


Article 1
{'title': 'EX-WSJ reporter ends Indian hacker lawsuit against Dechert', 'content': 'Former Wall Street Journal reporter Jay Solomon ended his lawsuit against Dechert and two of its former partners. Solomon had sued Dechert and others last year, alleging that the firm worked with hackers from India to steal emails between him and one of his sources. Solomon alleged the messages were later circulated in a successful effort to get him fired.', 'category': 'business'}
Article 2
{'title': "Hardik's bowling changes in 5th T20I vs WI looked confused: Mukund", 'content': 'Ex-India opener Abhinav Mukund said Hardik Pandya looked confused with regard to his bowling changes in fifth T20I against West Indies. "I would\'ve started with Axar Patel which was the winning formula in...fourth T20I. Axar Patel should\'ve started bowling with the new ball [alongside] Arshdeep Singh," he added. West Indies defeated India by eight wickets in the fifth T20I.', 'category': 'sports'}
Article 3
{'titl

## The end product of this should be a function named get_news_articles that returns a list of dictionaries, where each dictionary has this shape:

```
{
    'title': 'The article title',
    'content': 'The article content',
    'category': 'business' # for example
}
```
### Hints:
```
Start by inspecting the website in your browser. Figure out which elements will be useful.
Start by creating a function that handles a single article and produces a dictionary like the one above.
```

# Next create a function that will find all the articles on a single page and call the function you created in the last step for every article on the page.

In [4]:
url ="https://inshorts.com/en/read/sports"
requests.get(url)


<Response [200]>

In [5]:
url = "https://inshorts.com/en/read/sports"

response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

articles = []

article_cards = soup.find_all(itemtype = 'http://schema.org/NewsArticle')

for card in article_cards:
    headline = card.find('span', itemprop='headline').text
    summary = card.find('div', itemprop='articleBody').text
    category = url.split('/')[-1]
    
    articles.append({
        'headline': headline,
        'summary': summary,
        'category': category,
    })



In [6]:
articles

[{'headline': "Hardik's bowling changes in 5th T20I vs WI looked confused: Mukund",
  'summary': 'Ex-India opener Abhinav Mukund said Hardik Pandya looked confused with regard to his bowling changes in fifth T20I against West Indies. "I would\'ve started with Axar Patel which was the winning formula in...fourth T20I. Axar Patel should\'ve started bowling with the new ball [alongside] Arshdeep Singh," he added. West Indies defeated India by eight wickets in the fifth T20I.',
  'category': 'sports'},
 {'headline': "De Bruyne's injury vs Burnley termed 'serious', out for 3-4 months",
  'summary': "Midfielder Kevin De Bruyne could be sidelined for three to four months after suffering a hamstring injury in their season-opening Premier League game. The Belgium international left the field in the 36th minute of City's 3-0 win against Burnley. Manager Pep Guardiola confirmed that De Bruyne suffered a recurrence of the injury he picked up during the Champions League final.",
  'category': 'spor

In [9]:
def scrape_inshorts_articles(urls):
    for url in urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        articles = []

        article_cards = soup.find_all(itemtype='http://schema.org/NewsArticle')

        for card in article_cards:
            headline = card.find('span', itemprop='headline').text
            summary = card.find('div', itemprop='articleBody').text
            category = url.split('/')[-1]
            
            articles.append({
                'headline': headline,
                'summary': summary,
                'category': category,
            })

        print("Articles from:", url)
        for article in articles:
            print("Headline:", article['headline'])
            print("Summary:", article['summary'])
            print("="*30)

if __name__ == "__main__":
    urls = [
        "https://inshorts.com/en/read/business",
        "https://inshorts.com/en/read/sports",
        "https://inshorts.com/en/read/technology",
        "https://inshorts.com/en/read/entertainment"
    ]
    scrape_inshorts_articles(urls)


Articles from: https://inshorts.com/en/read/business
Headline: EX-WSJ reporter ends Indian hacker lawsuit against Dechert
Summary: Former Wall Street Journal reporter Jay Solomon ended his lawsuit against Dechert and two of its former partners. Solomon had sued Dechert and others last year, alleging that the firm worked with hackers from India to steal emails between him and one of his sources. Solomon alleged the messages were later circulated in a successful effort to get him fired.

Headline: Fitch warns it may be forced to downgrade several US banks: Report
Summary: A Fitch Ratings' analyst warned that US banks, including JPMorgan Chase, could be downgraded if the agency further cuts its assessment of the industry's operating environment, according to a report from CNBC. In June, Fitch lowered the score of the US banking industry's "operating environment" to AA- from AA, citing pressure on the country's credit rating, gaps in regulatory framework.
Headline: Govt hikes windfall tax 

# Now create a function that will use the previous two functions to scrape the articles from all the pages that you need, and do any additional processing that needs to be done.
### Bonus: cache the data

In [10]:
def scrape_inshorts_articles(urls):
    articles = []

    for url in urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        article_cards = soup.find_all(itemtype='http://schema.org/NewsArticle')

        for card in article_cards:
            headline = card.find('span', itemprop='headline').text
            summary = card.find('div', itemprop='articleBody').text
            category = url.split('/')[-1]

            articles.append({
                'headline': headline,
                'summary': summary,
                'category': category
            })

    df = pd.DataFrame(articles)
    return df


    base_url = "https://inshorts.com/en/read/"
    pages = [
        "business",
        "sports",
        "technology",
        "entertainment",
        "india",
        "politics",
        "startups",
        "hatke",
        "international",
        "automobile",
        "science",
        "travel",
        "miscellaneous",
        "fashion",
        "education",    
    ]      

    urls = [base_url + page for page in pages]
    result_df = scrape_inshorts_articles(urls)
    


In [11]:
result_df

NameError: name 'result_df' is not defined

In [None]:
result_df.to_csv("inshorts_articles.csv", index=False)

# Write your code such that the acquired data is saved locally in some form or fashion. Your functions that retrieve the data should prefer to read the local data instead of having to make all the requests everytime the function is called. Include a boolean flag in the functions to allow the data to be acquired "fresh" from the actual sources (re-writing your local cache).