In [74]:
from requests import get
from bs4 import BeautifulSoup
import os
import pandas as pd
from time import strftime

### 1. Codeup Blog Articles

Visit Codeup's Blog and record the urls for at least 5 distinct blog posts. For each post, you should scrape at least the post's title and content.

Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article. The shape of each dictionary should look like this:

>{<br>  
    'title': 'the title of the article',  
    'content': 'the full text content of the article'<br>  
}

Plus any additional properties you think might be helpful.

<b>Bonus: Scrape the text of all the articles linked on `codeup's blog page`.</b>



In [2]:
# The web page that will be scraped:
url = 'https://codeup.com/blog/'

In [3]:
# Some websites don't accept the python-requests default user-agent
headers = {'User-Agent': 'Codeup Data Science'} 

In [4]:
# Requesting page information with the header so I don't get rejected
response = get(url, headers=headers)

In [5]:
# Turning page information into chunks of soup
soup = BeautifulSoup(response.text)

In [6]:
# The blog links I'm after are under a class called "entry-title", 
# so I'll look for those specifically
soup.select('.entry-title')[0]

<h2 class="entry-title"><a href="https://codeup.com/codeup-news/codeup-start-dates-for-march-2022/">Codeup Start Dates for March 2022</a></h2>

In [7]:
# This is how I will pull out just the url 
soup.select('.entry-title')[0].a['href']

'https://codeup.com/codeup-news/codeup-start-dates-for-march-2022/'

In [8]:
# Since I can't separate the href enmasse I will needed to do it individually
titles = soup.select('.entry-title')

links= []
for title in titles:
    links.append(title.a["href"])

links

['https://codeup.com/codeup-news/codeup-start-dates-for-march-2022/',
 'https://codeup.com/codeup-news/vet-tec-funding-dallas/',
 'https://codeup.com/codeup-news/dallas-campus-re-opens-with-new-grant-partner/',
 'https://codeup.com/dallas-newsletter/codeup-dallas-open-house/',
 'https://codeup.com/codeup-news/codeups-placement-team-continues-setting-records/',
 'https://codeup.com/it-training/it-certifications-101/',
 'https://codeup.com/cybersecurity/a-rise-in-cyber-attacks-means-opportunities-for-veterans-in-san-antonio/',
 'https://codeup.com/codeup-news/use-your-gi-bill-benefits-to-land-a-job-in-tech/',
 'https://codeup.com/tips-for-prospective-students/which-program-is-right-for-me-cyber-security-or-systems-engineering/',
 'https://codeup.com/it-training/what-the-heck-is-system-engineering/',
 'https://codeup.com/alumni-stories/from-speech-pathology-to-business-intelligence/',
 'https://codeup.com/behind-the-billboards/boris-behind-the-billboards/',
 'https://codeup.com/codeup-new

In [56]:
# Now that I have all the links from the main page, I want to get the article content
# I'll start by making a soup of a single article page to see what I need from it
response2 = get(links[0], headers= {'User-Agent': 'Codeup Data Science'})
soup2 = BeautifulSoup(response2.text)

In [57]:
# The title of the Article
soup2.select_one('.entry-title').text

'Codeup Start Dates for March 2022'

In [58]:
# The date the article was published
soup2.select_one('.published').text

'Jan 26, 2022'

In [59]:
# The actual written blog of the article 
soup2.select_one(".entry-content").text.strip()

'As we approach the end of January we wanted to look forward to our next start dates for all of our current programs.\nFull Stack Web Development – 3/7/22\nFull Stack Web Development is the first program we built and also our most popular. You’ve asked and we listened! Our next Web Development cohort will start on 3/7/2022 and is ENTIRELY VIRTUAL! THESE SEATS WILL GO FAST!\nAs one of the most in-demand jobs in the country, software and web development is the tech career with the newest jobs. In the U.S., there’s:\n\n1.5 million developer jobs*\n250,000 of them remain open\na high growth rate of 13%*\n\n\xa0\nData Science – 3/22/22\nOur first new Data Science class of 2022 starts Monday 3/22/2022 at our downtown campus at the Vogue building.\nWhy consider pivoting careers to Data Science?\n\n#1 job in America from 2016-2020 (Glassdoor*)\n650% increase in data science positions since 2012\nNearly 12 million new jobs between 2019 and 2029\n31% ten-year growth rate\n\nThe supply of data sc

In [69]:
# Now to make it into a repeatable function! :)

def front_page_links():
    """
    Short function to hit the codeup blog landing page and return a list of all the urls to further blog posts on the
    page.
    """
    response = get("https://codeup.com/blog/", headers={"user-agent": "Codeup Data Science"})
    soup = BeautifulSoup(response.text)
    links = [link.attrs["href"] for link in soup.select(".more-link")]

    return links

def blog_article(url):
    "Given a blog article url, extract the relevant information and return it as a dictionary."
    response = get(url, headers={"user-agent": "Codeup DS"})
    soup = BeautifulSoup(response.text)
    return {
        # The title of the Article
        "title": soup.select_one(".entry-title").text,
        # The date the article was published
        "published": soup.select_one(".published").text,
        # The actual written blog of the article 
        "content": soup.select_one(".entry-content").text.strip(),
    }

def fp_blogs():
    "Returns a dataframe where each row is a blog post from the front page of the codeup blogs."
    links = front_page_links()
    df = pd.DataFrame([blog_article(link) for link in links])
    return df

In [72]:
df = fp_blogs()
df

Unnamed: 0,title,published,content
0,Codeup Dallas Open House,"Nov 30, 2021",Come join us for the re-opening of our Dallas ...
1,Codeup’s Placement Team Continues Setting Records,"Nov 19, 2021",Our Placement Team is simply defined as a grou...
2,"IT Certifications 101: Why They Matter, and Wh...","Nov 18, 2021","AWS, Google, Azure, Red Hat, CompTIA…these are..."
3,A rise in cyber attacks means opportunities fo...,"Nov 17, 2021","In the last few months, the US has experienced..."
4,Use your GI Bill® benefits to Land a Job in Tech,"Nov 4, 2021","As the end of military service gets closer, ma..."
5,Which program is right for me: Cyber Security ...,"Oct 28, 2021",What IT Career should I choose?\nIf you’re thi...
6,What the Heck is System Engineering?,"Oct 21, 2021",Codeup offers a 13-week training program: Syst...
7,From Speech Pathology to Business Intelligence,"Oct 18, 2021","By: Alicia Gonzalez\nBefore Codeup, I was a ho..."
8,Boris – Behind the Billboards,"Oct 3, 2021",
9,Is Codeup the Best Bootcamp in San Antonio…or ...,"Sep 16, 2021",Looking for the best data science bootcamp in ...


In [76]:
# Save the blogs as json:

today = strftime('%Y-%m-%d')
fp_blogs().to_json(f'codeup_blog_{today}.json')

# InShorts

### 2. News Articles
  
We will now be scraping text data from inshorts, a website that provides a brief overview of many different topics.
  
Write a function that scrapes the news articles for the following topics:
  
Business
Sports
Technology
Entertainment
The end product of this should be a function named get_news_articles that returns a list of dictionaries, where each dictionary has this shape:

>{<br>  
    'title': 'The article title',  
    'content': 'The article content',  
    'category': 'business' # for example<br>  
}

Hints:

- Start by inspecting the website in your browser. Figure out which elements will be useful.  
- Start by creating a function that handles a single article and produces a dictionary like the one above.  
- Next create a function that will find all the articles on a single page and call the function you created in the last step for every article on the page.  
- Now create a function that will use the previous two functions to scrape the articles from all the pages that you need, and do any additional processing that needs to be done.

In [78]:
# Time for more soup!
url = 'https://www.inshorts.com/en/read/business'
response = get(url, headers={'user-agent': 'Codeup DS'})
soup = BeautifulSoup(response.text)

In [80]:
# Identify the class I want to select for each news article
cards = soup.select('.news-card')
len(cards)

25

In [82]:
# Article headline
headline = cards[0].find('span', itemprop = 'headline').text
headline

'RBI cancels licence of Maha-based Independence Co-operative Bank'

In [83]:
# Author of the Article
cards[0].find('span', class_ = 'author').text

'Shalini Ojha'

In [84]:
# Content of the Article
cards[0].find('div', itemprop = 'articleBody').text

"RBI has cancelled licence of Maharashtra-based Independence Co-operative Bank, citing inadequate capital. It will cease to carry on banking operations from the close of business on February 3. In the present situation, the bank won't be able to pay its depositors in full, RBI said. It added that the bank didn't comply with multiple sections of Banking Regulation Act, 1949. "

In [86]:
# Date Article was published
# Since these were all published today, all the dates will be the same
cards[0].find('span', clas ='date').text

'03 Feb 2022,Thursday'

In [89]:
def parse_news_card(card):
    'Given a news card object, returns a dictionary of the relevant information.'
    card_title = card.select_one('.news-card-title')
    output = {}
    output['title'] = card.find('span', itemprop = 'headline').text
    output['author'] = card.find('span', class_ = 'author').text
    output['content'] = card.find('div', itemprop = 'articleBody').text
    output['date'] = card.find('span', clas ='date').text
    return output


def parse_inshorts_page(url):
    '''Given a url, returns a dataframe where each row is a news article from the url.
    Infers the category from the last section of the url.'''
    category = url.split('/')[-1]
    response = get(url, headers={'user-agent': 'Codeup DS'})
    soup = BeautifulSoup(response.text)
    cards = soup.select('.news-card')
    df = pd.DataFrame([parse_news_card(card) for card in cards])
    df['category'] = category
    return df

def get_inshorts_articles():
    '''
    Returns a dataframe of news articles from the business, sports, technology, and entertainment sections of
    inshorts.
    '''
    url = 'https://inshorts.com/en/read/'
    categories = ['business', 'sports', 'technology', 'entertainment']
    df = pd.DataFrame()
    for cat in categories:
        df = pd.concat([df, pd.DataFrame(parse_inshorts_page(url + cat))])
    df = df.reset_index(drop=True)
    return df

In [90]:
df = get_inshorts_articles()
df

Unnamed: 0,title,author,content,date,category
0,RBI cancels licence of Maha-based Independence...,Shalini Ojha,RBI has cancelled licence of Maharashtra-based...,"03 Feb 2022,Thursday",business
1,Boost to EVs a big step: Windmill Capital,Roshan Gupta,"Increased use of EVs in public transport, spec...","03 Feb 2022,Thursday",business
2,Facebook parent Meta's $230-billion wipeout bi...,Pragya Swastik,Facebook's parent Meta's shares plunged 27% an...,"03 Feb 2022,Thursday",business
3,Melinda not to give majority of wealth to Gate...,Aishwarya Awasthi,"Melinda French Gates, Co-founder of the Bill a...","03 Feb 2022,Thursday",business
4,Facebook's daily active users fall for first t...,Pragya Swastik,Facebook has seen its daily active users (DAUs...,"03 Feb 2022,Thursday",business
...,...,...,...,...,...
95,"Told Deepika won't click pic with you, I'll do...",Kriti Kambiri,Actor Dhairya Karwa revealed that he went to D...,"03 Feb 2022,Thursday",entertainment
96,"Miss cleaning utensils, floor in Bigg Boss hou...",Ria Kapoor,Bigg Boss 15 runner-up Pratik Sehajpal says he...,"03 Feb 2022,Thursday",entertainment
97,Priyanka to star opposite Anthony Mackie in ac...,Kriti Kambiri,Actress Priyanka Chopra will be starring oppos...,"03 Feb 2022,Thursday",entertainment
98,I don't work thinking I'm so many films old: D...,Kriti Kambiri,"Actress Deepika Padukone, who made her Bollywo...","03 Feb 2022,Thursday",entertainment


In [91]:
# saving the dataframe as json:
today = strftime('%Y-%m-%d')
get_inshorts_articles().to_json(f'inshorts-{today}.json')