In [1]:
import os
import warnings

import pandas as pd
from bs4 import BeautifulSoup
from requests import get

## Lesson Notes

In [2]:
# Fetch the blog post that the following cells parse
url = 'https://codeup.com/codeups-data-science-career-accelerator-is-here/'
headers = {'User-Agent': 'Codeup Data Science'} # Some websites don't accept the python-requests default user-agent
response = get(url, headers=headers)

In [3]:
# Parse the downloaded page into a BeautifulSoup object
soup = BeautifulSoup(response.content, 'html.parser')

In [4]:
# soup

In [5]:
# Text of the page's <title> tag
soup.title.text

'Codeup’s Data Science Career Accelerator is Here! - Codeup'

In [6]:
# Full text of the <div itemprop="text"> element (the article body shown below)
soup.find('div', itemprop='text').text

'The rumors are true! The time has arrived. Codeup has officially opened applications to our new Data Science career accelerator, with only 25 seats available! This immersive program is one of a kind in San Antonio, and will help you land a job in\xa0Glassdoor’s #1 Best Job in America.Data Science is a method of providing actionable intelligence from data.\xa0The data revolution has hit San Antonio,\xa0resulting in an explosion in Data Scientist positions\xa0across companies like USAA, Accenture, Booz Allen Hamilton, and HEB. We’ve even seen\xa0UTSA invest $70 M for a Cybersecurity Center and School of Data Science.\xa0We built a program to specifically meet the growing demands of this industry.Our program will be 18 weeks long, full-time, hands-on, and project-based. Our curriculum development and instruction is led by Senior Data Scientist, Maggie Giust, who has worked at HEB, Capital Group, and Rackspace, along with input from dozens of practitioners and hiring partners. Students wi

In [7]:
# The post's headline is the <h1> tagged with itemprop="headline"
title = soup.find('h1', itemprop='headline').text
print(title)

Codeup’s Data Science Career Accelerator is Here!


In [8]:
# Keep the article body text and preview its first 250 characters
text = soup.find('div', itemprop='text').text
print(text[:250])

The rumors are true! The time has arrived. Codeup has officially opened applications to our new Data Science career accelerator, with only 25 seats available! This immersive program is one of a kind in San Antonio, and will help you land a job in Gla


In [9]:
# Codeup blog posts to scrape with get_blog_articles
urls = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/',
        'https://codeup.com/data-science-myths/',
        'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/',
        'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
        'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']

def get_blog_articles(urls, cache=False):
    '''
    Return a DataFrame of Codeup blog articles with columns `title` and `content`.

    Note that `cache` means the reverse of what its name suggests; the
    meaning is kept as-is so existing calls keep working:

    - cache == False (default): read the previously saved `big_blogs.csv`
      and return it. `urls` is not used and no request is made.
    - cache == True: scrape every url in `urls`, build the DataFrame,
      overwrite `big_blogs.csv` with it, and return it.

    Parameters
    ----------
    urls : iterable of str
        Blog post urls to scrape when cache == True.
    cache : bool, default False
        False reads the csv file, True performs a fresh scrape.

    Returns
    -------
    pandas.DataFrame
        One row per article, columns `title` and `content`. A scrape that
        yields no articles still returns (and saves) these two columns.

    Raises
    ------
    FileNotFoundError
        If cache == False and `big_blogs.csv` does not exist yet.
    requests.HTTPError
        If cache == True and a url answers with an error status code.
    '''
    if not cache:
        return pd.read_csv('big_blogs.csv', index_col=0)

    headers = {'User-Agent': 'Codeup Bayes Data Science'}

    # One dictionary per successfully parsed article
    articles = []

    for url in urls:
        # A timeout keeps one unresponsive server from hanging the whole scrape
        response = get(url, headers=headers, timeout=30)

        # Fail loudly on 4xx/5xx instead of parsing an error page
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        headline = soup.find('h1', itemprop='headline')
        body = soup.find('div', itemprop='text')

        # find() returns None when the element is absent; skip such pages with
        # a warning rather than crashing on `None.text` and losing every
        # article scraped so far
        if headline is None or body is None:
            warnings.warn(
                'Skipping {}: headline or article text not found'.format(url),
                stacklevel=2,
            )
            continue

        articles.append({'title': headline.text, 'content': body.text})

    # Explicit columns keep the schema stable even when nothing was scraped
    df = pd.DataFrame(articles, columns=['title', 'content'])

    # Write df to csv file for faster access
    df.to_csv('big_blogs.csv')

    return df

In [10]:
# cache=True scrapes the urls listed above and rewrites big_blogs.csv
blogs = get_blog_articles(urls=urls, cache=True)
blogs

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,By Dimitri Antoniou and Maggie GiustData Scien...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri AntoniouA week ago, Codeup launched..."
3,10 Tips to Crush It at the SA Tech Job Fair,10 Tips to Crush It at the SA Tech Job FairSA ...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


In [11]:
# Request Codeup's main blog page; the article urls are scraped from it in the next cells

url = 'https://codeup.com/resources/#blog'
headers = {'User-Agent': 'Codeup Data Science'} 

# Request the HTML
response = get(url, headers=headers)

# Create the soup object to parse the HTML
soup = BeautifulSoup(response.text, 'html.parser')

In [12]:
# Collect the anchor tags with the listing-link class (the "Read More" links in the output below)
# and preview the first two
link_list = soup.find_all('a', class_='jet-listing-dynamic-link__link')
link_list[:2]

[<a class="jet-listing-dynamic-link__link" href="https://codeup.com/bootcamp-to-bootcamp/"><span class="jet-listing-dynamic-link__label">Read More</span></a>,
 <a class="jet-listing-dynamic-link__link" href="https://codeup.com/how-to-get-started-on-a-programming-exercise/"><span class="jet-listing-dynamic-link__label">Read More</span></a>]

In [13]:
# Pull the href (link target) out of every anchor tag in link_list
urls = [link['href'] for link in link_list]

In [14]:
def get_all_urls():
    '''
    Scrape the main Codeup blog page and return the blog post urls found on it.

    Returns
    -------
    list of str
        The `href` of every `<a class="jet-listing-dynamic-link__link">`
        tag on the page, in document order.

    Raises
    ------
    requests.HTTPError
        If the blog page answers with an error status code.
    '''
    # The main Codeup blog page with all the urls
    url = 'https://codeup.com/resources/#blog'

    headers = {'User-Agent': 'Codeup Data Science'}

    # A timeout keeps an unresponsive server from hanging the notebook
    response = get(url, headers=headers, timeout=30)

    # Fail loudly on 4xx/5xx instead of silently returning an empty list
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # href=True keeps only anchors that actually carry an href attribute,
    # so the lookup below cannot raise KeyError
    link_list = soup.find_all(
        'a', class_='jet-listing-dynamic-link__link', href=True
    )

    return [link['href'] for link in link_list]

In [15]:
# Scrape every post linked from the main blog page
# (cache=True forces a fresh scrape and rewrites big_blogs.csv)
big_blogs = get_blog_articles(urls=get_all_urls(), cache=True)
big_blogs.head()

Unnamed: 0,title,content
0,From Bootcamp to Bootcamp: Two Military Vetera...,Are you a veteran or active-duty military memb...
1,How to Get Started On Any Programming Exercise,Programming is hard. Whether you’re just begin...
2,The Best Path to a Career in Data Science,"In our blog, “The Best Path To A Career In Sof..."
3,Getting Hired in a Remote Environment,As a career accelerator with a tuition refund ...
4,The Remote Codeup Student Experience,Communities across Texas have now lived in a r...


## Part 2

In [16]:
# Request the inshorts entertainment page and display response.ok as a quick success check
url = 'https://inshorts.com/en/read/entertainment'

response = get(url)
response.ok

True

In [17]:
# Parse the inshorts page HTML
soup = BeautifulSoup(response.text, 'html.parser')

In [18]:
# Scrape a ResultSet of all the news cards on the page, confirm its type,
# and look at the first card

cards = soup.find_all('div', class_='news-card')
print(type(cards))
cards[0]

<class 'bs4.element.ResultSet'>


<div class="news-card z-depth-1" itemscope="" itemtype="http://schema.org/NewsArticle">
<span content="" itemid="https://inshorts.com/en/news/preksha-was-upset-over-lockdown-extension-she-didnt-like-sitting-idle-father-1590681611617" itemprop="mainEntityOfPage" itemscope="" itemtype="https://schema.org/WebPage"></span>
<span itemprop="author" itemscope="itemscope" itemtype="https://schema.org/Person">
<span content="Daisy Mowke" itemprop="name"></span>
</span>
<span content="Preksha was upset over lockdown extension, she didn't like sitting idle: Father" itemprop="description"></span>
<span itemprop="image" itemscope="" itemtype="https://schema.org/ImageObject">
<meta content="https://static.inshorts.com/inshorts/images/v1/variants/jpg/m/2020/05_may/28_thu/img_1590679795346_88.jpg?" itemprop="url"/>
<meta content="864" itemprop="width"/>
<meta content="483" itemprop="height"/>
</span>
<span itemprop="publisher" itemscope="itemscope" itemtype="https://schema.org/Organization">
<span con

In [19]:
# Collect the headline text of every news card, then preview the first five

titles = [card.find('span', itemprop='headline').text for card in cards]

titles[:5]

["Preksha was upset over lockdown extension, she didn't like sitting idle: Father",
 'Ravi Mohan Saini, who won ₹1 cr in KBC Junior at 14, becomes SP of Porbandar',
 "Didn't wear this to 'hide my curves': Maanvi on her pic used for clothing store's ad",
 'FIR against Sa Re Ga Ma Pa runner-up from B’desh for remarks against PM Modi',
 "COVID-19 doesn't stop periods: Akshay urges fans to donate sanitary pads"]

In [20]:
# Collect the author name of every news card, then preview the first five

authors = [card.find('span', class_='author').text for card in cards]

authors[:5]


['Daisy Mowke', 'Pragya Swastik', 'Daisy Mowke', 'Apaar Sharma', 'Atul Mishra']

In [21]:
# Collect the body text of every news card, then preview the first two

texts = [card.find('div', itemprop='articleBody').text for card in cards]

texts[:2]

['Actress Preksha Mehta\'s father, while speaking about the suicide of his 25-year-old daughter, said, "Preksha used to remain restless as...shoots had halted due to lockdown." "She was upset over...extension of...lockdown...She never liked sitting idle," he added. "I [told] her...she shouldn\'t be worried as it\'s for everyone. We had no idea she\'ll take such a drastic step," her father stated.',
 "IPS officer Ravi Mohan Saini, who won ₹1 crore in KBC Junior when he was 14 years old, took charge as Superintendent of Police, Porbandar, Gujarat on Tuesday. Saini, who is now 33 years old, qualified for Indian Police Service in 2014 with AIR 461. A native of Rajasthan's Alwar, Saini is the son of a retired Navy officer."]

In [22]:
# Build the list of article dictionaries: one dictionary per news card,
# holding the card's headline, author and body text
articles = [
    {
        'title': card.find('span', itemprop='headline').text,
        'author': card.find('span', class_='author').text,
        'content': card.find('div', itemprop='articleBody').text,
    }
    for card in cards
]

In [23]:
# Check how many article dictionaries were collected (the original note expected 24-25)
# and preview the first two

print(len(articles))
articles[:2]

25


[{'title': "Preksha was upset over lockdown extension, she didn't like sitting idle: Father",
  'author': 'Daisy Mowke',
  'content': 'Actress Preksha Mehta\'s father, while speaking about the suicide of his 25-year-old daughter, said, "Preksha used to remain restless as...shoots had halted due to lockdown." "She was upset over...extension of...lockdown...She never liked sitting idle," he added. "I [told] her...she shouldn\'t be worried as it\'s for everyone. We had no idea she\'ll take such a drastic step," her father stated.'},
 {'title': 'Ravi Mohan Saini, who won ₹1 cr in KBC Junior at 14, becomes SP of Porbandar',
  'author': 'Pragya Swastik',
  'content': "IPS officer Ravi Mohan Saini, who won ₹1 crore in KBC Junior when he was 14 years old, took charge as Superintendent of Police, Porbandar, Gujarat on Tuesday. Saini, who is now 33 years old, qualified for Indian Police Service in 2014 with AIR 461. A native of Rajasthan's Alwar, Saini is the son of a retired Navy officer."}]

In [24]:
def get_news_articles(cache=False):
    '''
    Return a DataFrame of inshorts news articles with columns
    `topic`, `title`, `author` and `content`.

    Note that `cache` means the reverse of what its name suggests; the
    meaning is kept as-is so existing calls keep working:

    - cache == False (default): read the previously saved `articles.csv`
      and return it. No request is made.
    - cache == True: scrape the inshorts pages for the topics business,
      sports, technology and entertainment, overwrite `articles.csv` with
      the result, and return it.

    Parameters
    ----------
    cache : bool, default False
        False reads the csv file, True performs a fresh scrape.

    Returns
    -------
    pandas.DataFrame
        One row per news card. A scrape that yields no cards still returns
        (and saves) the four columns.

    Raises
    ------
    FileNotFoundError
        If cache == False and `articles.csv` does not exist yet.
    requests.HTTPError
        If cache == True and a topic page answers with an error status code.
    '''
    # default to read in a csv instead of scrape for df
    if not cache:
        return pd.read_csv('articles.csv', index_col=0)

    # Base url and headers used in every get request
    base_url = 'https://inshorts.com/en/read/'
    headers = {'User-Agent': 'Codeup Data Science'}

    # List of topics to scrape
    topics = ['business', 'sports', 'technology', 'entertainment']

    # One dictionary per successfully parsed news card
    articles = []

    for topic in topics:
        # A timeout keeps one unresponsive request from hanging the scrape
        response = get(base_url + topic, headers=headers, timeout=30)

        # Fail loudly on 4xx/5xx instead of parsing an error page
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        for card in soup.find_all('div', class_='news-card'):
            headline = card.find('span', itemprop='headline')
            byline = card.find('span', class_='author')
            body = card.find('div', itemprop='articleBody')

            # find() returns None when an element is absent; skip such cards
            # with a warning rather than crashing on `None.text`
            if headline is None or byline is None or body is None:
                warnings.warn(
                    'Skipping a {} news card with a missing headline, '
                    'author or body'.format(topic),
                    stacklevel=2,
                )
                continue

            articles.append({'topic': topic,
                             'title': headline.text,
                             'author': byline.text,
                             'content': body.text})

    # Explicit columns keep the schema stable even when nothing was scraped
    df = pd.DataFrame(articles, columns=['topic', 'title', 'author', 'content'])

    # Write df to csv for future use
    df.to_csv('articles.csv')

    return df

In [25]:
# Test our function with cache == True to do a fresh scrape and write to `articles.csv`

df = get_news_articles(cache=True)
df.head()

Unnamed: 0,topic,title,author,content
0,business,"Twitter CEO donates $10M to project giving $1,...",Pragya Swastik,Twitter's billionaire CEO Jack Dorsey has dona...
1,business,US firm buys Serum Institute parent's Czech un...,Krishna Veera Vanamali,US biotech firm Novavax has announced it's buy...
2,business,Microsoft in talks to buy 2.5% stake in Jio fo...,Anushka Dixit,Microsoft is in talks with Mukesh Ambani-led R...
3,business,Google in talks to buy 5% stake in Vodafone Id...,Krishna Veera Vanamali,Google is exploring an investment in Vodafone ...
4,business,25-year-old Anant Ambani joins $65 billion Jio...,Krishna Veera Vanamali,Asia's richest person Mukesh Ambani's 25-year-...


In [26]:
# cache=False reads articles.csv (written by the previous cell) instead of scraping
df = get_news_articles(cache=False)
df.head()

Unnamed: 0,topic,title,author,content
0,business,"Twitter CEO donates $10M to project giving $1,...",Pragya Swastik,Twitter's billionaire CEO Jack Dorsey has dona...
1,business,US firm buys Serum Institute parent's Czech un...,Krishna Veera Vanamali,US biotech firm Novavax has announced it's buy...
2,business,Microsoft in talks to buy 2.5% stake in Jio fo...,Anushka Dixit,Microsoft is in talks with Mukesh Ambani-led R...
3,business,Google in talks to buy 5% stake in Vodafone Id...,Krishna Veera Vanamali,Google is exploring an investment in Vodafone ...
4,business,25-year-old Anant Ambani joins $65 billion Jio...,Krishna Veera Vanamali,Asia's richest person Mukesh Ambani's 25-year-...
