## Intro to Web Scraping with Python


### 0. Set Up 

In [30]:
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup  #main webscraping package 
import requests # allows you to send https requests 
import pandas as pd # for manipulating data into data frames 
import re # regular expressions package 

Example) Jeopardy Archive Website :  https://j-archive.com/

### 1) Capture and review webpage

In [31]:
# Set Practice Link
jeopardy_archive_link = "http://www.j-archive.com/showgame.php?game_id=6623"
# Get the requested page 
page_response = requests.get(jeopardy_archive_link)

In [32]:
page_response

<Response [200]>

In [33]:
# review response 
page_response.status_code

# What do these codes mean ?  See here: https://www.restapitutorial.com/httpstatuscodes.html
# 200 = Success! 
# 400 = Client Error 

# page_response.raise_for_status()

200

In [34]:
# extract content 
page_content = BeautifulSoup(page_response.content, "html.parser")
#page_content

In [35]:
# use prettify to make output more readable 
#print(page_content.prettify())

### 2) Review main web components 

In [36]:
# Review title contents 
page_content.find_all('title')

[<title>J! Archive - Show #8215, aired 2020-05-01</title>]

In [37]:
# Also use the id feature to pull out these items 
page_content.find_all(id = 'game_title')

[<div id="game_title"><h1>Show #8215 - Friday, May 1, 2020</h1></div>]

In [38]:
# Review paragraph contents 
page_content.find_all("p")

[<p class="contestants"><a href="http://www.j-archive.com/showplayer.php?player_id=13288" rel="external">Quemars Ahmed</a>, a field organizer from La CaÃ±ada Flintridge, California</p>,
 <p class="contestants"><a href="http://www.j-archive.com/showplayer.php?player_id=13289" rel="external">Ashleigh McCord</a>, a marine resource management specialist from Beverly, Massachusetts</p>,
 <p class="contestants"><a href="http://www.j-archive.com/showplayer.php?player_id=13286" rel="external">Jesse Laymon</a>, a public policy director from Long Island City, New York (whose 1-day cash winnings total $13,000)</p>,
 <p><a href="wageringcalculator.php?a=15400&amp;b=15000&amp;c=10600&amp;player_a=Jesse&amp;player_b=Quemars&amp;player_c=Ashleigh">[wagering suggestions for these scores]</a></p>]

In [39]:
# can specify on the class of the page content
page_content.find_all("p", class_ = 'contestants') 

[<p class="contestants"><a href="http://www.j-archive.com/showplayer.php?player_id=13288" rel="external">Quemars Ahmed</a>, a field organizer from La CaÃ±ada Flintridge, California</p>,
 <p class="contestants"><a href="http://www.j-archive.com/showplayer.php?player_id=13289" rel="external">Ashleigh McCord</a>, a marine resource management specialist from Beverly, Massachusetts</p>,
 <p class="contestants"><a href="http://www.j-archive.com/showplayer.php?player_id=13286" rel="external">Jesse Laymon</a>, a public policy director from Long Island City, New York (whose 1-day cash winnings total $13,000)</p>]

In [40]:
# Review table contents 
elems = page_content.find_all(lambda tag: tag.name == 'td')

# Another way to view this unstructured data 
#for elem in elems:
#    print(elem, end='\n'*2)


In [41]:
# just review scores 
page_content.find_all(lambda tag: tag.get('class') == ['score_positive'])

[<td class="score_positive">$5,600</td>,
 <td class="score_positive">$200</td>,
 <td class="score_positive">$9,400</td>,
 <td class="score_positive">$3,400</td>,
 <td class="score_positive">$15,400</td>,
 <td class="score_positive">$10,600</td>,
 <td class="score_positive">$15,000</td>,
 <td class="score_positive">$30,400</td>,
 <td class="score_positive">$8,800</td>,
 <td class="score_positive">$0</td>,
 <td class="score_positive">$14,400</td>,
 <td class="score_positive">$10,600</td>,
 <td class="score_positive">$13,400</td>]

### 3) Find next page link 

In [42]:
# Review hyperlinks <a> tags 
page_content.find_all(lambda tag: tag.name == 'a')[0:20]

[<a href="http://www.j-archive.com"><img alt="J! Archive" height="22" src="http://www.j-archive.com/j-a.gif" width="100"/></a>,
 <a href="http://www.j-archive.com/showseason.php?season=36">[current season]</a>,
 <a href="http://www.j-archive.com/showseason.php?season=35">[last season]</a>,
 <a href="http://www.j-archive.com/listseasons.php">[all seasons]</a>,
 <a href="http://www.j-archive.com/listprizes.php">[prizes]</a>,
 <a href="http://www.j-archive.com/wageringcalculator.php">[wagering calculator]</a>,
 <a href="http://www.j-archive.com/help.php">[help]</a>,
 <a href="showgame.php?game_id=6622">[&lt;&lt; previous game]</a>,
 <a href="http://www.j-archive.com/showplayer.php?player_id=13288" rel="external">Quemars Ahmed</a>,
 <a href="http://www.j-archive.com/showplayer.php?player_id=13289" rel="external">Ashleigh McCord</a>,
 <a href="http://www.j-archive.com/showplayer.php?player_id=13286" rel="external">Jesse Laymon</a>,
 <a href="showgame.php?game_id=6651">[next game &gt;&gt;]</

In [43]:
# Can specify on the text of the page contents 
    # <a href="showgame.php?game_id=6158">[&lt;&lt; previous game]</a>
page_content.find_all(lambda tag: tag.name == 'a' and tag.text == "[<< previous game]")


[<a href="showgame.php?game_id=6622">[&lt;&lt; previous game]</a>]

In [44]:
new_game_link = page_content.find_all(lambda tag: tag.name == 'a' and 
                                                    tag.get('href') and 
                                                    tag.text == "[<< previous game]")
print(new_game_link)

[<a href="showgame.php?game_id=6622">[&lt;&lt; previous game]</a>]


In [45]:
#[<a href="showgame.php?game_id=6388">[&lt;&lt; previous game]</a>]
# WE WANT THE GAME ID ----------^^^^

# USE REGEX TO CAPTURE JUST DIGITS, and extract to a string variable [0]
re.findall(r'\d+', str(new_game_link[0]))[0] 


'6622'

In [46]:
new_game_id = re.findall(r'\d+', str(new_game_link[0]))[0]

# create next page link 
updated_jeopardy_archive_link = "http://www.j-archive.com/showgame.php?game_id=" + str(new_game_id)
updated_jeopardy_archive_link

'http://www.j-archive.com/showgame.php?game_id=6622'

### Full Web Scraping Process
Start with a recent game and move backwards through time ( Game ID 6623 OR 6389 )
http://www.j-archive.com/showgame.php?game_id=6623

In [47]:
# Define Function that sets a game id and number of pages to extract from the jeopardy archive 
def jep_scraper(game_id = 6389, num_pages_to_extract = 300):
    
    # initialize variables 
    index = 0
    new_game_id = 0
    
    output = []
    error_catalog = []
    
    archive_link = "http://www.j-archive.com/showgame.php?game_id="
    jeopardy_archive_link = archive_link + str(game_id)
    
    while index < num_pages_to_extract:
        # pull page
        page_response = requests.get(jeopardy_archive_link, timeout=5) # get page, timeout set to 5 seconds 
    
        # confirm no error with pull 
        try: 
            page_response.raise_for_status()
            pass
        except:
            print("HTML ERROR CODE: " + str(page_response.status_code))
    
        page_content = BeautifulSoup(page_response.content, "html.parser")
    
        # create empty variables to fill 
        anecdotes = []
        final_scores = []
        names = []
        show_info1 = []
        show_info2 = []
        show_info3 = []
    
        #title date 
        title_date = page_content.find_all('title')[0].text # clean to just date (?)
    
        # iterate through three contestants
        for j in range(0, 3):
            #Find all anecdotes for contestants 
            paragraphs = page_content.find_all("p")[j].text
        
            # Final all final scores for contestants 
            try:
                table1 = page_content.find_all(lambda tag: tag.name == 'td' and 
                                           
                                           (tag.get('class') == ['score_positive'] 
                                            or
                                            tag.get('class') == ['score_negative'] # some scores can be negative!! See 5913
                                           )
                                          )[9:12][j].text
                pass
        
            except IndexError:
                print("Error 1 @ Game ID:" + str(new_game_id)) # error output - used to locate mismatched fields 
                error_catalog.append(jeopardy_archive_link)  # store links from archive (typically special events)
        
            #extract player names and remarks 
            try:
                table2 = page_content.find_all(lambda tag: tag.name == 'td' and 
                                   tag.get('class') == ['score_player_nickname'])[j].text
                pass
        
            except IndexError:
                print("Error 2 @ Game ID:" + str(new_game_id)) # error output - used to locate mismatched fields 
                error_catalog.append(jeopardy_archive_link)  # store links from archive (typically special events)
        
            try:
                table3 = page_content.find_all(lambda tag: tag.name == 'td' and 
                                   tag.get('class') == ['score_remarks'])[j].text
                pass
        
            except IndexError:
                print("Error 3 @ Game ID:" + str(new_game_id)) # error output - used to locate mismatched fields 
                error_catalog.append(jeopardy_archive_link)  # store links from archive (typically special events)
        
        
            #append these player details together
            anecdotes.append(paragraphs)
            final_scores.append(table1)
            names.append(table2)
            

        # reorder and correct data (anecdotes go 2 1 0 / names and scores go 0 1 2 )
        show_info1.extend([names[0],anecdotes[2],final_scores[0],title_date])
        show_info2.extend([names[1],anecdotes[1],final_scores[1],title_date])
        show_info3.extend([names[2],anecdotes[0],final_scores[2],title_date])
    
        #add to final output file
        output.append(show_info1)
        output.append(show_info2)
        output.append(show_info3)
    
        #create link to next page
        #create previous page number
        new_game_id = page_content.find_all(lambda tag: tag.name == 'a' and 
                                                    tag.get('href') and 
                                                    tag.text == "[<< previous game]")
        #print(new_game_id)
        #[<a href="showgame.php?game_id=6388">[&lt;&lt; previous game]</a>]
        # WE WANT THE GAME ID ----------^^^^

        new_game_id = re.findall(r'\d+', str(new_game_id[0]))[0] # USE REGEX TO CAPTURE JUST DIGITS 

        # create next page link 
        jeopardy_archive_link = archive_link + new_game_id
        jeopardy_archive_link
    
        #update iterator 
        index = index + 1 

    #return items 
    return output, error_catalog 


In [48]:
# Run the scraper on the predefined results 
output_table,error_table = jep_scraper(game_id = 6389, num_pages_to_extract = 300)

Error 1 @ Game ID:6227
Error 1 @ Game ID:6227
Error 1 @ Game ID:6227
Error 1 @ Game ID:6226
Error 3 @ Game ID:6226
Error 1 @ Game ID:6226
Error 3 @ Game ID:6226
Error 1 @ Game ID:6226
Error 3 @ Game ID:6226
Error 1 @ Game ID:6224
Error 1 @ Game ID:6224
Error 1 @ Game ID:6224
Error 1 @ Game ID:6223
Error 3 @ Game ID:6223
Error 1 @ Game ID:6223
Error 3 @ Game ID:6223
Error 1 @ Game ID:6223
Error 3 @ Game ID:6223


In [55]:
#print(output_table)

In [50]:
#print(error_table)

['http://www.j-archive.com/showgame.php?game_id=6227', 'http://www.j-archive.com/showgame.php?game_id=6227', 'http://www.j-archive.com/showgame.php?game_id=6227', 'http://www.j-archive.com/showgame.php?game_id=6226', 'http://www.j-archive.com/showgame.php?game_id=6226', 'http://www.j-archive.com/showgame.php?game_id=6226', 'http://www.j-archive.com/showgame.php?game_id=6226', 'http://www.j-archive.com/showgame.php?game_id=6226', 'http://www.j-archive.com/showgame.php?game_id=6226', 'http://www.j-archive.com/showgame.php?game_id=6224', 'http://www.j-archive.com/showgame.php?game_id=6224', 'http://www.j-archive.com/showgame.php?game_id=6224', 'http://www.j-archive.com/showgame.php?game_id=6223', 'http://www.j-archive.com/showgame.php?game_id=6223', 'http://www.j-archive.com/showgame.php?game_id=6223', 'http://www.j-archive.com/showgame.php?game_id=6223', 'http://www.j-archive.com/showgame.php?game_id=6223', 'http://www.j-archive.com/showgame.php?game_id=6223']


In [51]:
#Convert output to a dataframe and prepare for initial cleaning 
archive = pd.DataFrame.from_records(output_table)
archive.columns = ["nickname", "player_details", "final_score", "show_info"]

# Initialize Field Name
archive["full_name"] = ""
archive["occupation"] = ""
archive["hometown"] = ""
archive["archive_info"] = ""
archive["date"] = ""

In [52]:
archive["player_details"]

0      Jason Zuffranieri, a math teacher from Albuque...
1      Maggie Lehrman, an editor and writer from Broo...
2      Michael Riggs, an educational therapist from T...
3      Jason Zuffranieri, a math teacher from Albuque...
4      Eric Kaplan, a retired OB-GYN physician from L...
                             ...                        
895    Randy Mathews, a fragrance specialist from Hou...
896    Alyssa Abel, a bookseller from Waupaca, Wisconsin
897    Kristin Robbins, an attorney from Red Bank, Ne...
898    Kate Jovin, a social worker from Somerville, M...
899    Rex Moroux, a commercial real estate broker fr...
Name: player_details, Length: 900, dtype: object

In [53]:
# Take the archive details and split them into usable contents! 
for i in range(0,len(archive)):
    
    # extract player details 
    archive.at[i,"full_name"] = archive["player_details"].iloc[i].split(",")[0]
    archive.at[i,"occupation"] = archive["player_details"].iloc[i].split(",")[1].split("from")[0].strip()
    archive.at[i,"hometown"] = archive["player_details"].iloc[i].split("from")[1].split("(")[0].strip()
        
    # extra show info 
    archive.at[i,"archive_info"] = archive["show_info"].iloc[i].split(", aired")[0].strip()
    archive.at[i,"date"] = archive["show_info"].iloc[i].split(", aired")[1].strip()


In [54]:
archive.head(10)

Unnamed: 0,nickname,player_details,final_score,show_info,full_name,occupation,hometown,archive_info,date
0,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$27,600","J! Archive - Show #8045, aired 2019-07-26",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8045,2019-07-26
1,Maggie,"Maggie Lehrman, an editor and writer from Broo...",$0,"J! Archive - Show #8045, aired 2019-07-26",Maggie Lehrman,an editor and writer,"Brooklyn, New York",J! Archive - Show #8045,2019-07-26
2,Michael,"Michael Riggs, an educational therapist from T...",$2,"J! Archive - Show #8045, aired 2019-07-26",Michael Riggs,an educational therapist,"Tustin, California",J! Archive - Show #8045,2019-07-26
3,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$4,400","J! Archive - Show #8044, aired 2019-07-25",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8044,2019-07-25
4,Eric,"Eric Kaplan, a retired OB-GYN physician from L...",$0,"J! Archive - Show #8044, aired 2019-07-25",Eric Kaplan,a retired OB-GYN physician,"Long Beach, California",J! Archive - Show #8044,2019-07-25
5,Shari,"Shari Meyer, a high school English teacher fro...",$0,"J! Archive - Show #8044, aired 2019-07-25",Shari Meyer,a high school English teacher,"Somerville, Massachusetts",J! Archive - Show #8044,2019-07-25
6,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$30,000","J! Archive - Show #8043, aired 2019-07-24",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8043,2019-07-24
7,Michelle,"Michelle Bruck, an attorney from Levittown, Pe...","$3,000","J! Archive - Show #8043, aired 2019-07-24",Michelle Bruck,an attorney,"Levittown, Pennsylvania",J! Archive - Show #8043,2019-07-24
8,Corin,"Corin Purifoy, a fiber artist from Milwaukee, ...","$6,100","J! Archive - Show #8043, aired 2019-07-24",Corin Purifoy,a fiber artist,"Milwaukee, Wisconsin",J! Archive - Show #8043,2019-07-24
9,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$30,100","J! Archive - Show #8042, aired 2019-07-23",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8042,2019-07-23


In [None]:
# Clean date
archive['date'].replace(regex=True,inplace=True,to_replace=r':',value=r'')
archive['date'] = pd.to_datetime(archive['date'], errors='coerce')
archive.head(5)

In [None]:
# Save your data
# archive.to_csv('../Data/jeopardy_archive_data.csv', index = None, header=True) #Don't forget to add '.csv' at the end of the path!