## 001_mk_archive_data
### Create Jeopardy Archive Data 
### James Wilson

In [4]:
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

### Scrape online archive data

In [5]:
# Scraper that walks the J! Archive backwards from a starting game id
def _text_or_none(tags, position):
    """Return the text of ``tags[position]``, or None when there is no tag at that position."""
    if position < len(tags):
        return tags[position].text
    return None


def jep_scraper(game_id = 6389, num_pages_to_extract = 300, error_time = 5):
    """Collect one row per contestant from consecutive J! Archive game pages.

    Starting at ``game_id``, each page is downloaded and parsed, then the
    "[<< previous game]" link is followed, until ``num_pages_to_extract`` pages
    have been read, a page has no previous-game link, or a request fails.

    Parameters
    ----------
    game_id : int or str
        Id of the first game page to read.
    num_pages_to_extract : int
        Maximum number of game pages to read.
    error_time : int or float
        Timeout, in seconds, passed to ``requests.get`` for every page.

    Returns
    -------
    output : list of list
        Three rows per page, each ``[nickname, anecdote, final_score, title]``.
        A field that cannot be found on the page is stored as ``None`` (it is
        never filled with a value left over from another contestant or page).
    error_catalog : list of str
        The page link, appended once for every problem reported on that page.

    Notes
    -----
    Problems are printed as ``Error <code> @ Game ID:<id>`` where the code is
    1 = final score, 2 = nickname, 3 = score remarks, 4 = contestant
    paragraph, 5 = page title. A failed download (timeout, connection error or
    HTTP error status) is printed, catalogued, and ends the walk so that the
    rows collected so far are still returned.
    """
    archive_link = "http://www.j-archive.com/showgame.php?game_id="
    current_game_id = str(game_id)

    output = []
    error_catalog = []

    for _ in range(num_pages_to_extract):
        jeopardy_archive_link = archive_link + current_game_id

        # pull page; the request itself is inside the try so a timeout cannot
        # discard everything that has already been scraped
        try:
            page_response = requests.get(jeopardy_archive_link, timeout = error_time)
            page_response.raise_for_status()
        except requests.RequestException as request_error:
            print("Request failed @ Game ID:" + current_game_id + " - " + str(request_error))
            error_catalog.append(jeopardy_archive_link)
            break

        page_content = BeautifulSoup(page_response.content, "html.parser")

        # scan the page once per field instead of once per contestant
        title_date = _text_or_none(page_content.find_all('title'), 0)
        paragraphs = page_content.find_all("p")
        # scores can be negative (see game 5913), so both classes are accepted;
        # cells 9-11 are the ones the original notebook treats as final scores
        score_cells = page_content.find_all(
            lambda tag: tag.name == 'td'
            and tag.get('class') in (['score_positive'], ['score_negative'])
        )[9:12]
        nickname_cells = page_content.find_all(
            lambda tag: tag.name == 'td' and tag.get('class') == ['score_player_nickname']
        )
        remark_cells = page_content.find_all(
            lambda tag: tag.name == 'td' and tag.get('class') == ['score_remarks']
        )

        anecdotes = []
        final_scores = []
        names = []
        problems = [5] if title_date is None else []

        # iterate through three contestants
        for j in range(3):
            anecdotes.append(_text_or_none(paragraphs, j))
            final_scores.append(_text_or_none(score_cells, j))
            names.append(_text_or_none(nickname_cells, j))

            if final_scores[j] is None:
                problems.append(1)
            if names[j] is None:
                problems.append(2)
            # remarks are not stored, but their absence flags mismatched pages
            # (typically special events)
            if j >= len(remark_cells):
                problems.append(3)
            if anecdotes[j] is None:
                problems.append(4)

        for code in problems:
            print("Error " + str(code) + " @ Game ID:" + current_game_id)
            error_catalog.append(jeopardy_archive_link)

        # reorder and correct data (anecdotes go 2 1 0 / names and scores go 0 1 2 )
        for j in range(3):
            output.append([names[j], anecdotes[2 - j], final_scores[j], title_date])

        # create link to next page from the "[<< previous game]" anchor, e.g.
        # <a href="showgame.php?game_id=6388">[&lt;&lt; previous game]</a>
        previous_links = page_content.find_all(
            lambda tag: tag.name == 'a'
            and tag.get('href')
            and tag.text == "[<< previous game]"
        )
        if not previous_links:
            # no older game to walk to: return what has been collected
            break

        id_match = re.search(r'game_id=(\d+)', previous_links[0].get('href'))
        if id_match is None:
            break
        current_game_id = id_match.group(1)

    #return items
    return output, error_catalog


In [6]:
# Scrape up to 3000 game pages, walking backwards from game 6389,
# allowing 30 seconds per request
output_table, error_table = jep_scraper(
    game_id=6389,
    num_pages_to_extract=3000,
    error_time=30,
)

Error 1 @ Game ID:6227
Error 1 @ Game ID:6227
Error 1 @ Game ID:6227
Error 1 @ Game ID:6226
Error 3 @ Game ID:6226
Error 1 @ Game ID:6226
Error 3 @ Game ID:6226
Error 1 @ Game ID:6226
Error 3 @ Game ID:6226
Error 1 @ Game ID:6224
Error 1 @ Game ID:6224
Error 1 @ Game ID:6224
Error 1 @ Game ID:6223
Error 3 @ Game ID:6223
Error 1 @ Game ID:6223
Error 3 @ Game ID:6223
Error 1 @ Game ID:6223
Error 3 @ Game ID:6223
Error 1 @ Game ID:3576
Error 1 @ Game ID:3576
Error 1 @ Game ID:3576
Error 1 @ Game ID:3575
Error 3 @ Game ID:3575
Error 1 @ Game ID:3575
Error 3 @ Game ID:3575
Error 1 @ Game ID:3575
Error 3 @ Game ID:3575


In [7]:
#Convert output to a dataframe and prepare for initial cleaning
# Columns are named at construction time so that an empty scrape still yields
# a frame with the expected schema instead of a length-mismatch ValueError.
archive = pd.DataFrame(
    output_table,
    columns=["nickname", "player_details", "final_score", "show_info"],
)

# Initialize the derived fields as empty strings
for field_name in ("full_name", "occupation", "hometown", "archive_info", "date"):
    archive[field_name] = ""

In [8]:
# Inspect the raw contestant description column
archive.loc[:, "player_details"]

0       Jason Zuffranieri, a math teacher from Albuque...
1       Maggie Lehrman, an editor and writer from Broo...
2       Michael Riggs, an educational therapist from T...
3       Jason Zuffranieri, a math teacher from Albuque...
4       Eric Kaplan, a retired OB/GYN physician from L...
                              ...                        
8995    Ann Rupel, a medical research assistant from N...
8996    Justin Budinoff, an administrative coordinator...
8997    Bruce Lin, a research scientist originally fro...
8998    Jennifer Laam, a teacher from Studio City, Cal...
8999      Scott Ahearn, an actor from the Bronx, New York
Name: player_details, Length: 9000, dtype: object

In [9]:
# Take the archive details and split them into usable contents!
# Vectorised .str operations replace the row-by-row loop: a row that lacks a
# comma, the word "from" or ", aired" (or holds a missing value) becomes NaN
# in the derived column instead of raising and aborting the whole cell.
player_details = archive["player_details"]
show_info = archive["show_info"]

# extract player details ("<name>, <occupation> from <hometown> (...)")
comma_parts = player_details.str.split(",")
# \b keeps the split on the whole word "from", not on letters inside another word
from_parts = player_details.str.split(r"\bfrom\b")
archive["full_name"] = comma_parts.str[0]
archive["occupation"] = comma_parts.str[1].str.split(r"\bfrom\b").str[0].str.strip()
archive["hometown"] = from_parts.str[1].str.split("(").str[0].str.strip()

# extra show info ("<archive info>, aired <date>")
aired_parts = show_info.str.split(", aired")
archive["archive_info"] = aired_parts.str[0].str.strip()
archive["date"] = aired_parts.str[1].str.strip()


In [10]:
# Preview the first five rows after splitting the fields
archive.head()

Unnamed: 0,nickname,player_details,final_score,show_info,full_name,occupation,hometown,archive_info,date
0,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$27,600","J! Archive - Show #8045, aired 2019-07-26",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8045,2019-07-26
1,Maggie,"Maggie Lehrman, an editor and writer from Broo...",$0,"J! Archive - Show #8045, aired 2019-07-26",Maggie Lehrman,an editor and writer,"Brooklyn, New York",J! Archive - Show #8045,2019-07-26
2,Michael,"Michael Riggs, an educational therapist from T...",$2,"J! Archive - Show #8045, aired 2019-07-26",Michael Riggs,an educational therapist,"Tustin, California",J! Archive - Show #8045,2019-07-26
3,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$4,400","J! Archive - Show #8044, aired 2019-07-25",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8044,2019-07-25
4,Eric,"Eric Kaplan, a retired OB/GYN physician from L...",$0,"J! Archive - Show #8044, aired 2019-07-25",Eric Kaplan,a retired OB/GYN physician,"Long Beach, California",J! Archive - Show #8044,2019-07-25


In [11]:
# Clean date
# The result is assigned back to the column: replace(..., inplace=True) on a
# selected column is chained assignment and does not reliably update `archive`
# (it has no effect under pandas Copy-on-Write).
archive['date'] = archive['date'].replace(to_replace=r':', value=r'', regex=True)
# Unparseable dates become NaT rather than raising
archive['date'] = pd.to_datetime(archive['date'], errors='coerce')
archive.head(5)

Unnamed: 0,nickname,player_details,final_score,show_info,full_name,occupation,hometown,archive_info,date
0,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$27,600","J! Archive - Show #8045, aired 2019-07-26",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8045,2019-07-26
1,Maggie,"Maggie Lehrman, an editor and writer from Broo...",$0,"J! Archive - Show #8045, aired 2019-07-26",Maggie Lehrman,an editor and writer,"Brooklyn, New York",J! Archive - Show #8045,2019-07-26
2,Michael,"Michael Riggs, an educational therapist from T...",$2,"J! Archive - Show #8045, aired 2019-07-26",Michael Riggs,an educational therapist,"Tustin, California",J! Archive - Show #8045,2019-07-26
3,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$4,400","J! Archive - Show #8044, aired 2019-07-25",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8044,2019-07-25
4,Eric,"Eric Kaplan, a retired OB/GYN physician from L...",$0,"J! Archive - Show #8044, aired 2019-07-25",Eric Kaplan,a retired OB/GYN physician,"Long Beach, California",J! Archive - Show #8044,2019-07-25


In [13]:
# Sanity check: (number of rows, number of columns) of the cleaned frame
archive.shape

(9000, 9)

In [12]:
# Write the cleaned archive to the raw data folder, with a header row and
# without the row index
archive.to_csv(
    '../data/raw/jeopardy_archive_data20210127.csv',
    index=False,
    header=True,
)