## This is a cleaned-up workbook that walks through the development of the scraping functions for individual Jeopardy games (shows). The .py function file has since been updated, so the first part is slightly out of date, but it still shows the overall pattern of how the acquisition works.

----

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import requests
import time
from bs4 import BeautifulSoup as bs

import seaborn as sns
import matplotlib.pyplot as plt

### Create Base Soup Object - Single Page Run

In [None]:
# Fetch one archived game page and parse it; the cells below all read from `soup`.
# A timeout keeps the cell from hanging forever if the site does not respond.
r = requests.get('https://j-archive.com/showgame.php?game_id=3576', timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were a game.
r.raise_for_status()
# Name the parser explicitly so the parse tree does not depend on which optional
# parsers happen to be installed (this also avoids bs4's "no parser specified" warning).
soup = bs(r.content, 'html.parser')

### Get Show Info

In [None]:
# Text content of the <div id="game_title"> element.
title_div = soup.find('div', id='game_title')
show_info = title_div.get_text()

In [None]:
# Pick fields out of the title text purely by position:
#   show_num - first run of digits
#   show_yr  - third run of digits, minus its first two characters
#   show_mo  - third run of two or more ASCII letters
digit_runs = re.findall(r'\d+', show_info)
letter_runs = re.findall(r'[a-zA-Z]{2,}', show_info)

show_num = digit_runs[0]
show_yr = digit_runs[2][2:]
show_mo = letter_runs[2]

### Identify Daily Doubles

In [None]:
# Number every <td class="clue"> cell in document order (starting at 1) and flag
# the ones that contain a <td class="clue_value_daily_double"> element.
clue_cells = soup.find_all('td', class_='clue')
dd_list = [
    {
        'q_index': position,
        'is_DD': 1 if cell.find('td', class_='clue_value_daily_double') else 0,
    }
    for position, cell in enumerate(clue_cells, start=1)
]
df_dd = pd.DataFrame(dd_list)

### Identify Answers and Triple Stumpers

In [None]:
# Collect the correct response and a Triple Stumper flag for each board clue.
# Only the first 60 <td class="clue"> cells are read (same cut-off as before);
# q_index is the cell's 1-based position, the same numbering the Daily Double
# cell uses.
answers = []

for counter, clue in enumerate(soup.find_all('td', class_='clue')[:60], start=1):
    hover = clue.find('div', onmouseover=True)
    if hover is None:
        # No hover markup in this cell: skip it instead of crashing on None.
        # enumerate keeps counting, so the q_index of later cells does not shift.
        continue
    ans = hover.get('onmouseover')
    # The attribute value embeds an HTML fragment; the response is read from its <em>.
    cls = bs(ans)
    response = cls.find('em')
    if response is None:
        continue
    answers.append({
        'q_index': counter,
        'answer': response.text,
        'is_stumper': 1 if 'Triple Stumper' in ans else 0,
    })
df_ans = pd.DataFrame(answers)

### Create Question List

In [None]:
# Clue values for the five rows of each board, kept as strings because the
# level-assignment cell compares against string literals such as '200'.
# First list: 200..1000 in steps of 200; second list: 400..2000 in steps of 400.
j_value = [str(amount) for amount in range(200, 1001, 200)]
dj_value = [str(amount) for amount in range(400, 2001, 400)]

In [None]:
# Jeopardy Categories
# Text of every <td class="category_name"> inside <div id="jeopardy_round">.
j_round_div = soup.find('div', id='jeopardy_round')
j_category_list = j_round_div.find_all('td', class_='category_name')
j_categories = [cell.get_text() for cell in j_category_list]

In [None]:
# Jeopardy:
# One row per clue of the first round, walking 6 categories (i) x 5 values (j).
# Clue cells are looked up by id 'clue_J_<column>_<row>' (both 1-based).
questions = []

for i in range(6):
    for j in range(5):
        clue = f'clue_J_{i+1}_{j+1}'
        clue_cell = soup.find('td', attrs={'id': clue})
        if clue_cell is None:
            # No such cell on the page: skip the clue rather than crash on
            # None.text, matching how the Double Jeopardy cell treats missing clues.
            continue

        questions.append({
            'show_num': show_num,
            'show_yr': show_yr,
            'show_mo': show_mo,
            'round': 'Jeopardy',
            'category': j_categories[i],
            'value': j_value[j],
            'clue': clue_cell.text,
            # Position counted across each row of the board: 1..30.
            'q_index': (i + 1) + (6 * j),
        })
df_j_qs = pd.DataFrame(questions)

In [None]:
# Double Jeopardy Categories
# Text of every <td class="category_name"> inside <div id="double_jeopardy_round">.
dj_round_div = soup.find('div', id='double_jeopardy_round')
dj_category_list = dj_round_div.find_all('td', class_='category_name')
dj_categories = [cell.get_text() for cell in dj_category_list]

In [None]:
# Double Jeopardy
# One row per clue of the second round, walking 6 categories (i) x 5 values (j).
# Clue cells are looked up by id 'clue_DJ_<column>_<row>' (both 1-based).
questions = []

for i in range(6):
    for j in range(5):
        clue = f'clue_DJ_{i+1}_{j+1}'
        clue_cell = soup.find('td', attrs={'id': clue})
        if clue_cell is None:
            # Clue is absent from the page: leave it out. An explicit check
            # replaces the former bare `except:`, which would also have hidden
            # unrelated errors (and KeyboardInterrupt).
            continue

        questions.append({
            # Positions continue after the 30 first-round slots: 31..60.
            'q_index': 30 + (i + 1) + (6 * j),
            'show_num': show_num,
            'show_yr': show_yr,
            'show_mo': show_mo,
            'round': 'Double Jeopardy',
            'category': dj_categories[i],
            'value': dj_value[j],
            'clue': clue_cell.text,
        })
df_dj_qs = pd.DataFrame(questions)

In [None]:
# Final Jeopardy
#
# No q_index is set here: this row is appended only after the board rounds have
# been merged on q_index and that column has been dropped.

fj = soup.find('table', class_='final_round')

# The onmouseover attribute carries an HTML fragment; the response is read from its <em>.
ans = fj.find('div', onmouseover=True).get('onmouseover')
cls = bs(ans)

f_j = {
    'show_num': show_num,
    'show_yr': show_yr,
    'show_mo': show_mo,
    'round': 'Final Jeopardy',
    'value': 'FJ',
    'is_DD': 0,
    'category': fj.find('td', class_='category_name').text,
    'clue': fj.find('td', attrs={'id': 'clue_FJ'}).text,
    'answer': cls.find('em').text,
    'is_stumper': int('Triple Stumper' in ans),
}

# Single-row frame so it can be concatenated with the board rounds.
df_fj = pd.DataFrame([f_j])

### Combine Dataframes

In [None]:
# Stack the two board rounds. reset_index() without drop=True keeps the old row
# labels as an 'index' column, which the next cell drops.
board_rounds = [df_j_qs, df_dj_qs]
df_js = pd.concat(board_rounds).reset_index()

In [None]:
# Attach the Daily Double flags and the answers by q_index, then discard the
# two helper columns.
with_dd = df_js.merge(df_dd, on='q_index')
with_answers = with_dd.merge(df_ans, on='q_index')
df_game = with_answers.drop(columns=['index', 'q_index'])

# Append the Final Jeopardy row; reset_index() again leaves an 'index' column behind.
all_rounds = [df_game, df_fj]
df_game = pd.concat(all_rounds).reset_index()
df_game

### Add Level (Difficulty)

In [None]:
# Difficulty level; the first matching rule wins:
#   4  Daily Double
#   5  Triple Stumper
#   6  Final Jeopardy
#   1  value '200' or '400'
#   2  value '600', '800' or '1200'
#   3  value '1000', '1600' or '2000'
#   'XX' for anything else
# NOTE(review): the fallback is a string while the other outcomes are ints, so
# NumPy may promote the whole result to strings — confirm that is intended.
is_daily_double = df_game['is_DD'] == 1
is_triple_stumper = df_game['is_stumper'] == 1
is_final_round = df_game['round'] == 'Final Jeopardy'
low_value = (df_game['value'] == '200') | (df_game['value'] == '400')
mid_value = (df_game['value'] == '600') | (df_game['value'] == '800') | (df_game['value'] == '1200')
high_value = (df_game['value'] == '1000') | (df_game['value'] == '1600') | (df_game['value'] == '2000')

df_game['level'] = np.where(
    is_daily_double, 4,
    np.where(
        is_triple_stumper, 5,
        np.where(
            is_final_round, 6,
            np.where(
                low_value, 1,
                np.where(
                    mid_value, 2,
                    np.where(high_value, 3, 'XX'),
                ),
            ),
        ),
    ),
)

In [None]:
# Write the assembled game table to a CSV file (relative path, so it lands in
# the current working directory).
df_game.to_csv('watson_g1_j_round.csv')

---------

# Functions

In [None]:
import acquire

In [None]:
# Game page passed to each acquire function exercised in the cells below.
test_url = 'https://j-archive.com/showgame.php?game_id=6833'

#### Testing Daily Doubles

In [None]:
# Call acquire.is_dd on the test page and display what it returns.
# (Rebinds df_dd, which the workbook section above also assigned.)
df_dd = acquire.is_dd(test_url)
df_dd

*Error finders and messengers can be added here in the future, e.g. checking that the Daily Double value count equals 3.*

#### Testing Answers and Triple Stumper Detector

In [None]:
# Call acquire.answers on the test page and display what it returns.
df_ans = acquire.answers(test_url)
df_ans

#### Jeopardy Questions

In [None]:
# Call acquire.j_qs on the test page and display what it returns.
df_j_qs = acquire.j_qs(test_url)
df_j_qs

#### Double Jeopardy Questions

In [None]:
# Call acquire.dj_qs on the test page and display what it returns.
df_dj_qs = acquire.dj_qs(test_url)
df_dj_qs

#### Final Jeopardy

In [None]:
# Call acquire.fj on the test page and display what it returns.
df_fj = acquire.fj(test_url)
df_fj

#### Show Dataframe

In [None]:
# Hand the five per-step results from the cells above to acquire.show_dataframe
# and display the combined result.
df_game = acquire.show_dataframe(df_dd, df_ans, df_j_qs, df_dj_qs, df_fj)
df_game

#### Add Level

In [None]:
# Pass the combined table through acquire.add_level and display the result.
df_game = acquire.add_level(df_game)
df_game

In [None]:
# Print the column names, dtypes and non-null counts of the game table.
df_game.info()

#### Altogether Now

In [None]:
# Single call to acquire.acquire_show on the test page, then save the result to CSV.
# NOTE(review): test_url points at game_id=6833 while the file is named
# show_8248 — confirm the two refer to the same show.
df_game2 = acquire.acquire_show(test_url)
df_game2.to_csv('show_8248.csv')

In [None]:
# NOTE(review): unlike test_url, this argument is a relative path with no
# scheme or host — presumably acquire_shows prepends the site root; verify
# against acquire.py.
output = acquire.acquire_shows('showgame.php?game_id=7407')

In [None]:
# Display 25 randomly chosen rows of the result as a spot check.
output.sample(25)

In [None]:
# Load jeopardy_games.csv (relative path) into a DataFrame.
df = pd.read_csv('jeopardy_games.csv')

In [None]:
# Display the loaded table.
df

In [None]:
# Build a CSV path for each name: <path><name>.csv
path = 'cached_games/'
dataframe_list = ['a', 'b', 'c']
new_list = [path + name + '.csv' for name in dataframe_list]

In [None]:
from acquire import merge_dataframes

In [None]:
# One single-letter name per piece to merge: 'a' through 'k'.
df_list = list('abcdefghijk')

In [None]:
# Combine the listed pieces into one DataFrame with acquire.merge_dataframes.
# This rebinds `df`, replacing the frame read from jeopardy_games.csv.
# NOTE(review): presumably each letter names a cached CSV (cf. the
# 'cached_games/' path built a few cells earlier) — verify against acquire.py.
df = merge_dataframes(df_list)

In [None]:
# Print the column names, dtypes and non-null counts of the merged table.
df.info()

In [None]:
# Keep only rows whose value is not the 'FJ' marker.
has_board_value = df['value'] != 'FJ'
df_values = df[has_board_value]

In [None]:
# Display the filtered table.
df_values

In [None]:
# With the 'FJ' rows filtered out, cast the value column to integers.
column_dtypes = {'value': 'int'}
df_values = df_values.astype(dtype=column_dtypes)

In [None]:
# Remove the column named 'index'.
df_values = df_values.drop('index', axis='columns')

In [None]:
# Display the table after the cast and column drop.
df_values

In [None]:
# Rows flagged as Daily Doubles (is_DD equal to 1).
daily_double_rows = df_values['is_DD'] == 1
df_dd = df_values[daily_double_rows]

In [None]:
# Split the Daily Double rows by the round they appeared in.
dd_round = df_dd['round']
df_dd_j = df_dd[dd_round == 'Jeopardy']
df_dd_dj = df_dd[dd_round == 'Double Jeopardy']

In [None]:
# Share of Jeopardy-round Daily Doubles at each clue value (fractions, not counts).
df_dd_j['value'].value_counts(normalize = True)

In [None]:
# Share of Double Jeopardy-round Daily Doubles at each clue value (fractions, not counts).
df_dd_dj['value'].value_counts(normalize = True)

Cute observation - if the answer is 'Null', pandas (not Python itself) reads it in as a NaN!

In [None]:
# Replace every missing value in df with the literal string 'Null'.
df = df.fillna(value='Null')

In [None]:
# The 50 most frequent answers and how often each occurs.
df.answer.value_counts().nlargest(50)

In [None]:
# Every row whose answer occurs more than once (keep=False marks all copies).
df[df.answer.duplicated(keep=False)]

Number of Final Jeopardies

In [None]:
# Rows belonging to the Final Jeopardy round.
is_final_round = df['round'] == 'Final Jeopardy'
fj = df[is_final_round]

In [None]:
# The 50 most frequent Final Jeopardy answers and how often each occurs.
fj.answer.value_counts().nlargest(50)

In [None]:
# Rows flagged as Triple Stumpers. The `== True` is an element-wise comparison
# on the column and is kept exactly as it was.
stumper_rows = df['is_stumper'] == True
stumpers = df[stumper_rows]

In [None]:
# Distinct clue values among the Triple Stumpers, ordered by how often they occur.
stumpers['value'].value_counts().index

In [None]:
# Distribution of clue values among Triple Stumpers.
# `stumpers` is cut from `df`, whose value column can still contain the
# non-numeric 'FJ' marker, so coerce to numbers and drop whatever cannot be
# converted before plotting.
stumper_values = pd.to_numeric(stumpers['value'], errors='coerce').dropna()

plt.figure(figsize=(24, 24))
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is
# its documented replacement.
sns.histplot(stumper_values, kde=True, stat='density')

In [None]:
# Show full cell contents when displaying DataFrames instead of truncating long text.
pd.set_option('display.max_colwidth', None)

In [None]:
# Final Jeopardy clues that mention 'Dutch'.
# The name `final_jeopardy` is never defined in this notebook; the Final
# Jeopardy subset built a few cells earlier is called `fj`, so use that.
[n for n in fj.clue if 'Dutch' in n]

get rid of all seasons pre- 