In [7]:
import matplotlib.pyplot as ply
%matplotlib inline
from scipy.stats import chisquare
import pandas as pd
import numpy as np
import os
import re

In [8]:
home = os.path.expanduser('~')
jeopardy = pd.read_csv(f"{home}/Dropbox/My-Portfolio/DataScience/Data/jeopardy.csv")
clean_cols = [col.strip() for col in jeopardy.columns]
jeopardy.columns = clean_cols
jeopardy.head(3)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona


In [9]:
def normalize_text(text):
    norm = text.lower()
    #norm_sentence =  "".join([c for c in norm if c.isalnum() or c == ' '])
    norm_sentence = re.sub('[^a-zA-Z0-9 ]+', '', norm)
    return norm_sentence.strip()

for col in ['Question','Answer','Value']:
    jeopardy[col] = jeopardy[col].apply(normalize_text)
jeopardy['Value'] = pd.to_numeric(jeopardy['Value'], errors='coerce')
jeopardy['Value'].fillna(0, inplace=True)
jeopardy['Value'] = jeopardy['Value'].astype(int)
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])
jeopardy.head(2)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,200,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,200,no 2 1912 olympian football star at carlisle i...,jim thorpe


In [10]:
def answer_in_question(row):
    question_words = [w for w in row['Question'].split(' ') 
                      if w not in ('the', 'a')]
    answer_words = row['Answer'].split(' ')
    matches =  set([q for q in question_words if q in answer_words])
    if len(matches) == 0 or len(answer_words) == 0:
        return 0
    return len(matches) / len(answer_words)

jeopardy['answer_in_question'] = jeopardy.apply(answer_in_question, axis='columns')
(jeopardy['answer_in_question'].value_counts(normalize=True, bins=4) * 100).sort_index()

(-0.002, 0.25]    92.549627
(0.25, 0.5]        6.570329
(0.5, 0.75]        0.335017
(0.75, 1.0]        0.545027
Name: answer_in_question, dtype: float64

In [11]:
def repeated_questions(question, found_words):
    question_words = [w for w in question.split(' ') if len(w) >= 6]
    found = [q for q in question_words if q in found_words]
    found_words.update(question_words)
    if len(found) == 0 or len(question_words) == 0:
        return 0
    return len(found) / len(question_words)

found_words = set()
jeopardy['repeated_question'] = jeopardy['Question'].apply(repeated_questions, args=(found_words,))
(jeopardy['repeated_question'].value_counts(normalize=True, bins=4) * 100).sort_index()

(-0.002, 0.25]    10.940547
(0.25, 0.5]       17.730887
(0.5, 0.75]       26.016301
(0.75, 1.0]       45.312266
Name: repeated_question, dtype: float64

In [12]:
def high_value_words(row, found_words, word_values):
    question_value = 'high' if row['Value'] > 800 else 'low'
    question = set(row['Question'].split(' '))
    found = question & found_words
    if len(found) > 0:
        for word in found:
            word_values.setdefault(word,{'high':0, 'low':0})[question_value] += 1

word_values = {}
jeopardy.apply(high_value_words, args=(found_words, word_values),axis='columns')
word_values = pd.DataFrame(word_values).transpose()
word_values.sort_values('high', inplace=True, ascending=False)
word_values.head(10)

Unnamed: 0,high,low
called,168,346
country,141,332
french,108,133
targetblankherea,97,146
became,79,203
famous,78,168
played,77,212
american,77,174
island,73,134
before,71,191


In [13]:
def chisquare_high_value_words(row):
    if row['high'] > 0 and row['high'] > row['low']:
        chi, p = chisquare(row.values, ddof=0)
        return p
    else:
        return np.nan
word_values['significant_high_value'] = word_values.apply(chisquare_high_value_words, axis='columns')
word_values.sort_values('significant_high_value', inplace=True)
word_values

Unnamed: 0,high,low,significant_high_value
monitora,35,13,0.001496
largely,5,0,0.025347
travis,4,0,0.045500
thinker,4,0,0.045500
democrats,4,0,0.045500
expatriate,4,0,0.045500
targetblankdr,4,0,0.045500
backwards,4,0,0.045500
creates,6,1,0.058782
raisins,3,0,0.083265
