# Winning Jeopardy
A jupyter notebook to analyse questions on joepardy to detect trends and aid future contestants.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

In [2]:
jeopardy = pd.read_csv("jeopardy.csv")
print(jeopardy.columns)
jeopardy.head()

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


Rename columns as there is a leading " " in mny column headers

In [3]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

Nomalise columns by removing case and punctuation

In [4]:
def normalise_str(s):
    s = s.lower()
    s = re.sub(r'[^\w\s]','',s)
    return s

In [5]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalise_str)
jeopardy['clean_question'].head()

0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
Name: clean_question, dtype: object

In [6]:
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalise_str)
jeopardy['clean_answer'].head()

0    copernicus
1    jim thorpe
2       arizona
3     mcdonalds
4    john adams
Name: clean_answer, dtype: object

Now to normalise dollar value

In [7]:
def normalise_usd(s):
    s = re.sub(r'[^\w\s]','',s)
    try:
        i = int(s)
    except:
        i = 0
    return i

jeopardy['clean_value'] = jeopardy['Value'].apply(normalise_usd)
jeopardy['clean_value'].head()

0    200
1    200
2    200
3    200
4    200
Name: clean_value, dtype: int64

Convert date to actual datetime

In [8]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])
jeopardy['Air Date'].head()

0   2004-12-31
1   2004-12-31
2   2004-12-31
3   2004-12-31
4   2004-12-31
Name: Air Date, dtype: datetime64[ns]

Calculate the number of repeated answers

In [9]:
def detect_dups(row):
    split_answer = row['clean_answer'].split(' ')
    split_question = row['clean_question'].split(' ')
    
    match_count = 0
    try:
        split_answer.remove('the')
    except:
        pass
    
    if len(split_answer) == 0:
        return 0
    
    for i in split_answer:
        if i in split_question:
            match_count += 1
            
    return match_count / len(split_answer) 

In [10]:
jeopardy['answer_in_question'] = jeopardy.apply(detect_dups, axis = 1)
jeopardy['answer_in_question'].mean()

0.06049325706933587

The answer only appears in the question 6% of the time

In [11]:
question_overlap = []
terms_used = set()

for index, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0
    for i in split_question:
        if i in terms_used:
            match_count += 1
        else:
            terms_used.add(i)
    if len(split_question) > 0:
        question_overlap.append(match_count/len(split_question))
    else:
        question_overlap.append(0)

jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap'].mean()

0.6925935056088584

Terms are re-used 70% of the time

### Explore terms that come up more commonly in high vlaue questions

In [12]:
def determine_vlaue(row):
    value = 0
    if row['clean_value'] > 800:
        value = 1
    return value

jeopardy['high_value'] = jeopardy.apply(determine_vlaue, axis=1)

In [14]:
def count_words(w):
    low_count = 0 
    high_count = 0
    
    for i, row in jeopardy.iterrows():
        split_question = row['clean_question'].split(' ')
        if w in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

observed_expected = []
terms_used = list(terms_used)
comparison_terms = terms_used[0:5]

for term in comparison_terms:
    observed_expected.append(count_words(term))

print(observed_expected)

[(0, 2), (0, 1), (0, 1), (0, 1), (1, 1)]


In [18]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

In [19]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=0.803925692253768, pvalue=0.3699222378079571),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.4448774816612795, pvalue=0.5047776487545996)]

nill significant