# Winning Jeopardy

In [1]:
import pandas as pd
import numpy as np

jeopardy = pd.read_csv("jeopardy.csv")

In [2]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [3]:
jeopardy.columns = jeopardy.columns.str.replace(" ", "")

In [4]:
jeopardy.columns

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [5]:
import re

def normalizer(text):
    text = text.lower()
    text = re.sub("[^A-Za-z0-9\s]","",text)
    text = re.sub("\s+", " ", text)
    return text

In [6]:
jeopardy['clean_question'] = jeopardy["Question"].apply(normalizer)
jeopardy['clean_answer'] = jeopardy["Answer"].apply(normalizer)

In [7]:
def normalize_values(text):
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    try:
        text = int(text)
    except Exception:
        text = 0
    return text

In [8]:
jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_values)

In [9]:
pd.to_datetime(jeopardy["AirDate"])

0       2004-12-31
1       2004-12-31
2       2004-12-31
3       2004-12-31
4       2004-12-31
5       2004-12-31
6       2004-12-31
7       2004-12-31
8       2004-12-31
9       2004-12-31
10      2004-12-31
11      2004-12-31
12      2004-12-31
13      2004-12-31
14      2004-12-31
15      2004-12-31
16      2004-12-31
17      2004-12-31
18      2004-12-31
19      2004-12-31
20      2004-12-31
21      2004-12-31
22      2004-12-31
23      2004-12-31
24      2004-12-31
25      2004-12-31
26      2004-12-31
27      2004-12-31
28      2004-12-31
29      2004-12-31
           ...    
19969   2009-05-14
19970   2009-05-14
19971   2009-05-14
19972   2009-05-14
19973   2009-05-14
19974   2009-05-14
19975   2009-05-14
19976   2009-05-14
19977   2009-05-14
19978   2009-05-14
19979   2009-05-14
19980   2009-05-14
19981   2009-05-14
19982   2009-05-14
19983   2009-05-14
19984   2009-05-14
19985   2009-05-14
19986   2009-05-14
19987   2009-05-14
19988   2000-03-14
19989   2000-03-14
19990   2000

In [10]:
def splitter(row):
    #split every word in columns
    split_answer = row["clean_answer"].split()
    split_question = row["clean_question"].split()
    
    match_count = 0 
    #remove "the" because it doesn't have any meaningful use
    if "the" in split_answer: 
        split_answer.remove("the")
    if len(split_answer) == 0:
        return
    #Count how many times terms in clean_answer occur in clean_question
    for item in split_answer:
        if item in split_question:
            match_count += 1 
            
    return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(splitter, axis = 1)

ans_in_ques_mean = jeopardy["answer_in_question"].mean()

In [11]:
ans_in_ques_mean

0.05900491564307945

## Recycled questions

In [12]:
question_overlap = []
terms_used = set()

jeopardy["AirDate"] = jeopardy["AirDate"].sort_values(ascending = False)

for i, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [x for x in split_question if len(x) > 5]
    
    match_count = 0 
    
    for word in split_question: 
        if word in terms_used:
            match_count += 1
    for word in split_question: 
        terms_used.add(word)
        
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)
    
jeopardy["question_overlap"] = question_overlap

All rows were filtered, we keep only rows which have six or more characters in words to filter out from "the", "than", etc. which are commonly used, but don't tell you about a lot about a question.

## Low value vs high value questions

In [13]:
def determine_value(row):
    value = 0
    if int(row["clean_value"]) > 800:
        value = 1
    else: 
        value = 0
    return value

In [14]:
jeopardy["high_value"] = jeopardy.apply(determine_value, axis = 1)

In [15]:
def count_usage(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if term in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [16]:
from random import choice

terms_used_list = list(terms_used)
comparison_terms = [choice(terms_used_list) for _ in range(10)]

observed_expected = []

for term in comparison_terms:
    observed_expected.append(count_usage(term))

observed_expected

[(1, 2),
 (3, 6),
 (2, 2),
 (0, 1),
 (3, 4),
 (1, 2),
 (1, 0),
 (0, 1),
 (0, 1),
 (1, 10)]

## Applying the chi-squared test

In [22]:
from scipy.stats import chisquare

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []

for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0],obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed,expected))

In [23]:
chi_squared

[Power_divergenceResult(statistic=0.03188116723440362, pvalue=0.8582887163235293),
 Power_divergenceResult(statistic=0.09564350170321084, pvalue=0.75712159875701),
 Power_divergenceResult(statistic=0.889754963322559, pvalue=0.3455437191483468),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.6887906561130311, pvalue=0.4065760282166111),
 Power_divergenceResult(statistic=0.03188116723440362, pvalue=0.8582887163235293),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=2.0621887936258245, pvalue=0.15099314777510028)]

None of the terms had a significant difference in usage between high value and low value rows. Additionally, the frequencies were all lower than 5, so the chi-squared test isn't as valid. It would be better to run this test with only terms that have higher frequencies.