In [24]:
import pandas as pd
import re
from scipy.stats import chisquare
import numpy as np

jeopardy = pd.read_csv("jeopardy.csv")

jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [25]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [26]:
j_cols = jeopardy.columns.values

for i, c in enumerate(j_cols):
    j_cols[i] = c.strip()
    
jeopardy.columns = j_cols

In [27]:
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [28]:
def normalize_text(string):
    string = string.lower()
    to_remove = "[^A-Za-z0-9\s]"
    string = re.sub(to_remove, "", string)
    
    return string

jeopardy["clean_question"] = jeopardy["Question"].apply(normalize_text)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize_text)

In [29]:
def normalize_values(string):
    to_remove = "[^A-Za-z0-9\s]"
    string = re.sub(to_remove, "", string)
    try:
        string = int(string)
    except Exception:
        string = 0
    return string

jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_values)

In [30]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer', 'clean_question', 'clean_answer', 'clean_value'],
      dtype='object')

In [31]:
def question1(row):
    split_answer = row[8].split(" ")
    split_question = row[7].split(" ")
    match_count = 0
    
    if "the" in split_answer:
        split_answer.remove("the")
    
    if len(split_answer) == 0:
        return 0
    
    for word in split_answer:
        if word in split_question:
            match_count += 1
            
    normalized_match_count = match_count/len(split_answer)

    return normalized_match_count

jeopardy["answer_in_question"] = jeopardy.apply(question1,axis=1)

print(jeopardy["answer_in_question"].mean())

0.0604932570693


This very low mean suggests that only 6% of word used in answers are also used in the question to those answers. The implication of this is that it is beneficial to study past questions and general knowledge.

In [32]:
question_overlap = []
terms_used = set()

for row in jeopardy.iterrows():
    split_question = row[1][7].split(" ")
    split_question = [x for x in split_question if len(x) > 5]
    match_count = 0
    
    for word in split_question:
        if word in terms_used:
            match_count += 1
        
    for word in split_question:
        terms_used.add(word)
    
    if len(split_question) > 0:
        match_count = match_count/len(split_question)
        
    question_overlap.append(match_count)

jeopardy["question_overlap"] = question_overlap

print(jeopardy["question_overlap"].mean())

0.690873731567


This value suggests that 70% of meaningful words used in questions have already been used. This suggests there are some new topics being introduced, but mostly question topics are being recycled.

In [33]:
def high_value(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    return value
    
jeopardy["high_value"] = jeopardy.apply(high_value, axis=1)

def high_low_count(word):
    low_count = 0
    high_count = 0
    
    for row in jeopardy.iterrows():
        if word in row[1][7].split(" "):
            if row[1][10] == 1:
                high_count += 1
            else:
                low_count += 1
        
    return high_count, low_count

observed_expected = []

comparison_terms = list(terms_used)[:20]

for term in comparison_terms:
    observed_expected.append(high_low_count(term))

In [34]:
print(observed_expected)

[(0, 1), (0, 1), (0, 8), (0, 12), (0, 1), (0, 1), (0, 1), (0, 1), (0, 1), (0, 1), (0, 5), (0, 1), (0, 20), (0, 2), (0, 1), (0, 2), (0, 1), (0, 1), (0, 1), (0, 1)]


In [35]:
high_value_count = len(jeopardy[jeopardy["high_value"]==1])
low_value_count = len(jeopardy[jeopardy["high_value"]==0])
chi_squared = []

for ob in observed_expected:
    total = ob[0] + ob[1]
    total_prop = total/len(jeopardy)
    e_count_high = total_prop * high_value_count
    e_count_low = total_prop * low_value_count
    
    observed = np.array([ob[0], ob[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))
    
    chi, p = chisquare()

NameError: name 'obs' is not defined