In [1]:
import pandas as pd
import datetime
import string
from scipy.stats import chisquare
import numpy as np

In [2]:
jeopardy = pd.read_csv('jeopardy.csv')
print(jeopardy.head(5))
print(jeopardy.columns)

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  
Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype

In [3]:
col = list(jeopardy.columns)
col = pd.Series(col).str.replace(' ', '')
jeopardy.columns = col
print(jeopardy.columns)

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


In [4]:
#jeopardy.info()
#jeopardy['Value'].value_counts()
jeopardy.dtypes


ShowNumber     int64
AirDate       object
Round         object
Category      object
Value         object
Question      object
Answer        object
dtype: object

In [5]:
jeopardy['AirDate'] = pd.to_datetime(jeopardy['AirDate'])
jeopardy['Value'] = pd.to_numeric(jeopardy['Value'].str.replace('$',''), errors='coerce')
jeopardy['Value'] = jeopardy['Value'].fillna(0)
#jeopardy.describe(include='all')

In [6]:
def string_normalize(series_in):
    series_in = series_in.lower()
    series_in = series_in.translate(str.maketrans("","", string.punctuation))
    return series_in

In [7]:
jeopardy['Question'] = jeopardy['Question'].apply(string_normalize)
jeopardy['Answer'] = jeopardy['Answer'].apply(string_normalize)
jeopardy.describe(include='all')

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer
count,19999.0,19999,19999,19999,19999.0,19999,19999
unique,,336,4,3581,,19987,14226
top,,2007-11-13 00:00:00,Jeopardy!,TELEVISION,,audio clue,japan
freq,,62,9901,51,,5,22
first,,1984-09-21 00:00:00,,,,,
last,,2012-01-19 00:00:00,,,,,
mean,4312.730537,,,,661.39392,,
std,1374.121672,,,,506.136278,,
min,10.0,,,,0.0,,
25%,3393.0,,,,300.0,,


In [8]:
def find_deducibility(question, answer):
    match_count = 0
    s_question = question.split(' ')
    s_answer = answer.split(' ')
    if 'the' in s_answer:
        s_answer.remove('the')
    if len(s_answer) == 0:
        return 0
    else:
        for an in s_answer:
            if an in s_question:
                match_count += 1
    return match_count/len(s_answer)

In [9]:
jeopardy['AnswerInQuestion'] = jeopardy[['Question','Answer']].apply(lambda x: find_deducibility(x.Question, x.Answer), axis=1)
#print(jeopardy[['Question', 'Answer', 'AnswerInQuestion']].head(10))

In [10]:
jeopardy['AnswerInQuestion'].mean()

0.06035277385469894

In [11]:
jeopardy = jeopardy.sort_values(by='AirDate')

In [12]:
question_overlap = []
terms_used = set()
for row in jeopardy['Question'].iteritems():
    s_question = row[1].split(' ')
    s_question= [rec for rec in s_question if len(rec) >= 6]   
    match_count = 0
    for rec in s_question:
        if rec in terms_used:
            match_count += 1
    for rec in s_question:
        terms_used.add(rec)
    if len(s_question) > 0:
        match_count = match_count / len(s_question)
    question_overlap.append(match_count)

jeopardy['QuestionOverlap'] = question_overlap

In [13]:
jeopardy['QuestionOverlap'].mean()
#jeopardy['QuestionOverlap'].head(10)

0.687124288096678

In [14]:
def determine_value(row):
    value = 0
    if row["Value"] > 800:
        value = 1
    return value

jeopardy["HighValue"] = jeopardy.apply(determine_value, axis=1)

In [17]:
def count_usage(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if term in row["Question"].split(" "):
            if row["HighValue"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

comparison_terms = list(terms_used)[:5]
observed_expected = []
for term in comparison_terms:
    observed_expected.append(count_usage(term))

observed_expected

[(0, 1), (0, 1), (3, 2), (5, 8), (0, 1)]

In [18]:
high_value_count = jeopardy[jeopardy["HighValue"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["HighValue"] == 0].shape[0]

chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=0.3308710986890265, pvalue=0.565146603267378),
 Power_divergenceResult(statistic=0.3308710986890265, pvalue=0.565146603267378),
 Power_divergenceResult(statistic=3.304881915153957, pvalue=0.06907430977207146),
 Power_divergenceResult(statistic=1.2872212168394972, pvalue=0.2565606771848403),
 Power_divergenceResult(statistic=0.3308710986890265, pvalue=0.565146603267378)]