In [1]:
import pandas
import csv

jeopardy = pandas.read_csv("jeopardy.csv")

In [2]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [3]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

In [4]:
import re

def normalize_text(text):
    text = text.lower()
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    return text

def normalize_values(text):
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    try:
        text = int(text)
    except Exception:
        text = 0
    return text

In [5]:
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize_text)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize_text)
jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_values)

In [7]:
jeopardy["Air Date"] = pandas.to_datetime(jeopardy["Air Date"])

In [8]:
jeopardy.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

In [9]:
def count_matches(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(count_matches, axis=1)

In [10]:
jeopardy["answer_in_question"].mean()

0.060493257069335872

In [11]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        split_question = [q for q in split_question if len(q) > 5]
        match_count = 0
        for word in split_question:
            if word in terms_used:
                match_count += 1
        for word in split_question:
            terms_used.add(word)
        if len(split_question) > 0:
            match_count /= len(split_question)
        question_overlap.append(match_count)
jeopardy["question_overlap"] = question_overlap

jeopardy["question_overlap"].mean()

0.69087373156719623

In [12]:
def value_high(row):
    if row["clean_value"] > 800:
        return 1
    else:
        return 0

jeopardy["high_value"] = jeopardy.apply(value_high, axis = 1)    

In [15]:
def func(word, jeopardy):
    low_count = 0
    high_count = 0
    
    for id, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        if word in split_question:
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

observed_expected = []
comparison_terms = list(terms_used)[0:5]

for each in comparison_terms:
    l = func(each, jeopardy)
    observed_expected.append(l)
    

In [21]:
high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
print(high_value_count)

5734


In [22]:
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]
print(low_value_count)

14265


In [26]:
from scipy.stats.mstats import chisquare
import numpy as np

chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    
    expect_high = total_prop * high_value_count
    expect_low  = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([expect_high, expect_low])
    chi_squared.append(chisquare(observed, expected))
    

    

In [27]:
chi_squared

[Power_divergenceResult(statistic=1.2058885383806519, pvalue=0.27214791766902047),
 Power_divergenceResult(statistic=0.18383953104516373, pvalue=0.66809416232506025),
 Power_divergenceResult(statistic=0.031881167234403623, pvalue=0.85828871632352932),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686)]