In [21]:
import pandas as pd

In [22]:
# Load the Jeopardy dataset from jeopardy.csv (path is relative to the working
# directory) and preview the first rows.
jeopardy = pd.read_csv("jeopardy.csv")
print(jeopardy.head())

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  


### Remove the spaces in front of some column names

In [23]:
# Strip surrounding whitespace from every column name rather than hard-coding
# the full list of names: a positional list would silently mislabel columns if
# the CSV ever gained, lost, or reordered a column.
jeopardy.columns = jeopardy.columns.str.strip()

### Normalize the Question and Answer text columns by lowercasing words and removing punctuation, so that "Don't" and "don't" aren't considered different words when we compare them.

In [24]:
import re

def normalize_text(text):
    """Return ``text`` lowercased with punctuation removed.

    Only ASCII letters, digits and whitespace are kept, so "Don't" and
    "don't" both normalize to "dont".

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    str
        The lowercased text without punctuation.
    """
    # Raw string literal: "\s" inside a plain string is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r"[^A-Za-z0-9\s]", "", text.lower())

# Store the normalized text in new columns; the Question and Answer columns
# themselves are not overwritten.
jeopardy["clean_question"] = jeopardy['Question'].apply(normalize_text)
jeopardy["clean_answer"] = jeopardy['Answer'].apply(normalize_text)

### Normalize the Value column by removing the dollar sign from the beginning of each value and converting the column from text to numeric

In [25]:
import re

def normalize_dollar_values(text):
    """Convert a dollar-amount string such as "$2,000" to an integer.

    Punctuation (the dollar sign, thousands separators, ...) is stripped
    before the conversion.

    Parameters
    ----------
    text : str
        Raw value text.

    Returns
    -------
    int
        The parsed amount, or 0 when the remaining text is not a valid
        integer (for example when it still contains letters or is empty).
    """
    # Raw string literal: "\s" inside a plain string is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    stripped = re.sub(r"[^A-Za-z0-9\s]", "", text)
    try:
        return int(stripped)
    except ValueError:
        # int() signals an unparsable string with ValueError; catching only
        # that keeps unrelated errors visible instead of turning them into 0.
        return 0

# Store the numeric amounts in a new clean_value column; the original Value
# text column is kept as-is.
jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_dollar_values)

### Convert the Air Date column to a datetime column

In [26]:
# Replace the Air Date column with its parsed datetime values.
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

### Let's find out how often the answer is deducible from the question

In [27]:
def count_matches(row):
    """Return the fraction of answer terms that also occur in the question.

    ``row`` must provide the "clean_answer" and "clean_question" strings.
    Both are split on single spaces. One occurrence of "the" (the first) is
    dropped from the answer terms before comparing. If no answer terms remain
    after that, 0 is returned.
    """
    answer_terms = row["clean_answer"].split(" ")
    question_terms = row["clean_question"].split(" ")

    # Discard the first "the" only; any further occurrences are still counted.
    if "the" in answer_terms:
        del answer_terms[answer_terms.index("the")]

    if not answer_terms:
        return 0

    hits = sum(1 for term in answer_terms if term in question_terms)
    return hits / len(answer_terms)

# For each row, compute the fraction of clean_answer terms that also occur in
# clean_question. axis=1 makes apply() pass one row at a time to count_matches.
# The last line displays the mean of those per-row fractions.
jeopardy["answer_in_question"] = jeopardy.apply(count_matches, axis=1)
jeopardy["answer_in_question"].mean()

0.060493257069335872

On average, only about 6% of the words in an answer also appear in its question. We conclude that we can't hope to find the answer just by hearing the question.

### Let's find out how often new questions are repeats of older ones

In [28]:
# For every question, in file order, record the share of its long words that
# have already appeared in an earlier question.
question_overlap = []
terms_used = set()

# Iterate over the column directly: iterrows() builds a full Series for every
# row even though only clean_question is needed here.
for clean_question in jeopardy["clean_question"]:
    # Only words longer than six characters are considered.
    long_words = [word for word in clean_question.split(" ") if len(word) > 6]

    # Count matches against earlier questions *before* registering this
    # question's own words, so a question never matches itself.
    match_count = sum(1 for word in long_words if word in terms_used)
    terms_used.update(long_words)

    # Questions without any long word keep an overlap of 0.
    if long_words:
        match_count = match_count / len(long_words)
    question_overlap.append(match_count)

jeopardy["question_overlap"] = question_overlap
jeopardy["question_overlap"].mean()

0.62617162774588297

On average, about 63% of the long words (more than six characters) in a question have already appeared in earlier questions. It's worth studying old questions.

### Let's focus on terms that pertain to high-value questions rather than low-value questions

In [32]:
def find_value(row):
    """Flag a row as high value.

    Returns 1 when the row's "clean_value" is greater than 800, otherwise 0.
    """
    return 1 if row["clean_value"] > 800 else 0

# Add a 0/1 high_value flag per row (1 when clean_value is above 800).
jeopardy["high_value"] = jeopardy.apply(find_value, axis=1)

In [33]:
def count_usage(word):
    """Count the high- and low-value questions that contain ``word``.

    Reads the module-level ``jeopardy`` DataFrame. A question contains the
    word when it equals one of the space-separated tokens of the question's
    "clean_question" text.

    Parameters
    ----------
    word : str
        The term to look for.

    Returns
    -------
    tuple
        ``(high_count, low_count)``: the number of matching rows whose
        "high_value" is 1, and the number of matching rows where it is not.
    """
    high_count = 0
    low_count = 0

    # Walk only the two columns that are needed; iterrows() would build a
    # complete Series for every row on every call.
    for question, high_value in zip(jeopardy["clean_question"], jeopardy["high_value"]):
        if word in question.split(" "):
            if high_value == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

# NOTE(review): terms_used is a set and set iteration order is arbitrary, so
# the five terms picked here may differ from one run to the next.
comparison_terms = list(terms_used)[:5]

# Despite the name, each entry holds only observed counts: a
# (high_count, low_count) pair for one term.
observed_expected = [count_usage(term) for term in comparison_terms]

observed_expected
    

[(2, 2), (0, 1), (1, 1), (1, 1), (1, 0)]

## Applying the Chi-square test

In [42]:
from scipy.stats import chisquare
import numpy as np

# Row totals for each value bucket and for the whole dataset.
high_value_count = len(jeopardy[jeopardy["high_value"] == 1])
low_value_count = len(jeopardy[jeopardy["high_value"] == 0])
total_rows = len(jeopardy)

chi_squared = []
for high_observed, low_observed in observed_expected:
    # Share of all rows in which the term was counted.
    term_share = (high_observed + low_observed) / total_rows

    # Expected counts if the term were spread across the two buckets in
    # proportion to the bucket sizes.
    expected = np.array([term_share * high_value_count, term_share * low_value_count])
    observed = np.array([high_observed, low_observed])

    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=0.88975496332255899, pvalue=0.34554371914834681),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.44487748166127949, pvalue=0.50477764875459963),
 Power_divergenceResult(statistic=0.44487748166127949, pvalue=0.50477764875459963),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047)]

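None of the terms showed a significant difference in usage between high-value and low-value rows (every p-value is above 0.05). However, all of the observed frequencies were lower than 5, so the chi-squared test is not reliable for these terms. It would be more informative to run the test only on terms with higher frequencies.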