In [43]:
import pandas as pd
import re
from random import choice
from scipy.stats import chisquare
import numpy as np

In [2]:
jeopardy = pd.read_csv('JEOPARDY_CSV.csv')
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


Each row in the dataset represents a single question on a single episode of Jeopardy. Here are explanations of each column:

**Show Number** - the Jeopardy episode number
**Air Date** - the date the episode aired
**Round** - the round of Jeopardy
**Category** - the category of the question
**Value** - the number of dollars the correct answer is worth
**Question** - the text of the question
**Answer** - the text of the answer

In [3]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

Some of the column names have spaces in front, have to remove the spaces:


In [4]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value',
       'Question', 'Answer']

In [5]:
#writing e function to clean the texts : make them lower case, remove punctuations, remove extra spaces
def Normalize_text(string):
    string = str(string)
    string = string.lower()
    string = re.sub(r"[^A-Za-z0-9\s]", "", string)
    string = re.sub(r"\s+", " ", string)
    return string



In [6]:
jeopardy['clean_question'] = jeopardy['Question'].apply(Normalize_text)
jeopardy['clean_question']

0         for the last 8 years of his life galileo was u...
1         no 2 1912 olympian football star at carlisle i...
2         the city of yuma in this state has a record av...
3         in 1963 live on the art linkletter show this c...
4         signer of the dec of indep framer of the const...
                                ...                        
216925    this puccini opera turns on the solution to 3 ...
216926    in north america this term is properly applied...
216927    in penny lane where this hellraiser grew up th...
216928    from ft sill okla he made the plea arizona is ...
216929    a silent movie title includes the last name of...
Name: clean_question, Length: 216930, dtype: object

In [7]:
jeopardy['clean_answer'] = jeopardy['Answer'].apply(Normalize_text)
jeopardy['clean_answer']

0                             copernicus
1                             jim thorpe
2                                arizona
3                              mcdonalds
4                             john adams
                       ...              
216925                          turandot
216926                        a titmouse
216927                      clive barker
216928                          geronimo
216929    grigori alexandrovich potemkin
Name: clean_answer, Length: 216930, dtype: object

In [8]:
#writing a function to clean the value column and turn it to int:
def Normalize_value(value):
    value = str(value)
    # Remove all non-digit characters (keeps only numbers)
    value = re.sub(r"[^\d]", "", str(value))  # `str(value)` ensures it works even if input is not a string
    
    try:
        return int(value) if value else 0  # Return 0 if empty string after cleaning
    except ValueError:
        return 0

In [9]:
jeopardy['clean_value'] = jeopardy['Value'].apply(Normalize_value)
jeopardy['clean_value']

0          200
1          200
2          200
3          200
4          200
          ... 
216925    2000
216926    2000
216927    2000
216928    2000
216929       0
Name: clean_value, Length: 216930, dtype: int64

In [10]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

In [11]:
jeopardy.head(10)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...",the ant,in the title of an aesop fable this insect sha...,the ant,200
6,4680,2004-12-31,Jeopardy!,HISTORY,$400,Built in 312 B.C. to link Rome & the South of ...,the Appian Way,built in 312 bc to link rome the south of ital...,the appian way,400
7,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$400,"No. 8: 30 steals for the Birmingham Barons; 2,...",Michael Jordan,no 8 30 steals for the birmingham barons 2306 ...,michael jordan,400
8,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$400,"In the winter of 1971-72, a record 1,122 inche...",Washington,in the winter of 197172 a record 1122 inches o...,washington,400
9,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$400,This housewares store was named for the packag...,Crate & Barrel,this housewares store was named for the packag...,crate barrel,400


In [12]:
jeopardy.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

#  Answers in Questions
In order to figure out whether to study past questions, study general knowledge, or not study it all, it would be helpful to figure out two things:

- How often the answer can be used for a question.
- How often questions are repeated

In [21]:
def matches_count(row) :
    splited_answer = row['clean_answer'].split()
    splited_question = row['clean_question'].split()
    if 'the' in splited_answer:
        splited_answer.remove('the')
    if len(splited_answer)== 0 :
        return 0
    match_count = 0
    for word in splited_answer:
        if word in splited_question:
            match_count+=1
    result = match_count/ len(splited_answer)
    return result



In [25]:
jeopardy['answers_in_question'] = jeopardy.apply(matches_count, axis=1)

In [23]:
jeopardy['answers_in_question'].mean()

np.float64(0.05792070323661065)

## Analysis of Answer-Question Overlap:
Our findings reveal that, on average, only 5.94% of an answer's content directly overlaps with its corresponding question. This minimal overlap suggests that:

**Low Predictive Power:** Hearing or reading a Jeopardy question provides very little direct clue about the answer's wording.

**Need for Better Strategies:** Since direct term matching is ineffective, we must explore alternative approaches to reliably predict answers.

# Recycled Questions



In [32]:
question_overlap=[]
terms_used= set()

jeopardy = jeopardy.sort_values('Air Date', ascending=True)

for i,row in jeopardy.iterrows():
    splited_question = row['clean_question'].split(" ")
    splited_question = [word for word in splited_question if len(word) > 5] 

    match_count = 0
    for words in splited_question:
        if words in terms_used:
            match_count+=1
    for words in splited_question:
        terms_used.add(words)

    if len(splited_question) >0:
        match_count /= len(splited_question)
    question_overlap.append(match_count)


In [33]:
jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap'].mean()

np.float64(0.8722248751149544)

**Interpreting 87.22% Question Overlap:**
    
This indicates that 87.22% of long words (6+ letters) in each Jeopardy question appeared in prior questions—not full repeats.

# Low Value vs High Value Questions 


In [35]:
def value_determine(row):
    value = 0
    if row['clean_value'] >=800:
        value = 1
    return value

jeopardy['high_value'] = jeopardy.apply(value_determine, axis=1)


In [38]:
def usage_count(term):
    high_value = 0
    low_value = 0
    for i,row in jeopardy.iterrows():
        if term in row['clean_question'].split(' ') :
            if row['high_value'] == 1 :
                high_value += 1
            else:
                low_value +=1
    return high_value, low_value
    

In [41]:
terms_used_list = list(terms_used)
comparison_terms = [choice(terms_used_list) for _ in range(10)]
observed_expected = []

for term in comparison_terms :
    observed_expected.append(usage_count(term))

observed_expected

[(0, 1),
 (1, 0),
 (1, 1),
 (0, 1),
 (1, 2),
 (0, 1),
 (1, 0),
 (1, 0),
 (1, 0),
 (20, 15)]

In [48]:
high_value_count = jeopardy[jeopardy['high_value']==1].shape[0]
low_value_count = jeopardy[jeopardy['high_value']==0].shape[0]



In [49]:
chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total/ jeopardy.shape[0]
    exp_high_value = total_prop * high_value_count
    exp_low_value = total_prop * low_value_count

    observed = np.array([obs[0], obs[1]])
    expexted = np.array([exp_high_value, exp_low_value])
    chi_squared.append(chisquare(observed, expexted))


chi_squared


[Power_divergenceResult(statistic=np.float64(0.7544157608695651), pvalue=np.float64(0.38508176583769604)),
 Power_divergenceResult(statistic=np.float64(1.325529040972535), pvalue=np.float64(0.24960216618620146)),
 Power_divergenceResult(statistic=np.float64(0.039972400921050075), pvalue=np.float64(0.8415345528964892)),
 Power_divergenceResult(statistic=np.float64(0.7544157608695651), pvalue=np.float64(0.38508176583769604)),
 Power_divergenceResult(statistic=np.float64(0.11439736148359864), pvalue=np.float64(0.7351922887573981)),
 Power_divergenceResult(statistic=np.float64(0.7544157608695651), pvalue=np.float64(0.38508176583769604)),
 Power_divergenceResult(statistic=np.float64(1.325529040972535), pvalue=np.float64(0.24960216618620146)),
 Power_divergenceResult(statistic=np.float64(1.325529040972535), pvalue=np.float64(0.24960216618620146)),
 Power_divergenceResult(statistic=np.float64(1.325529040972535), pvalue=np.float64(0.24960216618620146)),
 Power_divergenceResult(statistic=np.flo

# Analysis of Chi-Squared Test Results
**1. Statistical Significance (p-values)**
All calculated p-values are above 0.05 (ranging from 0.09 to 0.84), indicating:

- No statistically significant difference in term usage between high-value and low-value questions.

- Term distribution appears independent of question value.

**2. Test Statistics**
- The chi-squared statistics (ranging from 0.04 to 2.86) are relatively small, confirming observed differences are likely due to random variation.

**3. Key Observations:**
- The highest statistic (2.86, p=0.09) approaches but doesn't reach significance.

- All other terms show even weaker associations (p-values 0.25-0.84).

**4. Limitations:**
- Low frequency issue: Most terms occurred <5 times per group, reducing test validity.

- General terms: Many analyzed terms may be common across all questions