In [1]:
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
# READ IN THE DATA
jeopardy = pd.read_csv('jeopardy.csv')

In [3]:
# CHECK OUT THE DATA
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [4]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [5]:
# REMOVE WHITESPACES IN FRONT OF COL NAMES
jeopardy.columns.str.lstrip()

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [6]:
jeopardy.columns = jeopardy.columns.str.lstrip()

In [7]:
jeopardy.dtypes

Show Number     int64
Air Date       object
Round          object
Category       object
Value          object
Question       object
Answer         object
dtype: object

In [8]:
# DEFINE A FUNCTION THAT REMOVES PUNCTUATIONS AND CONVERT TO LOWERCASE
def normalize_str(string):
    string = string.lower()
    from string import punctuation
    return string.translate(str.maketrans('', '', punctuation))

In [9]:
# USE THE FUNCTION AND NORMALIZE COLUMN 'QUESTION' AND 'ANSWER'
jeopardy['clean_question'] = jeopardy['Question'].apply(lambda x: normalize_str(x))
jeopardy['clean_answer'] =  jeopardy['Answer'].apply(lambda x: normalize_str(x))

In [10]:
jeopardy[['clean_question', 'clean_answer']].head()

Unnamed: 0,clean_question,clean_answer
0,for the last 8 years of his life galileo was u...,copernicus
1,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,the city of yuma in this state has a record av...,arizona
3,in 1963 live on the art linkletter show this c...,mcdonalds
4,signer of the dec of indep framer of the const...,john adams


In [11]:
# NORMALIZE DOLLAR COL
def normalize_dol(dollar):
    from string import punctuation
    dollar = dollar.translate(str.maketrans('', '', punctuation))
    try:
        return int(dollar)
    except:
        return None

In [12]:
jeopardy['clean_value'] = jeopardy['Value'].apply(lambda x: normalize_dol(x))

In [13]:
# CONVERT AIR DATE COL TO DATETIME
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

In [14]:
# FINALLY, DATA IS READY FOR ANALYSIS
jeopardy.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value              float64
dtype: object

### How often is the answer deducible from the question?

In [15]:
# LET'S SEE WHAT WORDS OCCUR IN QUESTION AND ANSWER
all_words_in_question = []
for string in jeopardy['clean_question']:
    words = string.split(' ')
    all_words_in_question.extend(words)
pd.Series(all_words_in_question).value_counts()[:10]

the     16459
this    11676
of      10631
in       9485
a        9479
         5416
to       4789
for      3353
is       3273
was      2715
dtype: int64

In [16]:
all_words_in_answer = []
for string in jeopardy['clean_answer']:
    words = string.split(' ')
    all_words_in_answer.extend(words)
pd.Series(all_words_in_answer).value_counts()[:10]

the     2531
a       1252
of       504
         314
john     198
an       156
and      132
or       126
new      105
to        96
dtype: int64

In [17]:
def match_counter(row):
    split_answer = row['clean_answer'].split(' ')
    if 'the' in split_answer:
        split_answer.remove('the')
    split_question = row['clean_question'].split(' ')
    
    match_count = 0
    
    if len(split_answer) == 0:
        return 0
    else:
        for word in split_answer:
            if word in split_question:
                match_count += 1
        return match_count / len(split_answer)

In [18]:
answer_in_question = jeopardy.apply(match_counter, axis=1)

In [19]:
answer_in_question.mean()

0.060352773854699004

The value 0.06 of anwer_in_question variable shows that approximately 6% of the words in the answers also appear in the questions. This is not 'accurate' because as I've shown above, the answers and questions both contain many words that are not exactly part of the answer, including a, an, or and to. In summary, words in answers hardly show up in the given questions, perhaps less than 6%.

### How often are new questions repeats of older questions?

In [20]:
jeopardy.sort_values(by='Air Date', inplace=True)

question_overlap = []
terms_used = set()

for index, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0
    
    for word in split_question:
        if word in terms_used:
            match_count += 1
                
        terms_used.add(word)
    
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
        
    question_overlap.append(match_count)
    
jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap'].mean()

0.6889055316620302

The data shows that 69% of the time, the words in a given question appear also in the ones given in the past. This may not be significant because it looks at individual words rather than whole phrases, but still, it implies possibilities of questions recycling.

In [21]:
def high_value(row):
    if row['clean_value'] > 800:
        return 1
    else:
        return 0
    
# ASSIGN BOOLEAN VALUES
jeopardy['high_value'] = jeopardy.apply(high_value, axis=1)

In [22]:
def count_high_low(word):
    high_count = 0
    low_count = 0
    for index, row in jeopardy.iterrows():
        split_question = row['clean_question'].split(' ')
        if word in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count +=1
    return high_count, low_count

In [23]:
observed_expected = []
comparison_terms = list(terms_used)[:10]

for word in comparison_terms:
    observed_expected.append(list(count_high_low(word)))

In [24]:
high_value_count = jeopardy[jeopardy['high_value']==1].shape[0]
low_value_count = jeopardy[jeopardy['high_value']==0].shape[0]

chi_squared = []
for word in observed_expected:
    # NUM OF APPEARANCE
    total = sum(word)
    
    # PROPORTION OF NUM OF APPEARANCE TO TOTAL NUM OF QUESTIONS
    total_prop = total / jeopardy.shape[0]
    
    # EXPECTED FREQUENCIES
    expected_high = total_prop * high_value_count
    expected_low = total_prop * low_value_count
    
    # CONDUCT CHI-SQUARE TEST
    # H0: THE EXPECTED AND THE OBSERVED FREQUENCIES ARE THE SAME
    from scipy.stats import chisquare
    chi, pval = chisquare(word, [expected_high, expected_low])
    chi_squared.append([chi, pval])

In [25]:
pd.DataFrame(chi_squared, columns=['chi', 'pval'])[:10]

Unnamed: 0,chi,pval
0,2.487792,0.114733
1,2.487792,0.114733
2,0.401963,0.526077
3,0.401963,0.526077
4,0.444877,0.504778
5,0.401963,0.526077
6,1.607851,0.204794
7,0.803926,0.369922
8,0.401963,0.526077
9,0.401963,0.526077


Most of the words do not have significant difference in usage between high-value questions and low-value questions.