An analysis of Jeopardy questions and answers, with guidance from Dataquest.

In [10]:
import pandas as pd
jeopardy = pd.read_csv("jeopardy.csv")
print(jeopardy.head(5))
print(jeopardy.columns)

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  
Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype

In [11]:
new_columns = []
for column in jeopardy.columns:
    new_columns.append(column.replace(" ", ""))
jeopardy.columns = new_columns
print(jeopardy.columns)

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


In [12]:
from string import punctuation
def normalize(phrase):
    lower = phrase.lower()
    return ''.join(c for c in lower if c not in punctuation)

jeopardy["clean_question"] = jeopardy["Question"].apply(normalize)
print(jeopardy["clean_question"])

0        for the last 8 years of his life galileo was u...
1        no 2 1912 olympian football star at carlisle i...
2        the city of yuma in this state has a record av...
3        in 1963 live on the art linkletter show this c...
4        signer of the dec of indep framer of the const...
5        in the title of an aesop fable this insect sha...
6        built in 312 bc to link rome  the south of ita...
7        no 8 30 steals for the birmingham barons 2306 ...
8        in the winter of 197172 a record 1122 inches o...
9        this housewares store was named for the packag...
10                                          and away we go
11       cows regurgitate this from the first stomach t...
12       in 1000 rajaraja i of the cholas battled to ta...
13       no 1 lettered in hoops football  lacrosse at s...
14       on june 28 1994 the natl weather service began...
15       this companys accutron watch introduced in 196...
16       outlaw murdered by a traitor and a coward whos.

In [13]:
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize)
print(jeopardy["clean_answer"])

0                                               copernicus
1                                               jim thorpe
2                                                  arizona
3                                                mcdonalds
4                                               john adams
5                                                  the ant
6                                           the appian way
7                                           michael jordan
8                                               washington
9                                            crate  barrel
10                                          jackie gleason
11                                                 the cud
12                                     ceylon or sri lanka
13                                               jim brown
14                                            the uv index
15                                                  bulova
16                                             jesse jam

In [14]:
def norm_value(value):
    num = ''.join(c for c in value if c not in punctuation)
    try:
        result = int(num)
    except Exception:
        result = 0
    return result
jeopardy["clean_value"] = jeopardy["Value"].apply(norm_value)
print(jeopardy["clean_value"])

0         200
1         200
2         200
3         200
4         200
5         200
6         400
7         400
8         400
9         400
10        400
11        400
12        600
13        600
14        600
15        600
16        600
17        600
18        800
19        800
20        800
21        800
22       2000
23        800
24       1000
25       1000
26       1000
27       1000
28       1000
29        400
         ... 
19969    1200
19970    1200
19971    1500
19972    1200
19973    1200
19974    1200
19975    1600
19976    1600
19977    1600
19978    1600
19979    1600
19980    1600
19981    1200
19982    2000
19983    2000
19984    2000
19985    2000
19986    2000
19987       0
19988     100
19989     100
19990     100
19991     100
19992     100
19993     100
19994     200
19995     200
19996     200
19997     200
19998     200
Name: clean_value, dtype: int64


In [15]:
jeopardy["AirDate"] = jeopardy["AirDate"].apply(pd.to_datetime)
print(jeopardy["AirDate"])

0       2004-12-31
1       2004-12-31
2       2004-12-31
3       2004-12-31
4       2004-12-31
5       2004-12-31
6       2004-12-31
7       2004-12-31
8       2004-12-31
9       2004-12-31
10      2004-12-31
11      2004-12-31
12      2004-12-31
13      2004-12-31
14      2004-12-31
15      2004-12-31
16      2004-12-31
17      2004-12-31
18      2004-12-31
19      2004-12-31
20      2004-12-31
21      2004-12-31
22      2004-12-31
23      2004-12-31
24      2004-12-31
25      2004-12-31
26      2004-12-31
27      2004-12-31
28      2004-12-31
29      2004-12-31
           ...    
19969   2009-05-14
19970   2009-05-14
19971   2009-05-14
19972   2009-05-14
19973   2009-05-14
19974   2009-05-14
19975   2009-05-14
19976   2009-05-14
19977   2009-05-14
19978   2009-05-14
19979   2009-05-14
19980   2009-05-14
19981   2009-05-14
19982   2009-05-14
19983   2009-05-14
19984   2009-05-14
19985   2009-05-14
19986   2009-05-14
19987   2009-05-14
19988   2000-03-14
19989   2000-03-14
19990   2000

In [16]:
def find_matches(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    match_count = 0
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return match_count / len(split_answer)
jeopardy["answer_in_question"] = jeopardy.apply(find_matches, axis=1)
print(jeopardy["answer_in_question"])

0        0.000000
1        0.000000
2        0.000000
3        0.000000
4        0.000000
5        0.000000
6        0.000000
7        0.000000
8        0.000000
9        0.333333
10       0.000000
11       0.000000
12       0.000000
13       0.000000
14       0.500000
15       0.000000
16       0.000000
17       0.000000
18       0.000000
19       0.000000
20       0.000000
21       0.000000
22       0.000000
23       0.000000
24       0.500000
25       0.000000
26       0.000000
27       0.000000
28       0.000000
29       0.000000
           ...   
19969    0.000000
19970    0.000000
19971    0.000000
19972    0.000000
19973    0.000000
19974    0.333333
19975    0.000000
19976    0.000000
19977    0.000000
19978    0.000000
19979    0.000000
19980    0.500000
19981    0.500000
19982    0.000000
19983    0.000000
19984    0.000000
19985    0.000000
19986    0.000000
19987    0.000000
19988    0.000000
19989    0.000000
19990    0.000000
19991    0.000000
19992    0.000000
19993    0

In [17]:
print(jeopardy["answer_in_question"].mean())

0.0603527738547


In [46]:
question_overlap = []
terms_used = set()
for row in jeopardy.iterrows():
    split_question = row[1][7].split(" ")
    for word in split_question:
        if len(word) < 6:
            split_question.remove(word)
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
        terms_used.add(word)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)
print(question_overlap[0:10])

[0.0, 0.08333333333333333, 0.1, 0.1111111111111111, 0.3333333333333333, 0.3333333333333333, 0.0, 0.5714285714285714, 0.3333333333333333, 0.25]


In [30]:
jeopardy["question_overlap"] = question_overlap
print(jeopardy["question_overlap"].mean())

0.803565789053


In [33]:
def rate_value(row):
    if row["clean_value"] > 800:
        value = 1
    else:
        value = 0
    return value
jeopardy["high_value"] = jeopardy.apply(rate_value, axis=1)
print(jeopardy["high_value"])

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       1
23       0
24       1
25       1
26       1
27       1
28       1
29       0
        ..
19969    1
19970    1
19971    1
19972    1
19973    1
19974    1
19975    1
19976    1
19977    1
19978    1
19979    1
19980    1
19981    1
19982    1
19983    1
19984    1
19985    1
19986    1
19987    0
19988    0
19989    0
19990    0
19991    0
19992    0
19993    0
19994    0
19995    0
19996    0
19997    0
19998    0
Name: high_value, dtype: int64


In [37]:
def count_high_value(word):
    low_count = 0
    high_count = 0
    for row in jeopardy.iterrows():
        split_question = row[1][7].split(" ")
        for item in split_question:
            if item == word:
                if row[1][12] == 1:
                    high_count += 1
                else:
                    low_count += 1
    return high_count, low_count
observed_expected = []
comparison_terms = list(terms_used)[0:5]
print(comparison_terms)

['', 'organist', 'bassanis', 'nail', 'disputes']


In [39]:
for item in comparison_terms:
    observed_expected.append(count_high_value(item))
print(observed_expected)

[(1638, 3778), (0, 1), (0, 1), (2, 4), (1, 0), (1638, 3778), (0, 1), (0, 1), (2, 4), (1, 0)]


In [42]:
high_value_count = 0
low_value_count = 0
for value in jeopardy["high_value"]:
    if value == 1:
        high_value_count += 1
    if value == 0:
        low_value_count += 1
print(high_value_count, low_value_count)

5734 14265


In [44]:
from scipy.stats import chisquare
chi_squared = []
for item in observed_expected:
    total = item[0] + item[1]
    total_prop = total / jeopardy.shape[0]
    expected_high = total_prop * high_value_count
    expected_low = total_prop * low_value_count
    chi_squared.append(chisquare(item, [expected_high, expected_low]))
for item in chi_squared:
    print(item)

Power_divergenceResult(statistic=6.5468193328041924, pvalue=0.010507182744858646)
Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686)
Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686)
Power_divergenceResult(statistic=0.063762334468807247, pvalue=0.80064530268787815)
Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047)
Power_divergenceResult(statistic=6.5468193328041924, pvalue=0.010507182744858646)
Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686)
Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686)
Power_divergenceResult(statistic=0.063762334468807247, pvalue=0.80064530268787815)
Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047)
