In [52]:
import numpy as np
import pandas as pd

In [53]:
df = pd.read_csv('no_training_all_question_answers.csv')
df

Unnamed: 0,SEASON,CLUE VALUE,CLUE CATEGORY,QUESTION,OUR MODEL_ANSWER,REAL ANSWER,CORRECT,CONTEXT USED
0,34,200,REJECTED GREETING CARDS,"\""Have fun in"" this country! ""But remember the...",against the advice of the U.S. Department of S...,Afghanistan,False,"Global Relief Foundation . In November 2001, d..."
1,34,400,REJECTED GREETING CARDS,"\""It's OK, applicant! You didn't want to go to...",University of Houston,Stanford,False,Lauro Cruz . Born in Beaumont to Manuel Cruz a...
2,34,600,REJECTED GREETING CARDS,"\""Tummyache? Next time, take a close look at"" ...",,clams,False,No Context
3,34,800,REJECTED GREETING CARDS,"\""Nice surgery! You finally vented"" this soft,...",The spleen,spleen,True,Splenic injury . The spleen is an organ in the...
4,34,1000,REJECTED GREETING CARDS,"\""Sorry you two had"" these 14-letter ""differen...","""irreconcilable differences"",",irreconcilable,True,"Tony Parker . On 17 November 2010, Longoria fi..."
...,...,...,...,...,...,...,...,...
16310,16,100,COUNTRIES BY COLLEGE,"The University of Sunderland, the University o...",University of Northumbria at Newcastle.,England,False,Stephen Newton (artist) . Newton was born in G...
16311,16,200,COUNTRIES BY COLLEGE,"The Emile Cohl School, the University of Toulo...",,France,False,No Context
16312,16,300,COUNTRIES BY COLLEGE,"Chiba University, Waseda University, Fukuoka J...",Tokai University,Japan,False,Tokai University Fukuoka Junior College . Toka...
16313,16,200,HIRSCHFELD,Broadway's Diamond Lil; come up & see her some...,Diamond Lil (play,Mae West,False,Diamond Lil (play) . Diamond Lil is a 1928 pla...


In [54]:
# General stats

# s34 aired September 11, 2017
# s16 aired September 6, 1999
seasons = df['SEASON'].value_counts()
print(f'season # --> # of questions in that season\n{seasons}\n')

categories = df['CLUE CATEGORY'].value_counts()
print(f'category name --> # of questions in that category\n{categories}\n')

# total questions answered correctly/incorrectly
percent_answer_correct = df['CORRECT'].value_counts(normalize=True)
print(f'percent answered incorrectly and correctly\n{percent_answer_correct}\n')

season # --> # of questions in that season
34    13362
16     2953
Name: SEASON, dtype: int64

category name --> # of questions in that category
AMERICAN HISTORY          36
WORLD HISTORY             30
BEFORE & AFTER            25
BOOKS & AUTHORS           25
ISLANDS                   25
                          ..
CURRENT POLITICIANS        1
FIRST LADY FACTS           1
AMERICAN BUSINESS          1
U.S. POLITICAL HISTORY     1
ROMAN HISTORY              1
Name: CLUE CATEGORY, Length: 3338, dtype: int64

percent answered incorrectly and correctly
False    0.822924
True     0.177076
Name: CORRECT, dtype: float64



In [55]:
# Investigate answers that were empty strings
nan_rows = df[df['OUR MODEL_ANSWER'].isna()]
print(nan_rows)

percent_nan_rows = df['OUR MODEL_ANSWER'].isna().mean() * 100
print(f'percentage of answers that were empty strings aka Nan: {percent_nan_rows}')

# Investigate no context
percentage_with_no_context = (df['CONTEXT USED'] == 'No Context').mean() * 100
print(f"percentage of rows with 'no context' is: {percentage_with_no_context}")

       SEASON  CLUE VALUE             CLUE CATEGORY  \
2          34         600   REJECTED GREETING CARDS   
9          34         400           SELF-HELP BOOKS   
14         34         400             BACK IN BLACK   
16         34         800             BACK IN BLACK   
20         34         600           HIGHWAY TO HELL   
...       ...         ...                       ...   
16300      16        1000             \"B" YOURSELF   
16307      16         100  NEW YORK CITY TELEVISION   
16308      16         300  NEW YORK CITY TELEVISION   
16311      16         200      COUNTRIES BY COLLEGE   
16314      16         300                HIRSCHFELD   

                                                QUESTION OUR MODEL_ANSWER  \
2      \"Tummyache? Next time, take a close look at" ...              NaN   
9      In dealing with others, we need to set these l...              NaN   
14     On the verge of going bust in 1997, it was sav...              NaN   
16     It took more than a "mom

In [56]:
# Investigate question length

average_words = df['QUESTION'].str.split().apply(len).mean()
average_words_true = df[df['CORRECT'] == True]['QUESTION'].str.split().apply(len).mean()
average_words_false = df[df['CORRECT'] == False]['QUESTION'].str.split().apply(len).mean()

print(f'The average number of words in a question is: {average_words:.3f}')
print(f'The average number of words in a question for CORRECT=True is: {average_words_true:.3f}')
print(f'The average number of words in a question for CORRECT=False is: {average_words_false:.3f}')

The average number of words in a question is: 14.951
The average number of words in a question for CORRECT=True is: 15.489
The average number of words in a question for CORRECT=False is: 14.835


In [57]:
# Investigate named entities

# spacy built in entity types
# PERSON - People, including fictional.
# NORP - Nationalities or religious or political groups.
# FAC - Buildings, airports, highways, bridges, etc.
# ORG - Companies, agencies, institutions, etc.
# GPE - Countries, cities, states.
# LOC - Non-GPE locations, mountain ranges, bodies of water.
# PRODUCT - Objects, vehicles, foods, etc. (Not services.)
# EVENT - Named hurricanes, battles, wars, sports events, etc.
# WORK_OF_ART - Titles of books, songs, etc.

import spacy

nlp = spacy.load('en_core_web_lg')      # need to run python -m spacy download en_core_web_lg

def extract_named_entities(text):
    doc = nlp(str(text))
    entities = [ent.text for ent in doc.ents if ent.label_ == 'PERSON' or 
                                                ent.label_ == 'NORP' or
                                                ent.label_ == 'FAC' or
                                                ent.label_ == 'ORG' or
                                                ent.label_ == 'GPE' or
                                                ent.label_ == 'LOC' or
                                                ent.label_ == 'PRODUCT' or
                                                ent.label_ == 'EVENT' or
                                                ent.label_ == 'WORK_OF_ART']
    return entities

df['named_entities'] = df['REAL ANSWER'].apply(extract_named_entities)

percentage_with_named_entities = (df['named_entities'].apply(lambda x: bool(x))).mean() * 100

correct_rows = df[df['CORRECT'] == True]
incorrect_rows = df[df['CORRECT'] == False]

percentage_correct_with_named_entities = (correct_rows['named_entities'].apply(lambda x: bool(x))).mean() * 100
percentage_incorrect_with_named_entities = (incorrect_rows['named_entities'].apply(lambda x: bool(x))).mean() * 100

print(f"The percentage of rows with a named entity in 'REAL ANSWER': {percentage_with_named_entities:.3f}%")
print(f"The percentage of rows with a named entity in 'REAL ANSWER' where CORRECT=True: {percentage_correct_with_named_entities:.3f}%")
print(f"The percentage of rows with a named entity in 'REAL ANSWER' where CORRECT=False: {percentage_incorrect_with_named_entities:.3f}%")

percentage_without_named_entities = (~df['named_entities'].astype(bool)).mean() * 100

# percentage_correct_without_named_entities = (correct_rows['named_entities'].apply(lambda x: not bool(x))).mean() * 100
# percentage_incorrect_without_named_entities = (incorrect_rows['named_entities'].apply(lambda x: not bool(x))).mean() * 100

percentage_correct_without_named_entities = (~correct_rows['named_entities'].astype(bool)).mean() * 100
percentage_incorrect_without_named_entities = (~incorrect_rows['named_entities'].astype(bool)).mean() * 100

print(f"The percentage of rows without a named entity: {percentage_without_named_entities:.3f}%")
print(f"The percentage of correct rows without a named entity: {percentage_correct_without_named_entities:.3f}%")
print(f"The percentage of incorrect rows without a named entity: {percentage_incorrect_without_named_entities:.3f}%")

The percentage of rows with a named entity in 'REAL ANSWER': 49.635%
The percentage of rows with a named entity in 'REAL ANSWER' where CORRECT=True: 62.409%
The percentage of rows with a named entity in 'REAL ANSWER' where CORRECT=False: 46.887%
The percentage of rows without a named entity: 50.365%
The percentage of correct rows without a named entity: 37.591%
The percentage of incorrect rows without a named entity: 53.113%


In [58]:
def extract_people(text):
    doc = nlp(str(text))
    entities = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
    return entities

df['PERSON'] = df['REAL ANSWER'].apply(extract_people)

percentage_with_named_entities = (df['PERSON'].apply(lambda x: bool(x))).mean() * 100

correct_rows = df[df['CORRECT'] == True]
incorrect_rows = df[df['CORRECT'] == False]

percentage_correct_with_named_entities = (correct_rows['PERSON'].apply(lambda x: bool(x))).mean() * 100
percentage_incorrect_with_named_entities = (incorrect_rows['PERSON'].apply(lambda x: bool(x))).mean() * 100

print(f"The percentage of rows with a PERSON named entity in 'REAL ANSWER': {percentage_with_named_entities:.3f}%")
print(f"The percentage of rows with a PERSON named entity in 'REAL ANSWER' where CORRECT=True: {percentage_correct_with_named_entities:.3f}%")
print(f"The percentage of rows with a PERSON named entity in 'REAL ANSWER' where CORRECT=False: {percentage_incorrect_with_named_entities:.3f}%")


The percentage of rows with a PERSON named entity in 'REAL ANSWER': 22.960%
The percentage of rows with a PERSON named entity in 'REAL ANSWER' where CORRECT=True: 28.210%
The percentage of rows with a PERSON named entity in 'REAL ANSWER' where CORRECT=False: 21.831%


In [59]:
def extract_places(text):
    doc = nlp(str(text))
    entities = [ent.text for ent in doc.ents if ent.label_ == 'GPE' or
                                                ent.label_ == 'LOC']
    return entities

df['PLACES'] = df['REAL ANSWER'].apply(extract_places)

percentage_with_named_entities = (df['PLACES'].apply(lambda x: bool(x))).mean() * 100

correct_rows = df[df['CORRECT'] == True]
incorrect_rows = df[df['CORRECT'] == False]

percentage_correct_with_named_entities = (correct_rows['PLACES'].apply(lambda x: bool(x))).mean() * 100
percentage_incorrect_with_named_entities = (incorrect_rows['PLACES'].apply(lambda x: bool(x))).mean() * 100

print(f"The percentage of rows with a GPE/LOC named entity in 'REAL ANSWER': {percentage_with_named_entities:.3f}%")
print(f"The percentage of rows with a GPE/LOC named entity in 'REAL ANSWER' where CORRECT=True: {percentage_correct_with_named_entities:.3f}%")
print(f"The percentage of rows with a GPE/LOC named entity in 'REAL ANSWER' where CORRECT=False: {percentage_incorrect_with_named_entities:.3f}%")


The percentage of rows with a GPE/LOC named entity in 'REAL ANSWER': 13.589%
The percentage of rows with a GPE/LOC named entity in 'REAL ANSWER' where CORRECT=True: 19.522%
The percentage of rows with a GPE/LOC named entity in 'REAL ANSWER' where CORRECT=False: 12.312%


In [61]:
# reading in original data from the respective seasons, not necessary but nice to have maybe

df = pd.read_csv('jeopardy_clue_dataset-master/seasons/season34.tsv', sep='\t')
df = pd.read_csv('jeopardy_clue_dataset-master/seasons/season16.tsv', sep='\t')
df = pd.read_csv('jeopardy_clue_dataset-master/combined_season1-39.tsv', sep='\t')
df

Unnamed: 0,round,clue_value,daily_double_value,category,comments,answer,question,air_date,notes
0,1,100,0,LAKES & RIVERS,,River mentioned most often in the Bible,the Jordan,1984-09-10,
1,1,200,0,LAKES & RIVERS,,Scottish word for lake,loch,1984-09-10,
2,1,400,0,LAKES & RIVERS,,American river only 33 miles shorter than the ...,the Missouri,1984-09-10,
3,1,500,0,LAKES & RIVERS,,"World's largest lake, nearly 5 times as big as...",the Caspian Sea,1984-09-10,
4,1,100,0,INVENTIONS,,Marconi's wonderful wireless,a radio,1984-09-10,
...,...,...,...,...,...,...,...,...,...
468312,2,400,0,"\""CC"" ME",,"The American Cancer Society says, ""Stay away f...",tobacco,2023-07-28,
468313,2,800,0,"\""CC"" ME",,"In the 5 stages of grief, it comes last",acceptance,2023-07-28,
468314,2,1200,0,"\""CC"" ME",,It begins as a hot dry desert wind over northe...,a sirocco,2023-07-28,
468315,2,2000,0,"\""CC"" ME",,In medicine it's the complete or partial obstr...,occlusion,2023-07-28,
