# Project Description

Jeopardy is a popular TV show in which the participants answer questions to win money. In this project the Jeopardy questions are analyzed and the question dataset is searched for patterns that could give a participant an edge.

The dataset used was created by the user trexmatt on Reddit. It is based on data crawled from the site [www.j-archive.com](http://www.j-archive.com). It contains 216,930 questions from the show Jeopardy and can be downloaded from [Google Drive](https://drive.google.com/file/d/0BwT5wj_P7BKXUl9tOUJWYzVvUjA/view).

# Import Libraries

In [1]:
import os
import io
import requests

from IPython.display import display

import pandas as pd
import numpy as np

import string as string

from nltk.corpus import stopwords
from nltk.stem import *
from nltk.tokenize import RegexpTokenizer

from scipy.stats import chisquare

# Set Global Variables

In [2]:
# Direct-download link of the Jeopardy question dataset on Google Drive
URL = 'https://drive.google.com/uc?export=download&id=0BwT5wj_P7BKXUl9tOUJWYzVvUjA'
# File name of the dataset and its path relative to the working directory
DATASET_NAME = 'jeopardy.csv'
PATH_DATASET = '/'.join(('data', DATASET_NAME))

# Project Preparation

## Download the data

In [3]:
def download_csv_data(url, filename):
    """Download a csv file from Google Drive and store it under ``filename``.

    The file is first requested as-is. If that response carries a
    ``download_warning`` cookie (see ``get_confirm_token``), the request is
    repeated with the cookie value as ``confirm`` parameter before the
    content is written to disk.

    Args:
        url: download URL of the csv file; the file id is taken from the
            text after the last '=' of the URL
        filename: path of the file to write; its parent directory is
            created if it does not exist yet

    Returns:
        None

    Raises:
        requests.HTTPError: if the server answers with an error status
    """
    ### Create the target directory if it does not exist
    target_dir = os.path.dirname(filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    ### Split the file id from the url
    file_id = url.rsplit('=', 1)[-1]

    ### The with-block closes the session once the file has been written
    with requests.Session() as session:
        response = session.get(url, stream=True)
        token = get_confirm_token(response)

        if token:
            params = {'id': file_id, 'confirm': token}
            ### Use the url argument here, not the module-level URL constant
            response = session.get(url, params=params, stream=True)

        ### Fail loudly rather than saving an HTTP error page as csv
        response.raise_for_status()

        save_response_content(response, filename)

def get_confirm_token(response):
    """Return the value of the first ``download_warning*`` cookie.

    Args:
        response: object whose ``cookies.items()`` yields (name, value) pairs

    Returns:
        The value of the first cookie whose name starts with
        'download_warning', or None if there is no such cookie
    """
    warning_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(warning_values, None)

def save_response_content(response, destination):
    """Write the body of ``response`` to the file ``destination``.

    Args:
        response: object whose ``iter_content(chunk_size)`` yields byte chunks
        destination: path of the file to (over)write in binary mode

    Returns:
        None
    """
    chunk_size = 32768

    with open(destination, "wb") as out_file:
        # Empty chunks are keep-alive chunks and are skipped
        out_file.writelines(
            chunk for chunk in response.iter_content(chunk_size) if chunk
        )

"""
Downloads the data to the data folder of a local repository after you run it once you can uncomment this lines.
To prevent the code from downloading the data every time you run the code.
"""                
download_csv_data(URL, DATASET_PATH)

## Load the dataset

In [4]:
### Load the csv file at PATH_DATASET into the DataFrame `data`
data = pd.read_csv(PATH_DATASET)
### Inspect the first 5 rows of the dataset
display(data.head())

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


## Data Preprocessing

### Remove spaces from the column names

In [5]:
### Column names before the clean-up
print(data.columns)

### Delete every space from the names, i.e. the leading ones as well as
### inner ones ('Show Number' becomes 'ShowNumber')
data.columns = data.columns.map(lambda name: name.replace(' ', ''))

### Column names after the clean-up
print(data.columns)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')
Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


### Remove punctuation

In [6]:
def remove_punctuation(s):
    """Return ``s`` with all characters of ``string.punctuation`` deleted.

    Args:
        s: string to clean

    Returns:
        String without punctuation
    """
    ### Translation table that maps every punctuation character to None,
    ### which makes str.translate drop it
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))

    return s.translate(drop_punctuation)

In [7]:
### Strip punctuation (such as the '$' sign) from the Value column and store
### the result in clean_value. Question and Answer are not passed through
### remove_punctuation; they are preprocessed by lang_processing instead.
data['clean_value'] = data['Value'].apply(remove_punctuation)

### Inspect the first 5 rows of the dataset
display(data.head())

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,200


### Set missing ('None') values in column clean_value to zero

In [8]:
### Count the rows whose clean_value is the placeholder string 'None'
print((data['clean_value'] == 'None').sum())

### Replace the placeholder by '0'. The result is assigned back to the column
### instead of calling replace(..., inplace=True) on the selected column,
### which is a chained assignment and is not guaranteed to update `data`.
data['clean_value'] = data['clean_value'].replace('None', '0')

### Convert the clean_value column to int
data['clean_value'] = data['clean_value'].astype(int)

### No 'None' placeholder is left after the conversion, so this prints 0
print((data['clean_value'] == 'None').sum())

3634
0


### Convert column AirDate to type datetime

In [9]:
### Parse the AirDate column with pd.to_datetime and store the result back
data['AirDate'] = pd.to_datetime(data['AirDate'])

### Inspect the first 5 rows of the dataset
display(data.head())

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,200


### NLP Preprocessing of the Question and Answer columns

In [10]:
### NLTK helpers shared by all calls of lang_processing, built on first use
_LANG_TOOLS = {}


def _get_lang_tools():
    """Build the stop-word set, stemmer and tokenizer once and reuse them."""
    if not _LANG_TOOLS:
        ### A frozenset gives constant-time stop-word lookups
        _LANG_TOOLS['stopwords'] = frozenset(stopwords.words("english"))
        ### Bug in PorterStemmer in NLTK 3.2.2 switched to SnowballStemmer
        _LANG_TOOLS['stemmer'] = SnowballStemmer("english")
        _LANG_TOOLS['tokenizer'] = RegexpTokenizer(r'\w+')
    return _LANG_TOOLS


def lang_processing(textstring):
    """Executes general language processing on a given string.
        - Casefolding (lower-casing)
        - Tokenization into runs of word characters
        - Remove English stopwords
        - Stemming

    Args:
        textstring: String to process. A list of strings is joined with
            single spaces first. Any other non-string value (for example a
            missing value such as None or NaN) yields an empty list.

    Returns:
        List of the stemmed tokens that are left after stop-word removal
    """
    if isinstance(textstring, list):
        textstring = ' '.join(textstring)

    ### Missing values have no text to process
    if not isinstance(textstring, str):
        return []

    tools = _get_lang_tools()
    stop_words = tools['stopwords']
    stemmer = tools['stemmer']

    tokens = tools['tokenizer'].tokenize(textstring.lower())
    tokens_stopped = [token for token in tokens if token not in stop_words]
    tokens_stemmed = [stemmer.stem(token) for token in tokens_stopped]

    return tokens_stemmed

In [11]:
### Run lang_processing on every question and answer; each cell of the two
### new columns holds the list of tokens returned for that text
data['clean_question'] = data['Question'].apply(lang_processing)
data['clean_answer'] = data['Answer'].apply(lang_processing)

### Inspect the first 5 rows of the dataset
display(data.head())

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,clean_value,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,200,"[last, 8, year, life, galileo, hous, arrest, e...",[copernicus]
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,200,"[2, 1912, olympian, footbal, star, carlisl, in...","[jim, thorp]"
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,200,"[citi, yuma, state, record, averag, 4, 055, ho...",[arizona]
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,200,"[1963, live, art, linklett, show, compani, ser...",[mcdonald]
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,200,"[signer, dec, indep, framer, constitut, mass, ...","[john, adam]"


# Analysis

## Count how often the answer is in the question

In [12]:
def count_answer_in_question(series):
    """Computes the share of answer terms that also occur in the question

    Args:
        series: Row providing the token lists 'clean_answer' and
            'clean_question'

    Returns:
        0 if the answer has no terms, otherwise the number of answer terms
        found in the question divided by the number of answer terms
    """
    answer_terms = series['clean_answer']
    answer_size = len(answer_terms)

    ### Nothing to compare, and avoids a division by zero below
    if answer_size == 0:
        return 0

    question_terms = series['clean_question']
    hits = sum(1 for term in answer_terms if term in question_terms)

    return hits / answer_size

In [13]:
### Compute how often the answer is in the question
data['answer_in_question'] = data.apply(count_answer_in_question, axis=1)
### Inspect the average occurence count
print(data['answer_in_question'].mean())

0.04271233952052585


In only about 4% of the cases can the answer be deduced from the question.  
Therefore it would not be a reasonable strategy to just try to deduce the answer from the question.

## Count how often a question is reused

In [14]:
### Sort the dataframe 
data.sort_values(by='AirDate', inplace=True)

### Intizate a var for terms used and the question overlap
question_overlap = []
terms_used = set()


for key, row in data.iterrows():
    
    match_count = 0
    
    for word in row['clean_question']:
        if word in terms_used:
            match_count += 1
        else:
            terms_used.add(word)
    
    if len(row['clean_question']) > 0:
            match_count =  match_count / len(row['clean_question'])
    
    question_overlap.append(match_count)

data['question_overlap'] = question_overlap

print(data['question_overlap'].mean())    

0.9594273875526782


On average, about 96% of the terms in a question have already been used in earlier questions.
However, that is not very meaningful because only single terms are compared; it could still be a starting point for further investigation.

## Classify questions into high value and low value questions

In [15]:
def classify_question(row):
    """Classifies a question as high value (1) or low value (0).

    Args:
        row: Row providing the numeric field 'clean_value'

    Returns:
        1 if clean_value is greater than 800, otherwise 0
    """
    return 1 if row['clean_value'] > 800 else 0

### Label every question with classify_question: 1 for a value above 800, otherwise 0
data['high_value'] = data.apply(classify_question, axis=1)

### Show how many questions fall into each class
print(data['high_value'].value_counts())

0    155508
1     61422
Name: high_value, dtype: int64


The classification of the questions shows that 61,422 of the 216,930 questions have a value over 800.

## Count how often a term occurs in high value and low value questions

In [16]:
def term_count_value(word, data):
    """Counts in how many high value and low value questions a term occurs.

    Args:
        word: Term to look for in the 'clean_question' token list of a row
        data: DataFrame with the columns 'clean_question' and 'high_value'

    Returns:
        Tuple (high_count, low_count): the number of rows containing the
        term with high_value equal to 1 and with any other high_value
    """
    high_count = 0
    low_count = 0

    ### Iterate only over the two columns that are needed; iterrows() would
    ### construct a whole Series for every row, which is much slower
    for question_terms, high_value in zip(data['clean_question'], data['high_value']):
        if word in question_terms:
            if high_value == 1:
                high_count += 1
            else:
                low_count += 1

    return (high_count, low_count)

### Just use the first five terms of the set.
### NOTE: a set has no defined order, so which five terms are picked is arbitrary.
comparison_terms = list(terms_used)[:5]

### Observed (high value, low value) question counts for each compared term
observed_expected = [term_count_value(term, data) for term in comparison_terms]

print(observed_expected)

[(1, 1), (19, 34), (1, 0), (1, 2), (39, 91)]


As a result we get, for each compared term, the observed number of high value and low value questions that contain it.

## Count the expected counts and perform a Chi-square test

In [17]:
### Supress numpy waring of dividing by zero
np.seterr(divide='ignore', invalid='ignore')

### Store the value counts to vars
high_value_count = data['high_value'].value_counts()[1]
low_value_count = data['high_value'].value_counts()[0]

chi_squared = []

### Loop over observed values compute expected count and perfrom chi square test
for observed in observed_expected:
    total = sum(observed)
    total_prop = total / data.shape[0]
    
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([observed[0], observed[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))
    
print(chi_squared)

[Power_divergenceResult(statistic=0.46338644448358013, pvalue=0.49604555208958945), Power_divergenceResult(statistic=1.4824773958992283, pvalue=0.2233873126613255), Power_divergenceResult(statistic=2.5317964247338085, pvalue=0.11157312838169751), Power_divergenceResult(statistic=0.037234093889071389, pvalue=0.846989214486915), Power_divergenceResult(statistic=0.18201894105004199, pvalue=0.66964390503326565)]


None of the terms shows a significant difference in usage between high and low value questions (all p-values are well above 0.05). Therefore, a preparation for the Jeopardy show based on specific high value terms is not practicable.