In [1]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pandas as pd
import os
import re
from string import punctuation
from datetime import datetime
import json
import string

json_file_path = 'jeopardy.json'

english_stopwords = set(stopwords.words('english') + list(punctuation) + 
                       ['...', '..', '....', '``', "''", '/n'])

In [2]:
with open(json_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
def clean_wordlist(data):
    '''
    Takes a list of a dictionary, opens it and provides 
    a word list for all the information, lemmatized without stopwords.
    BeautifulSoup not used because json file? - everything already looked
    fairly separated.
    '''
    lemmatizer = WordNetLemmatizer()
    english_stopwords = set(stopwords.words('english'))
    
    # have to push everything into a string and lowercase...
    corpus = ''
    for item in data:
        for value in item.values():
            if isinstance(value, str):
                corpus += value.lower() + ' '
    
    words = word_tokenize(corpus)
    
    wordlist = [lemmatizer.lemmatize(word) for word in words if word not in english_stopwords and word not in string.punctuation]
    
    return wordlist


In [4]:
# confirm load successful
print(data[:10])

[{'category': 'HISTORY', 'air_date': '2004-12-31', 'question': "'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'", 'value': '$200', 'answer': 'Copernicus', 'round': 'Jeopardy!', 'show_number': '4680'}, {'category': "ESPN's TOP 10 ALL-TIME ATHLETES", 'air_date': '2004-12-31', 'question': "'No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves'", 'value': '$200', 'answer': 'Jim Thorpe', 'round': 'Jeopardy!', 'show_number': '4680'}, {'category': 'EVERYBODY TALKS ABOUT IT...', 'air_date': '2004-12-31', 'question': "'The city of Yuma in this state has a record average of 4,055 hours of sunshine each year'", 'value': '$200', 'answer': 'Arizona', 'round': 'Jeopardy!', 'show_number': '4680'}, {'category': 'THE COMPANY LINE', 'air_date': '2004-12-31', 'question': '\'In 1963, live on "The Art Linkletter Show", this company served its billionth burger\'', 'value': '$200', 'answer': "McDonald\\'

In [5]:
type(data)

list

In [6]:
len(data)

216930

In [7]:
# create group to identify values
values = [entry['value'] for entry in data]

In [20]:
# check to confirm values grouped
print(values[:30])

['$200', '$200', '$200', '$200', '$200', '$200', '$400', '$400', '$400', '$400', '$400', '$400', '$600', '$600', '$600', '$600', '$600', '$600', '$800', '$800', '$800', '$800', '$2,000', '$800', '$1000', '$1000', '$1000', '$1000', '$1000', '$400']


In [11]:
# regex to find 800+ values
highval = [1 if x and re.search(r'\$(800|1000)', x) is not None else 0 for x in values]

In [12]:
print(highval[:30])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0]


This is not working as it should. We need to do something with values greater than 1000, as well as values with commas in them. Skipping for now to save time... and sanity.

In [13]:
# Confirm successful assignment of treatment and control
list(zip(data,highval))[17:30]

[({'category': '3-LETTER WORDS',
   'air_date': '2004-12-31',
   'question': "'A small demon, or a mischievous child (who might be a little demon!)'",
   'value': '$600',
   'answer': 'imp',
   'round': 'Jeopardy!',
   'show_number': '4680'},
  0),
 ({'category': 'HISTORY',
   'air_date': '2004-12-31',
   'question': "'Karl led the first of these Marxist organizational efforts; the second one began in 1889'",
   'value': '$800',
   'answer': 'the International',
   'round': 'Jeopardy!',
   'show_number': '4680'},
  1),
 ({'category': "ESPN's TOP 10 ALL-TIME ATHLETES",
   'air_date': '2004-12-31',
   'question': '\'No. 10: FB/LB for Columbia U. in the 1920s; MVP for the Yankees in \'27 & \'36; "Gibraltar in Cleats"\'',
   'value': '$800',
   'answer': '(Lou) Gehrig',
   'round': 'Jeopardy!',
   'show_number': '4680'},
  1),
 ({'category': 'EVERYBODY TALKS ABOUT IT...',
   'air_date': '2004-12-31',
   'question': "'Africa's lowest temperature was 11 degrees below zero in 1935 at Ifrane, 

In [14]:
allquestions = [ ' '.join(clean_wordlist([x])) for x in data ]

In [15]:
df = pd.DataFrame({'filename':data,
                 'smart':highval,
                  'text':allquestions})

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(df.text, df.smart,
                                                   random_state=1) 

In [17]:
tfidf_vectorize = TfidfVectorizer(use_idf=True)
X_train_tf = tfidf_vectorize.fit_transform(X_train)
X_test_tf = tfidf_vectorize.transform(X_test)

In [18]:
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tf, Y_train)
predictions = naive_bayes.predict(X_test_tf)

In [19]:
print('Accuracy: ', accuracy_score(Y_test, predictions))

Accuracy:  0.7660465030516476
