In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
import string

In [27]:
def clean_columns(df, col):
    '''
    Tokenize one text column of a DataFrame into a flat list of cleaned words.

    Every non-missing cell of ``df[col]`` is joined into a single string,
    tokenized with NLTK's ``word_tokenize``, lower-cased, stripped of ASCII
    punctuation, and reduced to purely alphabetic tokens that are not
    stopwords.

    Args:
        df (Pandas DataFrame): frame that contains the text column
        col (string): name of the column to clean. For 'notes', a set of
            extra show-specific words is removed in addition to NLTK's
            English stopwords.
    Returns:
        list of strings: the remaining words in their original order
            (duplicates are kept)
    '''
    # Skip missing cells and coerce the rest to str: str.join raises a
    # TypeError as soon as it meets a NaN/None value.
    text = ' '.join(df[col].dropna().astype(str))
    tokens = word_tokenize(text)

    # Lower-case each token and delete ASCII punctuation characters from it.
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.lower().translate(table) for w in tokens]

    # Keep alphabetic tokens only (drops numbers, empty strings and mixed tokens).
    words = [word for word in stripped if word.isalpha()]

    # Build the stop-word set once; 'notes' gets extra words on top.
    stopwords_set = set(stopwords.words('english'))
    if col == 'notes':
        # Only alphabetic entries are listed: a token containing punctuation
        # cannot reach this point, so such an entry would never match.
        stopwords_set |= {'final', 'quarterfinal', 'game', 'semifinal',
                          'round', 'tournament', 'week', 'reunion',
                          'ultimate', 'night', 'jeopardy', 'games'}

    return [w for w in words if w not in stopwords_set]

In [18]:
def make_q_and_a_col(df):
    """
    Makes a column that concatenates the strings
    from the question and answer columns

    The new column is named 'question_and_answer' and holds
    ``question + ' ' + answer`` for every row. A missing question or
    answer is treated as an empty string so the combined value is always
    a string; without this, one missing cell would turn the whole combined
    value into NaN.

    Args:
        df (Pandas DataFrame): frame with 'question' and 'answer' columns.
            It is modified in place.
    Returns:
        Pandas DataFrame with an additional column (the same object
        that was passed in)
    """
    # fillna before astype(str), otherwise missing cells would become
    # the literal text 'nan'.
    question = df["question"].fillna('').astype(str)
    answer = df['answer'].fillna('').astype(str)
    df['question_and_answer'] = question + ' ' + answer
    return df

def make_q_difficulty_col(df, unmatched_label='0'):
    """
    Adds a 'question_difficulty' column derived from the clue's value,
    daily-double flag and round.

    Rules (the first matching rule wins):
        * round 3 (final jeopardy)                          -> 'hard'
        * not a daily double, value <= 600                  -> 'easy'
        * not a daily double, value is 800 or 1200          -> 'average'
        * daily double in round 1                           -> 'average'
        * not a daily double, value is 1000 or >= 1600      -> 'hard'
        * daily double in round 2                           -> 'hard'

    Args:
        df (Pandas DataFrame): frame with 'value', 'daily_double'
            ('yes'/'no') and 'round' columns. It is modified in place.
        unmatched_label (string, optional): label for rows that match
            none of the rules. Defaults to '0'.
    Returns:
        Pandas DataFrame with an additional column (the same object
        that was passed in)
    """
    not_daily_double = df['daily_double'] == 'no'
    is_daily_double = df['daily_double'] == 'yes'
    value = df['value']
    game_round = df['round']

    # np.select uses the first condition that is true, so the final-jeopardy
    # rule goes first; otherwise a round-3 row that also satisfies one of the
    # value rules (e.g. value <= 600 without a daily double) would never
    # reach it.
    conditions = [
        game_round == 3,                                           # hard
        not_daily_double & (value <= 600),                         # easy
        not_daily_double & ((value == 800) | (value == 1200)),     # average
        is_daily_double & (game_round == 1),                       # average
        not_daily_double & ((value == 1000) | (value >= 1600)),    # hard
        is_daily_double & (game_round == 2),                       # hard
    ]
    difficulties = ['hard', 'easy', 'average', 'average', 'hard', 'hard']

    # Pass a string default explicitly: np.select otherwise falls back to the
    # integer 0, which has no common dtype with the string labels and is
    # rejected with a TypeError by newer NumPy releases.
    df['question_difficulty'] = np.select(
        conditions, difficulties, default=unmatched_label)
    return df

def update_df_columns(df):
    """Adds every derived column used by the analysis to the DataFrame.

    Runs make_q_and_a_col followed by make_q_difficulty_col on the frame.

    Args:
        df (Pandas DataFrame): frame accepted by both helper functions

    Returns:
        Pandas DataFrame: the frame returned by make_q_difficulty_col
    """
    return make_q_difficulty_col(make_q_and_a_col(df))

In [3]:
def read_tsv(filepath):
    """Loads a tab-separated file into a DataFrame

    Args:
        filepath (string): path (including the file name) of the
            tsv file to load
    Returns:
        Pandas DataFrame holding the file's contents
    """
    frame = pd.read_csv(filepath, sep="\t")
    return frame



In [21]:
def make_train_test_sets(df, x_cols, y_col, test_size = .25, random_state = 123):
    """Splits a DataFrame into training and testing features and targets

    Args:
        df (Pandas DataFrame): frame that holds the feature and target columns
        x_cols (string or list of strings): the columns to use as training features
        y_col (string): the column to use as the target 
        test_size (float, optional): size of the test set. Defaults to .25.
        random_state (int, optional): random state. Defaults to 123.

    Returns:
        tuple: X_train, X_test, y_train, y_test 
            X_train and X_test are either
            Pandas DataFrame if >1 columns are passed as features, 
            or Series object if 1 column is passed
            y_train, y_test are Series objects
            
    """    
    X = df[x_cols]
    y = df[y_col]
    # Forward the caller's settings so the documented arguments actually
    # control the split (they were previously replaced by fixed values).
    X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test

In [22]:
def build_text_vectorizer(X_train, return_vectorizer=False):
    """Fits a word-count vectorizer on the training text and vectorizes it

    The vectorizer counts unigrams and bigrams, lower-cases the text and
    drops scikit-learn's built-in English stop words.

    Args:
        X_train (iterable of strings, e.g. a Pandas Series): the training
            documents the vectorizer is fit on
        return_vectorizer (bool, optional): when True, the fitted
            CountVectorizer is returned as well, so other text (such as a
            test set) can be transformed with the same vocabulary.
            Defaults to False.
    Returns:
        The matrix returned by CountVectorizer.fit_transform for X_train,
        or the tuple (matrix, fitted CountVectorizer) when
        return_vectorizer is True
    """       
    count_vect = CountVectorizer(ngram_range = (1, 2), 
                            lowercase=True, tokenizer=None, 
                            stop_words='english', analyzer='word',  
                            max_features=None)
    x_train_vectors = count_vect.fit_transform(X_train)
    if return_vectorizer:
        return x_train_vectors, count_vect
    return x_train_vectors

In [61]:
# Load the full clue archive and add the derived text / difficulty columns.
jeopardy = read_tsv('../data/master_season1-35.tsv')
jeopardy_df = update_df_columns(jeopardy)

# Rows whose 'notes' value is '-' form the regular-episode frame; all
# remaining rows go into the special/tournament frame.
regular_episodes = jeopardy_df.loc[jeopardy_df['notes'] == '-']
special_tournament = jeopardy_df.drop(index=regular_episodes.index)

# Train/test split on the regular episodes: combined clue text as the
# feature, derived difficulty label as the target.
x_cols = 'question_and_answer'
y_col = 'question_difficulty'
X_train, X_test, y_train, y_test = make_train_test_sets(
    regular_episodes,
    x_cols,
    y_col,
    test_size=.25,
    random_state=123,
)

# Reproducible 10% sample of the training text.
X_train_sample = X_train.sample(frac=.1, axis=0, random_state=123)

In [65]:
# Fit the vocabulary on the training text only, then reuse the fitted
# vectorizer to encode the test text with that same vocabulary.
count_vect = CountVectorizer(
    analyzer='word',
    lowercase=True,
    tokenizer=None,
    stop_words='english',
    ngram_range=(1, 2),
    max_features=None,
)

X_train_vectors = count_vect.fit_transform(X_train)
X_test_vectors = count_vect.transform(X_test)

In [None]:
from sklearn import svm

# Linear-kernel support vector classifier trained on the word-count
# features; fit() returns the estimator, so the fitted model is bound directly.
clf_svm = svm.SVC(kernel = 'linear').fit(X_train_vectors, y_train)

# Predicted labels for the test vectors (displayed as the cell output).
clf_svm.predict(X_test_vectors)