In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
import string

In [61]:
def clean_columns(df, col):
    '''
    using a pre-made function 
    returns a list of the tokenized and stripped of stopwords 
    '''
    text = ' '.join(df[col])
    tokens = word_tokenize(text)
    # converts the tokens to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]

    words = [word for word in stripped if word.isalnum()]
    
    # filter out stop words
    if col == 'notes':
        #TODO: add another set of stopwords for the notes
        remove_words = {'final', 'quarterfinal', 'game', 'jeopardy!', 'semifinal', 'round', 'tournament', 'week', 'reunion', 'ultimate', 'night', 'jeopardy', 'night', 'games'}
        stopwords_set = (set(stopwords.words('english'))).union(remove_words)
    else:
        stopwords_set = set(stopwords.words('english'))
    words = [w for w in words if not w in stopwords_set]
    return words

In [28]:
def make_q_and_a_col(df):
    """
    Makes a column that concatenates the strings
    from the question and answer columns

    Args:
        df (Pandas DataFrame): 
    Returns:
        Pandas DataFrame with an additional column
    """    
    df['question_and_answer'] = df["question"] + ' ' + df['answer']
    return df

def make_q_difficulty_col(df):
    conditions = [((df['value']<=600) & (df['daily_double']=='no')), #easy
                ((df['daily_double']=='no') & ((df['value']==800) | (df['value']==1200))), #average
                ((df['daily_double']== 'yes') & (df['round'] == 1)), #average
                ((df['daily_double']=='no') & ((df['value']==1000) | (df['value']>=1600))), #hard
                ((df['daily_double']== 'yes') & (df['round'] == 2)), #hard
                (df['round'] == 3)] # final jeopardy, hard 

    difficulties = ['easy', 'average', 'average', 'hard', 'hard', 'hard']

    df['question_difficulty'] = np.select(conditions, difficulties)
    return df

#TODO: write docstring
def update_df_columns(df):
    """[summary]

    Args:
        df ([type]): [description]

    Returns:
        [type]: [description]
    """    
    df_new = make_q_and_a_col(df)
    df_new = make_q_difficulty_col(df_new)
    return df_new

In [29]:
def read_tsv(filepath):
    """Reads in a tsv file

    Args:
        filepath (string): filepath and file name of the 
            tsv file to be read into as a pandas dataframe
    Returns:
        Pandas DataFrame
    """    
    return pd.read_csv(filepath, sep = "\t")



In [30]:
def make_train_test_sets(df, x_cols, y_col, test_size = .25, random_state = 123):
    """[summary]

    Args:
        df ([type]): [description]
        x_cols (string or list of strings): the columns to use as training features
        y_col (string): the column to use as the target 
        test_size (float, optional): size of the test set. Defaults to .25.
        random_state (int, optional): random state. Defaults to 123.

    Returns:
        tuple: X_train, X_test, y_train, y_test 
            X_train and X_test are either
            Pandas DataFrame if >1 columns are passed as features, 
            or Series object if 1 column is passed
            y_train, y_test are Series objects
            
    """    
    X = df[x_cols]
    y = df[y_col]
    X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.33, random_state=42)

    return X_train, X_test, y_train, y_test

In [31]:
def build_text_vectorizer(X_train):
    """[summary]
    Args:
        text (string or series): the text that will be fit to the 
                text_vectorizer whose words will be counted 
        use_tfidf (bool, optional): . Defaults to True.
        stop_words (string, optional). Defaults to 'english'
        use_stemmer (bool, optional): [description]. Defaults to False.
        max_features ([type], optional): [description]. Defaults to None.
    Returns:
    the a list of strings that are the words that appear in the text
    """       
#   count_vect = CountVectorizer()
    count_vect = CountVectorizer(ngram_range = (1, 2), 
                            lowercase=True, tokenizer=None, 
                            stop_words='english', analyzer='word',  
                            max_features=None)
    x_train_vectors = count_vect.fit_transform(X_train)
    # print count_vect.vocabulary_
    return x_train_vectors

In [80]:
#read in the dataset and build the training and testing on regular episodes 

jeopardy = read_tsv('../data/master_season1-35.tsv')
jeopardy_df = update_df_columns(jeopardy)
regular_episodes = jeopardy_df[jeopardy_df['notes']=='-']
special_tournament = jeopardy_df.drop(regular_episodes.index)
regular_episodes

x_cols = 'question_and_answer'
y_col = 'question_difficulty'
X_train, X_test, y_train, y_test = make_train_test_sets(regular_episodes, x_cols, y_col, test_size = .25, random_state = 123)

X_train_sample = X_train.sample(frac = .1, axis = 0, random_state = 123)

## KMEANS

In [91]:
#fit and transofrm the x_train, transofrm the x_test
count_vect = CountVectorizer(ngram_range = (1, 1), 
                            lowercase=True,  tokenizer=None, 
                            stop_words='english', analyzer='word',  
                            max_features=None)

x = count_vect.fit_transform(X_train_sample)
# x.toarray()
features = count_vect.get_feature_names()
# count_vect.vocabulary_

In [92]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 10, random_state = 123).fit(x)

In [93]:
centroids = kmeans.cluster_centers_
top_10 = np.argsort(centroids)[:, :-11:-1]
top_10

array([[26406, 10887, 16402, 14955,   327, 29538,  7375,  4099, 31199,
           22],
       [ 6248,  7355, 19421, 17641,  5130, 19698, 16845, 11074, 29538,
        15426],
       [  665, 18441, 27423, 24796, 15433, 31162, 29736, 16128, 11975,
        28739],
       [20004, 28791,  2824, 31281,  3268, 28091,  5850, 22823, 16895,
        28596],
       [29898, 16044, 29538, 31199,  3697,  6248,  5130,  3721, 19421,
        17562],
       [22429, 30195, 17641,  7355, 15426, 15298, 24796,  9742, 20248,
        24470],
       [30574,  6250, 31218,  3338, 14442, 24070, 26770, 21217, 31401,
         1920],
       [28412, 20308, 31199, 31396, 16845, 16728, 29538, 26292, 11675,
        17641],
       [19505, 16322, 15426,  2093,  4694, 14131,  5197,  6299,  3577,
         4068],
       [27159,  6248, 19698,  5291, 19421,  1669, 19946, 11341, 21011,
         5119]])

In [94]:
names = count_vect.get_feature_names()

name_arr = np.array(names)
name_arr[top_10]

array([['small', 'feet', 'large', 'intestine', '17', 'type', 'court',
        'boat', 'word', '10'],
       ['city', 'country', 'named', 'man', 'called', 'new', 'like',
        'film', 'type', 'john'],
       ['2004', 'meryl', 'streep', 'said', 'joined', 'women', 'union',
        'kramer', 'games', 'time'],
       ['novel', 'title', 'author', 'wrote', 'based', 'tale',
        'character', 'published', 'line', 'thomas'],
       ['used', 'known', 'type', 'word', 'best', 'city', 'called',
        'better', 'named', 'make'],
       ['president', 'vice', 'man', 'country', 'john', 'jefferson',
        'said', 'elected', 'office', 'roosevelt'],
       ['war', 'civil', 'world', 'battle', 'ii', 'revolutionary',
        'spanish', 'peace', 'years', 'american'],
       ['term', 'old', 'word', 'year', 'like', 'letter', 'type', 'slang',
        'french', 'man'],
       ['national', 'lampoon', 'john', 'animal', 'brothers', 'house',
        'campus', 'classic', 'belushi', 'blues'],
       ['state', '

## NEXT STEPS 1
- Look at the `modeling with nmf` and case study code
- use a tdidf transformer 
- pass it into an NMF to get out soft clusters


## NEXT STEPS 2
- look at the `clustering` assignments and the pdfs
- kmean score
- silhouette score
- try MiniBatchKMean 
- heirarchical clustering 
- change the above work so that instead of index, it's printing the category

## NEXT STEPS 3
- Read through the naive bayes notes, documentation, asssignments
- write a Naive Bayes clasifier to classify easy, hard, average questions
- cross validate / score 

## topics to keep track of

- clustering (k-means, heirarchical)
- pca 
- svd
- nmf

In [34]:
from sklearn.feature_extraction.text import TfidfTransformer

# just do term frequency at first, use_idf = False
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_vectors)
X_train_tf = tf_transformer.transform(X_train_vectors)
words_tf = X_train_tf
words_tf

<186749x1172622 sparse matrix of type '<class 'numpy.float64'>'
	with 3226460 stored elements in Compressed Sparse Row format>

### Clustering from case study

In [37]:
def get_word_indices(df, col_name):
    words = df[col_name].values
    count_vect = CountVectorizer(lowercase=True, tokenizer=None, stop_words='english',
                             analyzer='word', max_df=1.0, min_df=1,
                             max_features=None)
    # count_vect = CountVectorizer(ngram_range = (1, 2), use_tfidf=True, lowercase=True, 
    #                             use_stemmer=False, tokenizer=None, stop_words='english',  
    #                             max_features=None)
    count_vect.fit(words)
    count_vect.transform(words)
    return count_vect.vocabulary_ 

In [38]:
def hand_label_topics(H, vocabulary):
    '''
    Print the most influential words of each latent topic, and prompt the user
    to label each topic. The user should use their humanness to figure out what
    each latent topic is capturing.
    '''
    hand_labels = []
    for i, row in enumerate(H):
        top_five = np.argsort(row)[::-1][:20]
        print('topic', i)
        print('-->', ' '.join(vocabulary[top_five]))
    return hand_labels


In [66]:
def get_topics(df, col, num):
    '''
    col_name (str): input the column name we want to get the latent topics of 
    num (int): number of topics we want to get
    '''
    words = df[col].values
    vectorizer = TfidfVectorizer(stop_words = 'english', strip_accents = 'ascii', ngram_range=(1, 2), 
                                 lowercase = True, preprocessor = clean_columns(df, col))
    vectorizer.fit_transform(words)
    vectorizer.vocabulary_
    vocabulary = vectorizer.get_feature_names()
    vocabulary = np.array(vocabulary)
    

    nmf_model = NMF(n_components=num, max_iter=100, random_state=12345, alpha=0.0)
    W = nmf_model.fit_transform(words)
    H = nmf_model.components_
    print('reconstruction error:', nmf_model.reconstruction_err_)

    return hand_label_topics(H, vocabulary)

## Naive Bayes