In [177]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.decomposition import LatentDirichletAllocation, NMF

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
import string

In [29]:
def clean_columns(df, col):
    '''
    Turn one text column into a flat list of cleaned word tokens.

    Every value in df[col] is joined into a single string (so the values
    must all be strings), tokenized with nltk's word_tokenize, lower-cased
    and stripped of punctuation characters. Tokens that are not purely
    alphanumeric afterwards (including ones that became empty) and English
    stopwords are dropped. For the 'notes' column a few extra
    show-specific words are dropped as well.

    returns a list of the remaining tokens, in their original order
    '''
    punctuation_remover = str.maketrans('', '', string.punctuation)
    # word_tokenize runs right here; lower-casing and punctuation removal
    # are then applied token by token as the final list is built.
    cleaned = (
        token.lower().translate(punctuation_remover)
        for token in word_tokenize(' '.join(df[col]))
    )

    excluded = set(stopwords.words('english'))
    if col == 'notes':
        #TODO: add another set of stopwords for the notes
        excluded |= {
            'final', 'quarterfinal', 'game', 'jeopardy!', 'semifinal',
            'round', 'tournament', 'week', 'reunion', 'ultimate',
            'night', 'jeopardy', 'games',
        }

    return [tok for tok in cleaned if tok.isalnum() and tok not in excluded]

In [30]:
def make_q_and_a_col(df):
    """
    Add a 'question_and_answer' column made of each row's 'question'
    text, a single space, and its 'answer' text.

    The column is written onto the DataFrame that was passed in (no copy
    is made) and that same object is returned.

    Args:
        df (Pandas DataFrame): must contain 'question' and 'answer' columns
    Returns:
        Pandas DataFrame: `df` itself, now with the additional column
    """
    questions = df["question"]
    answers = df['answer']
    combined = questions + ' ' + answers
    df['question_and_answer'] = combined
    return df

def make_q_difficulty_col(df):
    """
    Add a 'question_difficulty' column ('easy', 'average' or 'hard').

    Rules, checked in this order (the first match wins):
      * round 3 (final jeopardy)                          -> hard
      * not a daily double, value <= 600                  -> easy
      * not a daily double, value 800 or 1200             -> average
      * daily double in round 1                           -> average
      * not a daily double, value 1000 or >= 1600         -> hard
      * daily double in round 2                           -> hard
    Rows matching none of the rules get the string '0'.

    The column is written onto the DataFrame that was passed in.

    Args:
        df (Pandas DataFrame): must contain 'value', 'daily_double'
            ('yes'/'no') and 'round' columns
    Returns:
        Pandas DataFrame: `df` itself, with the additional column
    """
    regular_clue = df['daily_double'] == 'no'
    daily_double = df['daily_double'] == 'yes'

    # np.select keeps the FIRST condition that matches a row, so the final
    # jeopardy rule has to come before the value-based rules: otherwise a
    # round-3 row with value <= 600 and daily_double == 'no' is labelled
    # 'easy' and never reaches the 'hard' rule.
    conditions = [(df['round'] == 3),                                        # final jeopardy, hard
                  ((df['value'] <= 600) & regular_clue),                     # easy
                  (regular_clue & ((df['value'] == 800) | (df['value'] == 1200))),   # average
                  (daily_double & (df['round'] == 1)),                       # average
                  (regular_clue & ((df['value'] == 1000) | (df['value'] >= 1600))),  # hard
                  (daily_double & (df['round'] == 2))]                       # hard

    difficulties = ['hard', 'easy', 'average', 'average', 'hard', 'hard']

    # The default must be a string: np.select's implicit default is the
    # integer 0, which recent NumPy refuses to combine with string choices
    # (TypeError). '0' is the label older NumPy produced for unmatched rows.
    df['question_difficulty'] = np.select(conditions, difficulties, default='0')
    return df

def update_df_columns(df):
    """Add both derived columns to a clue DataFrame.

    Passes `df` through make_q_and_a_col and then hands that result to
    make_q_difficulty_col.

    Args:
        df (Pandas DataFrame): the frame given to make_q_and_a_col.

    Returns:
        Whatever make_q_difficulty_col returns for that frame.
    """
    return make_q_difficulty_col(make_q_and_a_col(df))

In [31]:
def read_tsv(filepath):
    """Load a tab-separated file into a DataFrame.

    Args:
        filepath (string): location of the tsv file; handed unchanged
            to pandas.read_csv
    Returns:
        Pandas DataFrame
    """
    # `delimiter` is pandas' alias for `sep`
    frame = pd.read_csv(filepath, delimiter="\t")
    return frame



In [42]:
def make_sub_df(df, fraction = .05, state = 123):
    """Draw a reproducible random subset of a DataFrame's rows.

    Args:
        df (Pandas DataFrame): frame to sample from.
        fraction (float, optional): share of rows to keep, passed to
            DataFrame.sample as `frac`. Defaults to .05.
        state (int, optional): seed passed as `random_state`, so the same
            rows are drawn on every call. Defaults to 123.

    Returns:
        Pandas DataFrame: the sampled rows, keeping their original index
        labels.
    """
    sampling_options = {'frac': fraction, 'axis': 0, 'random_state': state}
    return df.sample(**sampling_options)


In [43]:
# Load the clue data and add the derived columns.
jeopardy_df = read_tsv('../data/master_season1-35.tsv')
jeopardy_df = update_df_columns(jeopardy_df)
# Rows whose `notes` value is exactly '-' are treated as regular episodes
# (presumably '-' means "no special note" -- confirm against the data).
regular_episodes = jeopardy_df[jeopardy_df['notes']=='-']
# Every remaining row, i.e. any row with some other `notes` value.
special_tournaments = jeopardy_df.drop(regular_episodes.index)

# Seeded random sample of the regular episodes, using make_sub_df's defaults.
regular_episode_sub = make_sub_df(regular_episodes)

## KMEANS

### fit a countvectorizer over the training sample

In [35]:
# Fit the vectorizer on the sampled regular-episode text and transform it
# in the same step.
# TODO: adjust the hyperparameters
count_vect = CountVectorizer(ngram_range = (1, 1),
                            lowercase=True,  tokenizer=None,
                            stop_words='english', analyzer='word',
                              max_features=None)

# `X_train_sample` is never defined in this notebook, so the cell failed on
# a fresh kernel. The sampled regular episodes are the only sample built
# above, and this is the same column kmeans_cluster is called with below.
x = count_vect.fit_transform(regular_episode_sub['question_and_answer'])
# x is sparse; use x.toarray() only on small samples
# Terms in column order. Built from vocabulary_ (term -> column index) so it
# works on every scikit-learn version (get_feature_names was removed in 1.2).
features = sorted(count_vect.vocabulary_, key=count_vect.vocabulary_.get)
# count_vect.vocabulary_

In [36]:
#can do the same as above with the categories 

#TODO: do it here

#### Use kmeans clustering

In [37]:
from sklearn.cluster import KMeans


In [49]:
def kmeans_cluster(df, col, n, n_clusters=10):
    """Cluster a text column with k-means and list each cluster's top terms.

    The column is turned into a bag-of-words count matrix (English stop
    words removed, unigrams only), k-means is fitted on it, and for every
    centroid the `n` terms with the largest centroid weight are looked up.

    Args:
        df (Pandas DataFrame): frame holding the text.
        col (str): name of the text column to cluster.
        n (int): number of top terms to report per cluster.
        n_clusters (int, optional): number of k-means clusters.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        tuple: (the string f'n = {n}', array of shape (n_clusters, n)
        holding each cluster's terms, heaviest first). Fewer than `n`
        columns are returned if the vocabulary has fewer than `n` terms.
    """
    #adjust the hyperparameters
    count_vect = CountVectorizer(ngram_range = (1, 1),
                            lowercase=True,  tokenizer=None,
                            stop_words='english', analyzer='word',
                            max_features=None)

    x = count_vect.fit_transform(df[col])
    kmeans = KMeans(n_clusters = n_clusters, random_state = 123).fit(x)
    centroids = kmeans.cluster_centers_
    # argsort is ascending, so reverse every row and keep the first n
    # columns. The previous slice `:-n+1:-1` stopped two positions early:
    # it returned n-2 terms, nothing at all for n=2, and nearly the whole
    # vocabulary for n=1.
    top_n = np.argsort(centroids, axis=1)[:, ::-1][:, :n]

    # Terms in column order, built from vocabulary_ (term -> column index)
    # so this works on every scikit-learn version.
    vocab = count_vect.vocabulary_
    name_arr = np.array(sorted(vocab, key=vocab.get))
    return f'n = {n}', name_arr[top_n]

# kmeans_cluster(regular_episode_sub, 'question_and_answer', 10)
#This did kind of a terrible job 

## NEXT STEPS 1
- Look at the `modeling with nmf` and case study code
- Use a TF-IDF transformer
- pass it into an NMF to get out soft clusters


## Use NMF


In [167]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

In [168]:
# NOTE(review): n_samples is not referenced anywhere else in the visible notebook.
n_samples = 2000
# Vocabulary size cap, passed to TfidfVectorizer as max_features.
n_features = 100
# Number of NMF components (latent topics).
n_topics = 10
# How many terms to print for each topic.
n_top_words = 20

In [169]:
# NOTE(review): this bare expression is not the last statement in the cell,
# so it displays nothing.
regular_episode_sub['question_and_answer']
# Use the clue category as the row label; the NMF cells below use this
# index to label the rows of W.
regular_episode_reindexed = regular_episode_sub.set_index('category')
regular_episode_reindexed

Unnamed: 0_level_0,round,value,daily_double,comments,answer,question,air_date,notes,question_and_answer,question_difficulty
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
WHAT'S WRONG WITH YOU?,2,800,no,-,Underactivity of this butterfly-shaped gland s...,Thyroid gland,2001-02-23,-,Thyroid gland Underactivity of this butterfly-...,average
GEOMETRY,2,2000,no,-,(Cheryl of the Clue Crew in front of a blackbo...,eccentric,2003-10-03,-,eccentric (Cheryl of the Clue Crew in front of...,hard
BIBLE PEOPLE MAGAZINE,1,400,no,(Alex: That sounds like fun!),Star Tracks says that scholars believe these s...,Magi/three wise men,2003-06-24,-,Magi/three wise men Star Tracks says that scho...,easy
ODE TO ENGLAND,1,800,no,-,Headquarters was needed for Peel's new police ...,Scotland Yard,2003-04-01,-,Scotland Yard Headquarters was needed for Peel...,average
"\""C"" IN SCIENCE",2,2000,no,-,"In order to duplicate itself in cell division,...",chromosomes,2003-09-10,-,chromosomes In order to duplicate itself in ce...,hard
...,...,...,...,...,...,...,...,...,...,...
BY THE SEASHORE,1,400,no,-,The dried skeleton of this invertebrate resemb...,Sand Dollar,1996-12-04,-,Sand Dollar The dried skeleton of this inverte...,easy
OPERA,2,800,no,-,"Euridice is a soprano role in Monteverdi's ""Th...",Orpheus,1997-07-14,-,Orpheus Euridice is a soprano role in Montever...,average
FURNITURE,1,300,no,-,"A long seat designed for 2 or more people, it ...",settee,1989-10-12,-,settee A long seat designed for 2 or more peop...,easy
RINGING THE OPENING BELL AT THE NYSE,1,400,no,-,"On Oct. 11, 2007 this chairman of the Virgin G...",(Richard) Branson,2008-07-22,-,"(Richard) Branson On Oct. 11, 2007 this chairm...",easy


In [170]:
# TF-IDF over the sampled text: English stop words removed, terms must occur
# in at least 2 documents and in at most 95% of them, and the vocabulary is
# capped at n_features terms.
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                             stop_words='english')

# Learn the vocabulary and build the document-term matrix in one step.
tfidf = vectorizer.fit_transform(regular_episode_reindexed['question_and_answer'])
# tfidf.toarray().shape
# vectorizer.vocabulary_.keys()

In [171]:
# Factorize the TF-IDF matrix into n_topics components (seeded).
nmf = NMF(n_components=n_topics, random_state=123)
nmf.fit(tfidf)

# W: one row per document, one column per topic.
W = nmf.transform(tfidf)
# H: one row per topic, one column per vocabulary term.
H = nmf.components_

In [172]:
topics = ['latent_topic_{}'.format(i) for i in range(n_topics)]
# Rows of W are labelled with each clue's category.
idx = regular_episode_reindexed.index
# Terms in column order. `vocabulary_` maps term -> column index, but its
# keys iterate in insertion order rather than column order, so using
# `.keys()` directly attached the wrong words to the columns of H.
col = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

W = pd.DataFrame(W, index = idx, columns = topics)
H = pd.DataFrame(H, index = topics, columns = col)

W,H = (np.around(x, 2) for x in (W, H))

# print(W.head(30), '\n\n', H.head(n_topics))

In [173]:
# Terms in column order. Built from vocabulary_ (term -> column index), which
# gives the same list get_feature_names() returned but also works on
# scikit-learn >= 1.2, where that method no longer exists.
feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
# feature_names

In [174]:
# Print the n_top_words heaviest terms of every NMF topic, heaviest first.
for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    # argsort is ascending: reverse it, then keep the leading n_top_words
    heaviest = topic.argsort()[::-1][:n_top_words]
    top_terms = [feature_names[i] for i in heaviest]
    print(" ".join(top_terms))
    print()

print (f'reconstruction error: {nmf.reconstruction_err_}')

Topic #0:
city new capital york largest world home founded war french famous west st south old north american born said island

Topic #1:
man said george time big old born woman book president founded tv song term french author people got life james

Topic #2:
state new capital island people york west national north largest south river 000 ll lake home set white term year

Topic #3:
country king south world war largest island great american people capital west hit national years day president north century sea

Topic #4:
named american president island company founded river born south century red family national north french used year greek henry little

Topic #5:
like just said wrote hit old says big don author make novel means tv life james 10 letter george seen

Topic #6:
called used new great group british book william long famous life king meaning century river work don island latin year

Topic #7:
type used day seen make work war known time white star black letter river red term 

In [175]:
def nm_factorize(df, col, n_features, n_topics, n_top_words, return_frames=False):
    """Fit TF-IDF + NMF on a text column and print the top terms per topic.

    Prints, for every topic, its `n_top_words` heaviest terms (heaviest
    first), followed by the model's reconstruction error.

    Args:
        df (Pandas DataFrame): frame holding the text; its index labels
            the rows of W.
        col (str): name of the text column to factorize.
        n_features (int): vocabulary size cap (TfidfVectorizer max_features).
        n_topics (int): number of NMF components.
        n_top_words (int): number of terms printed per topic.
        return_frames (bool, optional): when True, also build and return
            the labelled factor matrices. Defaults to False, which keeps
            the original print-only behaviour.

    Returns:
        None by default. With return_frames=True, a tuple (W, H) of
        DataFrames rounded to 2 decimals: W is documents x topics,
        H is topics x terms.
    """

    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                             stop_words='english')
    tfidf = vectorizer.fit_transform(df[col])

    nmf = NMF(n_components=n_topics, random_state=123)
    nmf.fit(tfidf)

    # Terms in column order. vocabulary_ maps term -> column index; its keys
    # iterate in insertion order, so `.keys()` must not be used as column
    # labels. Sorting by index also avoids get_feature_names(), which was
    # removed in scikit-learn 1.2.
    vocab = vectorizer.vocabulary_
    feature_names = sorted(vocab, key=vocab.get)

    for topic_idx, topic in enumerate(nmf.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()
    print (f'RECONSTRUCTION ERROR: {nmf.reconstruction_err_}')

    if not return_frames:
        return None

    # The labelled factor matrices are built only on request; they used to
    # be computed on every call and then thrown away.
    topics = ['latent_topic_{}'.format(i) for i in range(n_topics)]
    W = pd.DataFrame(nmf.transform(tfidf), index = df.index, columns = topics)
    H = pd.DataFrame(nmf.components_, index = topics, columns = feature_names)
    W,H = (np.around(x, 2) for x in (W, H))
    return W, H


In [176]:
# Same factorization, this time on all regular episodes rather than the sample.
regular_episodes_reindexed = regular_episodes.set_index('category')
# NOTE(review): these repeat the values assigned in the earlier constants cell.
n_samples = 2000
n_features = 100
n_topics = 10
n_top_words = 20

nm_factorize(regular_episodes_reindexed, 'question_and_answer', n_features, n_topics, n_top_words)

Topic #0:
city capital largest home known world river st 000 famous york born south french north west people national years war

Topic #1:
man john film said title wrote played king president novel years george book born known james old time won play

Topic #2:
country king world south american war president island known years largest capital national people born north great 000 west home

Topic #3:
called book great american people novel john french group little king wrote known white world war life century game sea

Topic #4:
state capital south north island river home largest national west president known house 000 years war george born 1st said

Topic #5:
type known letter seen make white red water comes long latin house black work french says greek don 000 book

Topic #6:
named american french island john king century president british river company year woman known james war born character capital national

Topic #7:
like said just term wrote means don title song film group says 

#### Choosing K:
- Plot the reconstruction error for different values of k (elbow plot).
- Look at the cosine similarity of items within topics (should be similar) and
between topics (should be dissimilar). A score of 1 means the same orientation; a score near 0 means little or no similarity.

## LDA

In [182]:
# Build the bag-of-words matrix for the sampled regular episodes.
# (`regular_episodes_sub` is not defined anywhere in this notebook; the sample
# created above is `regular_episode_sub`.)
count_vectorizer = CountVectorizer(min_df=10, max_df=0.95, ngram_range=(1,1), stop_words='english')
feature_matrix = count_vectorizer.fit_transform(regular_episode_sub['question_and_answer'])

# Instantiate the LDA model
lda_model = LatentDirichletAllocation(n_components=2, max_iter=100, learning_method='online', random_state=43,
                                     batch_size=128, evaluate_every=-1, n_jobs=-1)

# fit transform the feature matrix
lda_output = lda_model.fit_transform(feature_matrix)

# display the lda_output and its shape
lda_output

KeyboardInterrupt: 

In [None]:
lda_output.shape

## From the case study 

In [66]:
def get_word_indices(df, col_name):
    """Build the term -> column-index vocabulary for one text column.

    Args:
        df (Pandas DataFrame): frame holding the text.
        col_name (str): name of the text column.

    Returns:
        dict: the fitted CountVectorizer's `vocabulary_` (lower-cased
        word tokens with English stop words removed, mapped to their
        column index).
    """
    words = df[col_name].values
    count_vect = CountVectorizer(lowercase=True, tokenizer=None, stop_words='english',
                             analyzer='word', max_df=1.0, min_df=1,
                             max_features=None)
    # fit() alone learns the vocabulary. The extra transform() pass that
    # used to follow built a full count matrix and discarded it.
    count_vect.fit(words)
    return count_vect.vocabulary_

In [67]:
def hand_label_topics(H, vocabulary, n_top_words=20):
    '''
    Print the most influential words of each latent topic so a human can
    read them and decide what each topic is capturing.

    No labels are collected yet: the function only prints and always
    returns an empty list.

    H: iterable of 1-D weight rows, one per topic (e.g. NMF components_)
    vocabulary (numpy array): terms, aligned with the columns of H
    n_top_words (int): how many words to print per topic (default 20)

    Returns an empty list (placeholder for the hand-written labels).
    '''
    hand_labels = []
    for i, row in enumerate(H):
        # argsort is ascending: reverse it so the heaviest terms come first
        top_words = np.argsort(row)[::-1][:n_top_words]
        print('topic', i)
        print('-->', ' '.join(vocabulary[top_words]))
    return hand_labels


In [68]:
def get_topics(df, col, num):
    '''
    Fit an NMF topic model on the TF-IDF matrix of one text column, print
    the reconstruction error and the top words of every topic.

    df (Pandas DataFrame): frame holding the text
    col (str): input the column name we want to get the latent topics of
    num (int): number of topics we want to get

    Returns whatever hand_label_topics returns for the fitted topics.
    '''
    words = df[col].values
    # No custom `preprocessor`: that argument must be a callable applied to
    # each document, but it was being given the list returned by
    # clean_columns(df, col). The vectorizer's own lowercase / strip_accents /
    # stop_words options already handle the cleaning.
    vectorizer = TfidfVectorizer(stop_words = 'english', strip_accents = 'ascii', ngram_range=(1, 2),
                                 lowercase = True)
    tfidf = vectorizer.fit_transform(words)
    # Terms in column order (vocabulary_ maps term -> column index); this
    # works on every scikit-learn version.
    vocab = vectorizer.vocabulary_
    vocabulary = np.array(sorted(vocab, key=vocab.get))

    # `alpha=0.0` is dropped: it only restated the no-regularization default
    # and the keyword no longer exists in scikit-learn >= 1.2.
    nmf_model = NMF(n_components=num, max_iter=100, random_state=12345)
    # Fit on the TF-IDF matrix; the raw strings used to be passed here.
    nmf_model.fit(tfidf)
    H = nmf_model.components_
    print('reconstruction error:', nmf_model.reconstruction_err_)

    return hand_label_topics(H, vocabulary)

#### Use SVD or PCA

In [24]:
# plt.show(s[:10])

num_top_words = 8
def show_topics(a):
    """Summarise each row of `a` as a space-joined string of its top terms.

    For every row, the positions of the `num_top_words` largest values
    are looked up in the module-level `vocab` sequence, largest first.
    Both `num_top_words` and `vocab` are read from the enclosing scope.

    Returns a list with one string per row of `a`.
    """
    summaries = []
    for weights in a:
        # argsort is ascending; this slice walks back from the largest value
        best_first = np.argsort(weights)[:-num_top_words-1:-1]
        summaries.append(' '.join([vocab[i] for i in best_first]))
    return summaries

#show_topics(v[:10])

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

## NEXT STEPS 2
- Look at the `clustering` assignments and the PDFs
- k-means score
- silhouette score
- try `MiniBatchKMeans`
- hierarchical clustering
- change the above work so that it prints the category instead of the index

<p>
    
- output of vectorizer is the array of the bag of words
- vocabulary attribute gives word mapped to the index 

#### Soft Cluster
- get a TFIDF matrix
- pass the TFIDF as a feature to the NMF 
- when I print what's in a topic, I want to print the 'category' of the question class associated with the loadings on that 'category'
- identify some outliers



## NEXT STEPS 3
- Read through the Naive Bayes notes, documentation, and assignments
- Write a Naive Bayes classifier to classify easy, average, and hard questions
- cross validate / score 

## topics to keep track of

- clustering (k-means, hierarchical)
- pca 
- svd
- nmf

In [None]:


#### Naive Bayes
- Construct a naive bayes on the words