In [1]:
import re
import nltk
import string
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plot
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import pyLDAvis.sklearn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Load the regular-episodes CSV into a DataFrame.
regular_episodes = pd.read_csv("../data/jeopardy_regular_episodes.csv")
# 1% random sample; seeded so that re-running the notebook draws the same rows.
sample = regular_episodes.sample(frac=0.01, random_state=43)
# Clue texts as a plain list of strings. Missing values become empty strings and
# everything is coerced to str, because re.sub() in the cleaning step raises
# TypeError on non-string entries such as NaN. The list length is unchanged.
text = regular_episodes['Question and Answer'].fillna('').astype(str).values.tolist()


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
regular_episodes.shape

(278730, 10)

In [None]:
def make_stopwords(filepath='../src/stopwords.txt'):
    """
    Build the stopword list used by the notebook.

    Reads custom stopwords from a text file and appends them to nltk's
    English stopword list.

    Parameters
    ----------
    filepath : str
        Path to a text file holding stopwords separated by commas
        (and/or newlines).

    Returns
    -------
    list of str
        nltk's English stopwords followed by the entries read from the file.

    Raises
    ------
    OSError
        If ``filepath`` cannot be opened.
    """
    # The context manager closes the file even when read() raises.
    with open(filepath, "r") as sw:
        raw = sw.read()

    # Split on commas/newlines and strip whitespace so that entries such as
    # "foo,bar" or a trailing "baz\n" do not end up as malformed stopwords.
    candidates = (word.strip() for word in re.split(r'[,\n]', raw))
    my_stopwords = [word for word in candidates if word]

    all_stopwords = stopwords.words('english')
    all_stopwords.extend(my_stopwords)
    return all_stopwords

In [None]:

def remove_hypens(text):
    """
    Replace every hyphen that joins two word characters with a single space.

    Works for any number of hyphenated parts ("a-b-c-d" -> "a b c d") and does
    not insert extra spaces. Hyphens that are not directly between two word
    characters (leading, trailing, or doubled) are left untouched.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The text with intra-word hyphens replaced by spaces.
    """
    # Look-arounds match only the hyphen itself, so adjacent hyphenated parts
    # cannot "consume" each other the way capture groups would.
    return re.sub(r'(?<=\w)-(?=\w)', ' ', text)


# tokenize text
def tokenize_text(text):
    """
    Split ``text`` into tokens on runs of whitespace.

    The regular expression describes the gaps between tokens (``gaps=True``),
    not the tokens themselves.
    """
    whitespace_splitter = nltk.RegexpTokenizer(pattern=r'\s+', gaps=True)
    return whitespace_splitter.tokenize(text)


def remove_characters_after_tokenization(tokens):
    """
    Strip ASCII punctuation from each token and drop tokens left empty.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to clean.

    Returns
    -------
    list of str
        Tokens with every ``string.punctuation`` character removed; tokens
        that consisted only of punctuation are omitted. A real list is
        returned (not a one-shot ``filter`` iterator), so the result can be
        measured, indexed and iterated more than once.
    """
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    stripped = (pattern.sub('', token) for token in tokens)
    return [token for token in stripped if token]


def convert_to_lowercase(tokens):
    """
    Lower-case the purely alphabetic tokens and discard all others.

    A token is kept only when ``str.isalpha()`` is true for it, so tokens
    containing digits (and empty strings) are dropped.
    """
    lowered = []
    for tok in tokens:
        if tok.isalpha():
            lowered.append(tok.lower())
    return lowered


def remove_stopwords(tokens, stop_words):
    """
    Drop every token that is an nltk English stopword or is in ``stop_words``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter.
    stop_words : iterable of str
        Additional stopwords to exclude on top of nltk's English list.

    Returns
    -------
    list of str
        The tokens that are not stopwords, in their original order.
    """
    # Set membership is O(1) per token, unlike scanning a list.
    excluded = _english_stopwords().union(stop_words)
    return [token for token in tokens if token not in excluded]


# Module-level cache so the nltk stopword corpus is looked up only once
# instead of on every call (this function runs once per clue).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return nltk's English stopwords as a frozenset, loading them on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def get_lemma(tokens):
    """
    Map each token to the base form returned by ``wn.morphy``.

    When ``wn.morphy`` returns ``None`` for a token, the token itself is kept.
    The output has the same length and order as the input.
    """
    def _base_form(word):
        found = wn.morphy(word)
        return word if found is None else found

    return [_base_form(word) for word in tokens]


def remove_short_tokens(tokens, min_length=4):
    """
    Keep only the tokens that have at least ``min_length`` characters.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter.
    min_length : int, optional
        Minimum number of characters a token needs in order to be kept.
        The default of 4 keeps tokens longer than three characters.

    Returns
    -------
    list of str
        The tokens that are long enough, in their original order.
    """
    return [token for token in tokens if len(token) >= min_length]


def keep_only_words_in_wordnet(tokens):
    """
    Keep the tokens for which ``wn.synsets`` returns a non-empty result.

    Order is preserved; tokens with no synsets are discarded.
    """
    recognised = []
    for tok in tokens:
        if wn.synsets(tok):
            recognised.append(tok)
    return recognised


def apply_lemmatize(tokens, wnl=WordNetLemmatizer()):
    """
    Lemmatize every token with ``wnl.lemmatize``.

    ``wnl`` defaults to a single ``WordNetLemmatizer`` created when the
    function is defined and reused across calls. Returns a new list with one
    entry per input token.
    """
    lemmatize = wnl.lemmatize
    return list(map(lemmatize, tokens))


def clean_text_clues(texts, stop_words=None):
    """
    Run the cleaning pipeline over every clue.

    For each clue the steps are, in order: replace hyphens, tokenize on
    whitespace, strip punctuation, lower-case (keeping alphabetic tokens only),
    remove stopwords, reduce to WordNet base forms, drop short tokens, keep
    only tokens known to WordNet, and lemmatize.

    Parameters
    ----------
    texts : iterable of str
        Raw clue texts.
    stop_words : iterable of str, optional
        Extra stopwords to remove. When omitted, the notebook-level
        ``stop_words`` variable is used if it exists (the previous behaviour);
        otherwise the list is built with ``make_stopwords()``. Passing it
        explicitly removes the dependency on cell execution order.

    Returns
    -------
    list of list of str
        One list of cleaned tokens per input clue, in input order.
    """
    if stop_words is None:
        # Backward-compatible fallback to the notebook-level variable.
        stop_words = globals().get('stop_words')
        if stop_words is None:
            stop_words = make_stopwords()

    clean_clues = []
    for clue in texts:
        clue = remove_hypens(clue)
        clue_i = tokenize_text(clue)
        clue_i = remove_characters_after_tokenization(clue_i)
        clue_i = convert_to_lowercase(clue_i)
        clue_i = remove_stopwords(clue_i, stop_words)
        clue_i = get_lemma(clue_i)
        clue_i = remove_short_tokens(clue_i)
        clue_i = keep_only_words_in_wordnet(clue_i)
        clue_i = apply_lemmatize(clue_i)
        clean_clues.append(clue_i)
    return clean_clues

In [None]:
# Combined stopword list (nltk English + custom file); read by the cleaning
# step and passed to the CountVectorizer below.
stop_words = make_stopwords()


In [None]:
# Clean every clue into a token list, then glue each list back into one
# space-separated string, which is the input format the vectorizer is fed.
clean_clues = clean_text_clues(text)
clean_clues_text = list(map(' '.join, clean_clues))


In [None]:
# Bag-of-words counts over unigrams and bigrams, with document-frequency
# bounds of min_df=10 and max_df=0.95.
count_vectorizer = CountVectorizer(
    min_df=10,
    max_df=0.95,
    ngram_range=(1, 2),
    stop_words=stop_words,
)
feature_matrix = count_vectorizer.fit_transform(clean_clues_text)

In [None]:
# Baseline LDA with 10 topics; the seed is fixed so the fit is repeatable.
lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=10,
    learning_method='online',
    random_state=43,
    batch_size=128,
    evaluate_every=-1,
    n_jobs=-1,
)

In [None]:
# Fit the baseline model on the count matrix and keep the transformed output
# (used later as a documents x topics matrix).
lda_output = lda_model.fit_transform(feature_matrix)

In [None]:
# Show the transformed output followed by its shape.
display(lda_output, lda_output.shape)

In [None]:
# Log-likelihood score of the baseline model on the data it was fitted on.
lda_model.score(feature_matrix)


In [None]:
# Perplexity of the baseline model on the data it was fitted on.
lda_model.perplexity(feature_matrix)

In [None]:
# TODO: add more hyperparameters to the grid search.
# Candidate values tried for each LDA hyperparameter.
search_params = dict(
    n_components=[10, 12, 13, 15, 20, 25],
    learning_decay=[.5, .7, .9],
)

In [None]:
# Base estimator for the grid search. The seed is fixed (same value as the
# baseline model) so that the search results are reproducible across runs;
# without it every candidate starts from a different random initialisation.
lda = LatentDirichletAllocation(random_state=43)
# Exhaustive search over every combination in search_params.
model = GridSearchCV(lda, search_params)


In [None]:
# Run the grid search on the count matrix (fits one LDA per parameter
# combination and CV split, so this cell is expensive).
model.fit(feature_matrix)

In [None]:
# Estimator selected by the grid search; used for all analysis below.
best_lda_model = model.best_estimator_


In [None]:
# Summarise the grid-search winner: chosen parameters, its score, and its
# perplexity on the full count matrix.
best_model_summary = (
    ("Best model's params: ", model.best_params_),
    ("Best log likelihood score: ", model.best_score_),
    ("Model perplexity: ", best_lda_model.perplexity(feature_matrix)),
)
for label, value in best_model_summary:
    print(label, value)

In [None]:
# Tabulate the grid-search results and persist them as CSV (with a header
# row, without the index column) so the search need not be re-run to inspect them.
df_cv_results = pd.DataFrame(model.cv_results_)
df_cv_results.to_csv("../data/LDAGridSearchResults.csv", header=True, index=False, encoding='utf-8')

In [None]:
# Mean test score per number of topics, one line per learning_decay value.
sns.pointplot(x="param_n_components", y="mean_test_score", hue="param_learning_decay", data=df_cv_results)

In [None]:
# Display the selected estimator; its repr shows the best hyperparameters for LDA.
best_lda_model

In [None]:
# Reference only: edit these hyperparameters to match the best_lda_model repr above.
# NOTE: the estimator built here is displayed but not assigned to any name,
# so it has no effect on the cells that follow.
LatentDirichletAllocation(learning_decay=.7,
             learning_method="batch", learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [None]:
# Transform the count matrix with the selected model.
# NOTE: this rebinds lda_output, replacing the baseline model's output from earlier.
lda_output = best_lda_model.transform(feature_matrix)

In [None]:
# Inspect the transformed output of the selected model.
lda_output

In [None]:
# Column labels: one per topic of the selected model.
topicnames = ['Topic_{}'.format(i) for i in range(best_lda_model.n_components)]

# Row labels: one per input clue.
docnames = ['Doc_{}'.format(i) for i in range(len(text))]

# Documents x topics table, with values rounded to two decimals.
df_document_topic = pd.DataFrame(
    np.round(lda_output, 2),
    index=docnames,
    columns=topicnames,
)

df_document_topic.head()

In [None]:
# Dominant topic: position of the largest value among the topic columns.
# Restricting the argmax to `topicnames` keeps this cell idempotent; taking it
# over all columns would, on a re-run, include the 'dominant_topic' column
# itself and return its position instead of a real topic.
df_document_topic['dominant_topic'] = np.argmax(df_document_topic[topicnames].values, axis=1)
df_document_topic.head()

In [None]:
# Number of documents per dominant topic. The column is passed by keyword:
# newer seaborn releases treat the first positional argument as `data`, not `x`.
sns.countplot(x="dominant_topic", data=df_document_topic)

In [None]:
# Interactive pyLDAvis panel for the selected model, laid out with mds='tsne'.
panel = pyLDAvis.sklearn.prepare(best_lda_model, feature_matrix, count_vectorizer, mds='tsne')
pyLDAvis.display(panel)

In [None]:
# Same pyLDAvis panel, laid out with mds='PCoA' for comparison.
# NOTE: this rebinds `panel`, replacing the t-SNE version from the previous cell.
panel = pyLDAvis.sklearn.prepare(best_lda_model, feature_matrix, count_vectorizer, mds='PCoA')
pyLDAvis.display(panel)

In [None]:
# components_ holds the topic-to-word weights; below it is labelled with one
# row per topic and one column per vocabulary term.
best_lda_model.components_.shape

In [None]:
# Shape of the count matrix produced by the vectorizer.
feature_matrix.shape

In [None]:
# Vocabulary terms in column order. get_feature_names() no longer exists in
# recent scikit-learn releases, so prefer get_feature_names_out() when the
# installed version provides it and fall back to the old name otherwise.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# Topic - Keyword matrix: one row per topic, one column per vocabulary term.
df_topic_keywords = pd.DataFrame(best_lda_model.components_)

# assign column and index
df_topic_keywords.columns = feature_names
df_topic_keywords.index = topicnames


# preview the first 10 term columns for every topic
df_topic_keywords.iloc[:,:10]

In [None]:
# Show top n keywords for each topic
def show_topics(vectorizer=count_vectorizer, lda_model=best_lda_model, n_words=20):
    """
    Return the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Supplies the vocabulary via ``get_feature_names_out()`` (or the older
        ``get_feature_names()`` when that is all the installed scikit-learn
        offers). Defaults to the notebook's ``count_vectorizer`` as bound when
        this function was defined.
    lda_model : fitted LDA model
        Its ``components_`` rows are the per-topic term weights. Defaults to
        the notebook's ``best_lda_model`` as bound when this function was
        defined.
    n_words : int, optional
        Number of top terms to return per topic.

    Returns
    -------
    list of numpy.ndarray
        One array per topic, holding its ``n_words`` terms in order of
        decreasing weight.
    """
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # use the replacement when available and keep the old call as a fallback.
    if hasattr(vectorizer, 'get_feature_names_out'):
        keywords = np.asarray(vectorizer.get_feature_names_out())
    else:
        keywords = np.array(vectorizer.get_feature_names())

    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negating the weights makes argsort order them from largest to smallest.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

In [None]:
# Top 20 keywords per topic for the selected model.
topic_keywords = show_topics(count_vectorizer, best_lda_model, 20)

In [None]:
# Inspect the raw per-topic keyword arrays.
topic_keywords

In [None]:
# Topic - Keywords table: one row per topic, one column per keyword rank.
# NOTE: this rebinds df_topic_keywords, replacing the topic-by-term weight
# matrix that carried the same name earlier.
df_topic_keywords = pd.DataFrame(topic_keywords)
n_topic_rows, n_word_cols = df_topic_keywords.shape
df_topic_keywords.columns = ['Word {}'.format(i) for i in range(n_word_cols)]
df_topic_keywords.index = ['Topic {}'.format(i) for i in range(n_topic_rows)]
df_topic_keywords