In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import re
import wrangle

pd.options.display.max_colwidth = None
pd.options.display.max_columns = None
pd.options.display.max_rows = None

import warnings
warnings.filterwarnings('ignore')

plt.rc('figure', figsize=(20,10))

import nltk

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
# pip install pyLDAvis
import pyLDAvis
import pyLDAvis.sklearn

import pickle

In [15]:
hotel = wrangle.wrangle_hotel()

Using cached file...


# NLP: Topic Modeling

## SKLearn

### Positive Topics

In [16]:
data = hotel.positive_lemma.tolist()

In [17]:
# TF-IDF vectorizer for the positive-review lemmas.
# NOTE(review): LatentDirichletAllocation is usually fit on raw term counts, and
# CountVectorizer is imported above but never used — confirm TF-IDF weights are intended.
vectorizer = TfidfVectorizer(min_df=10,                        # ignore terms found in fewer than 10 documents
                            stop_words='english',             # drop scikit-learn's built-in English stop words
                            token_pattern='[a-zA-Z0-9]{3,}',  # a token is a run of 3+ ASCII letters/digits
                            ngram_range =(2,3))               # bigrams and trigrams only, no single words

In [18]:
data_vectorized = vectorizer.fit_transform(data)

In [19]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components = 7,
                                      learning_decay = 0.7,
                                      learning_method='online',   
                                      random_state=172,
                                      n_jobs = -1)

In [None]:
lda_output = lda_model.fit_transform(data_vectorized)

In [None]:
# with open('lda_output.csv', 'wb') as f:
#     pickle.dump(lda_output, f)
    
# with open('lda_output.csv', 'rb') as f:
#     lda_output = pickle.load(f)

In [None]:
# # Materialize the sparse data
# data_dense = data_vectorized.todense()

In [None]:
# # Compute Sparsicity = Percentage of Non-Zero cells
# print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

In [None]:
# Log likelihood of the positive corpus under the fitted model: higher is better
print("Log Likelihood: ", lda_model.score(data_vectorized))

In [None]:
# Perplexity: lower is better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

In [None]:
# See model parameters
# pprint(lda_model.get_params())

In [None]:
# Create Document - Topic Matrix (one row per document, one column per topic)
# NOTE(review): lda_output was already assigned from lda_model.fit_transform(data_vectorized)
# in an earlier cell; this call recomputes it from the fitted model — confirm the
# second pass is actually needed.
lda_output = lda_model.transform(data_vectorized)

In [None]:
# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

In [None]:
# index names
docnames = ["Doc" + str(i) for i in range(len(data))]

In [None]:
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

In [None]:
# Get dominant topic for each document.
# Only the topic-probability columns take part in the argmax: once this cell has run,
# the frame also carries the integer 'dominant_topic' column, and letting that column
# compete would corrupt the result whenever the cell is executed a second time.
dominant_topic = np.argmax(df_document_topic[topicnames].values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

In [None]:
dom_top = pd.DataFrame(dominant_topic)
dom_top.to_csv('dominant_topic.csv')

In [None]:
# Styling
def color_green(val):
    """Build the CSS text-colour rule for one cell of a styled DataFrame.

    Values strictly greater than 0.1 are rendered green, everything else black.
    """
    if val > .1:
        shade = 'green'
    else:
        shade = 'black'
    return 'color: {col}'.format(col=shade)

def make_bold(val):
    """Build the CSS font-weight rule for one cell of a styled DataFrame.

    Values strictly greater than 0.1 are rendered bold (700), everything else normal (400).
    """
    if val > .1:
        thickness = 700
    else:
        thickness = 400
    return 'font-weight: {weight}'.format(weight=thickness)

In [None]:
# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

In [None]:
df_document_topics

In [None]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")

In [None]:
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

In [None]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

In [None]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(lda_model.components_)

In [None]:
# Assign Column and Index
df_topic_keywords.columns = vectorizer.get_feature_names()
df_topic_keywords.index = topicnames

In [None]:
# View
df_topic_keywords.head()

In [None]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    """Return, for every topic, an array holding its highest-weighted terms.

    vectorizer -- fitted vectorizer; its get_feature_names() supplies the vocabulary.
    lda_model  -- fitted model; each row of components_ holds one topic's term weights.
    n_words    -- number of top terms to keep per topic.

    The result is a list with one array of terms per topic, heaviest term first.
    """
    vocabulary = np.array(vectorizer.get_feature_names())
    # Sorting the negated weights orders the term indices from heaviest to lightest.
    return [vocabulary.take((-weights).argsort()[:n_words])
            for weights in lda_model.components_]

In [None]:
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=15)        

In [None]:
# Topic - Keywords Dataframe: one row per topic, one column per keyword rank
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(rank) for rank in range(len(df_topic_keywords.columns))]
df_topic_keywords.index = ['Topic '+str(topic) for topic in range(len(df_topic_keywords.index))]
df_topic_keywords

In [None]:
df_topic_keywords.to_csv('pos_topic_keyword.csv')

In [None]:
# Construct the k-means clusters on the positive document-topic matrix
# NOTE(review): 15 clusters are requested although lda_model was built with
# n_components = 7 — confirm the cluster count is intentional.
from sklearn.cluster import KMeans
clusters = KMeans(n_clusters=15, random_state=100).fit_predict(lda_output)

In [None]:
# Build the Singular Value Decomposition(SVD) model
svd_model = TruncatedSVD(n_components=2)  # 2 components

In [None]:
lda_output_svd = svd_model.fit_transform(lda_output)

In [None]:
# X and Y axes of the plot using SVD decomposition
x = lda_output_svd[:, 0]
y = lda_output_svd[:, 1]

In [None]:
# Weights of each lda_output column for the two SVD components.
# lda_output has one column per topic (lda_model was built with n_components = 7), not 15.
print("Component's weights: \n", np.round(svd_model.components_, 2))

# Fraction of the variance in 'lda_output' explained by each of the two components
print("Perc of Variance Explained: \n", np.round(svd_model.explained_variance_ratio_, 2))

In [None]:
# Plot the positive-review documents in the 2-D SVD space, coloured by k-means cluster
plt.figure(figsize=(12, 12))
plt.scatter(x, y, c=clusters)
# x holds the first SVD component and y the second; the original called xlabel twice,
# which left the y-axis unlabelled and overwrote the x label.
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.title("Segregation of Topic Clusters")
plt.show()


### Negative Topics

In [None]:
negative_data = hotel.negative_lemma.tolist()

In [None]:
negative_vectorizer = TfidfVectorizer(min_df=10,
                             stop_words='english',
                             token_pattern='[a-zA-Z0-9]{3,}',
                                     ngram_range =(2,3))

In [None]:
negative_data_vectorized = negative_vectorizer.fit_transform(negative_data)

In [None]:
#Build LDA Model
negative_lda_model = LatentDirichletAllocation(n_components = 7,
                                               learning_decay = 0.7,
                                               learning_method='online',   
                                               random_state=172,
                                               n_jobs = -1)

In [None]:
negative_lda_output = negative_lda_model.fit_transform(negative_data_vectorized)

In [None]:
# with open('negative_lda_output.csv', 'wb') as f:
#     pickle.dump(negative_lda_output, f)
    
# with open('negative_lda_output.csv', 'rb') as f:
#     negative_lda_output = pickle.load(f)

In [None]:
# # Materialize the sparse data
# negative_data_dense = negative_data_vectorized.todense()

In [None]:
# # Compute Sparsicity = Percentage of Non-Zero cells
# print("Sparsicity: ", ((negative_data_dense > 0).sum()/negative_data_dense.size)*100, "%")

In [None]:
# Log likelihood of the negative corpus under the fitted model: higher is better
print("Log Likelihood: ", negative_lda_model.score(negative_data_vectorized))

In [None]:
# Perplexity: lower is better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", negative_lda_model.perplexity(negative_data_vectorized))

In [None]:
# # See model parameters
pprint(negative_lda_model.get_params())

In [None]:
# Create Document - Topic Matrix
negative_lda_output = negative_lda_model.transform(negative_data_vectorized)

In [None]:
# column names
negative_topicnames = ["Topic" + str(i) for i in range(negative_lda_model.n_components)]

In [None]:
# index names
negative_docnames = ["Doc" + str(i) for i in range(len(negative_data))]

In [None]:
# Make the pandas dataframe
negative_df_document_topic = pd.DataFrame(np.round(negative_lda_output, 2), columns=negative_topicnames, index=negative_docnames)

In [None]:
# negative_lda_output.head()

In [None]:
# Get dominant topic for each document.
# Only the topic-probability columns take part in the argmax: once this cell has run,
# the frame also carries the integer 'dominant_topic' column, and letting that column
# compete would corrupt the result whenever the cell is executed a second time.
negative_dominant_topic = np.argmax(negative_df_document_topic[negative_topicnames].values, axis=1)
negative_df_document_topic['dominant_topic'] = negative_dominant_topic

In [None]:
negative_dom_top = pd.DataFrame(negative_dominant_topic)
negative_dom_top.to_csv('negative_dominant_topic.csv')

In [None]:
# Styling
def color_green(val):
    """Return the CSS colour rule for a styled cell: green above 0.1, black otherwise.

    This repeats the identically named helper defined earlier in the notebook.
    """
    template = 'color: {col}'
    if val > .1:
        return template.format(col='green')
    return template.format(col='black')

def make_bold(val):
    """Return the CSS font-weight rule for a styled cell: 700 above 0.1, 400 otherwise.

    This repeats the identically named helper defined earlier in the notebook.
    """
    template = 'font-weight: {weight}'
    if val > .1:
        return template.format(weight=700)
    return template.format(weight=400)

In [None]:
# Apply Style
negative_df_document_topics = negative_df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
negative_df_document_topics

In [None]:
negative_df_topic_distribution = negative_df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")

In [None]:
negative_df_topic_distribution.columns = ['Topic Num', 'Num Documents']
negative_df_topic_distribution

In [None]:
pyLDAvis.enable_notebook()
negative_panel = pyLDAvis.sklearn.prepare(negative_lda_model, negative_data_vectorized, negative_vectorizer, mds='tsne')
negative_panel

In [None]:
# Topic-Keyword Matrix
negative_df_topic_keywords = pd.DataFrame(negative_lda_model.components_)

In [None]:
# Assign Column and Index
negative_df_topic_keywords.columns = negative_vectorizer.get_feature_names()
negative_df_topic_keywords.index = negative_topicnames

In [None]:
# View
negative_df_topic_keywords.head()

In [None]:
# Show top n keywords for each topic
def show_topics(vectorizer=negative_vectorizer, lda_model=negative_lda_model, n_words=20):
    """Return, for every topic, an array holding its highest-weighted terms.

    vectorizer -- fitted vectorizer; its get_feature_names() supplies the vocabulary.
    lda_model  -- fitted model; each row of components_ holds one topic's term weights.
    n_words    -- number of top terms to keep per topic.

    The result is a list with one array of terms per topic, heaviest term first.

    Note: this redefinition replaces the earlier show_topics in the notebook namespace;
    only the defaults differ (they point at the negative-review objects). The body now
    honours the arguments it is given instead of always reading the negative globals.
    """
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Sorting the negated weights orders the term indices from heaviest to lightest.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

In [None]:
negative_topic_keywords = show_topics(vectorizer=negative_vectorizer, lda_model=negative_lda_model, n_words=15)        

In [None]:
# Topic - Keywords Dataframe: one row per topic, one column per keyword rank
negative_df_topic_keywords = pd.DataFrame(negative_topic_keywords)
negative_df_topic_keywords.columns = ['Word '+str(rank) for rank in range(len(negative_df_topic_keywords.columns))]
negative_df_topic_keywords.index = ['Topic '+str(topic) for topic in range(len(negative_df_topic_keywords.index))]
negative_df_topic_keywords

In [None]:
neg_top_key = pd.DataFrame(negative_df_topic_keywords)
neg_top_key.to_csv('negative_topic_keywords.csv')

In [None]:
# Construct the k-means clusters on the negative document-topic matrix
# NOTE(review): 15 clusters are requested although negative_lda_model was built with
# n_components = 7 — confirm the cluster count is intentional.
# KMeans is already imported by the equivalent positive-review cell earlier in the notebook.
from sklearn.cluster import KMeans
negative_clusters = KMeans(n_clusters=15, random_state=100).fit_predict(negative_lda_output)

In [None]:
# Build the Singular Value Decomposition(SVD) model
negative_svd_model = TruncatedSVD(n_components=2)  # 2 components
negative_lda_output_svd = negative_svd_model.fit_transform(negative_lda_output)

In [None]:
# X and Y axes of the plot using SVD decomposition
negative_x = negative_lda_output_svd[:, 0]
negative_y = negative_lda_output_svd[:, 1]

In [None]:
# Weights of each negative_lda_output column for the two SVD components.
# negative_lda_output has one column per topic (negative_lda_model was built with
# n_components = 7), not 15.
print("Component's weights: \n", np.round(negative_svd_model.components_, 2))

# Fraction of the variance in 'negative_lda_output' explained by each of the two components
print("Perc of Variance Explained: \n", np.round(negative_svd_model.explained_variance_ratio_, 2))

In [None]:
# Plot the negative-review documents in the 2-D SVD space, coloured by k-means cluster
plt.figure(figsize=(12, 12))
# Colour by the clusters fitted on negative_lda_output; the original passed the
# positive-review `clusters`, which labels a different set of documents.
plt.scatter(negative_x, negative_y, c=negative_clusters)
# negative_x holds the first SVD component and negative_y the second; the original
# called xlabel twice, which left the y-axis unlabelled.
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.title("Segregation of Topic Clusters")
plt.show()

## Positive Grid Search

In [13]:
# Define Search Param: 3 topic counts x 3 learning-decay values = 9 candidate models
search_params = {'n_components': [3,5,7], 'learning_decay':[0.5,0.7,0.9]}

# Init the Model (same learning method and seed as the single LDA model fitted above)
lda = LatentDirichletAllocation(learning_method='online',
                                random_state=172,
                                n_jobs = -1)
    
# Init Grid Search Class
# No `scoring` or `cv` is passed, so GridSearchCV falls back to its defaults.
positive_model = GridSearchCV(lda, param_grid=search_params)
    
# Do the Grid Search
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so the search
# was not completed in the saved run — the cells that read positive_model need a full fit.
positive_model.fit(data_vectorized)

KeyboardInterrupt: 

In [None]:
# Best Model
# The fitted search object for the positive reviews is `positive_model`; the original
# cell read an undefined `model` and an undefined `best_lda_model`.
best_positive_lda_model = positive_model.best_estimator_

# Model Parameters
print("Best Model's Params: ", positive_model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", positive_model.best_score_)

# Perplexity
print("Model Perplexity: ", best_positive_lda_model.perplexity(data_vectorized))

In [None]:
# Get log-likelihoods from the grid search output.
# GridSearchCV reports its results through `cv_results_`; the `grid_scores_` attribute
# used originally has been removed from scikit-learn.
n_topics = [3,5,7]
positive_cv_results = positive_model.cv_results_

# Mean cross-validated score keyed by (learning_decay, n_components)
positive_mean_scores = {
    (params['learning_decay'], params['n_components']): score
    for params, score in zip(positive_cv_results['params'], positive_cv_results['mean_test_score'])
}
log_likelyhoods_5 = [round(positive_mean_scores[(0.5, k)]) for k in n_topics]
log_likelyhoods_7 = [round(positive_mean_scores[(0.7, k)]) for k in n_topics]
log_likelyhoods_9 = [round(positive_mean_scores[(0.9, k)]) for k in n_topics]

# Show graph: one line per learning decay, log-likelihood against number of topics
plt.figure(figsize=(12, 8))
plt.plot(n_topics, log_likelyhoods_5, label='0.5')
plt.plot(n_topics, log_likelyhoods_7, label='0.7')
plt.plot(n_topics, log_likelyhoods_9, label='0.9')
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()

## Negative Grid Search

In [None]:
# Define Search Param: 3 topic counts x 3 learning-decay values = 9 candidate models
search_params = {'n_components': [3,5,7],'learning_decay':[0.5,0.7,0.9]}

# Init the Model
negative_lda = LatentDirichletAllocation(learning_method='online',
                                         random_state=172,
                                         n_jobs = -1)

# Init Grid Search Class
negative_model = GridSearchCV(negative_lda, param_grid=search_params)

# Do the Grid Search
negative_model.fit(negative_data_vectorized)

# Persist the *fitted* search. The original pickled and reloaded the object before
# calling fit, so the file only ever held an unfitted GridSearchCV.
# NOTE(review): the file contains a pickle despite its .csv extension; the name is kept
# so existing paths keep working. Only unpickle files this notebook wrote itself.
with open('negative_grid.csv', 'wb') as f:
    pickle.dump(negative_model, f)

# To reuse the fitted search later without re-running it:
# with open('negative_grid.csv', 'rb') as f:
#     negative_model = pickle.load(f)

In [None]:
# Best Model: the estimator GridSearchCV selected for the negative reviews
best_negative_lda_model = negative_model.best_estimator_

# Model Parameters
print("Best Model's Params: ", negative_model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", negative_model.best_score_)

# Perplexity of the selected model on the negative-review matrix it was searched on
print("Model Perplexity: ", best_negative_lda_model.perplexity(negative_data_vectorized))

In [None]:
# Get log-likelihoods from the grid search output.
# GridSearchCV reports its results through `cv_results_`; the `grid_scores_` attribute
# used originally has been removed from scikit-learn.
n_topics = [3,5,7]
negative_cv_results = negative_model.cv_results_

# Mean cross-validated score keyed by (learning_decay, n_components)
negative_mean_scores = {
    (params['learning_decay'], params['n_components']): score
    for params, score in zip(negative_cv_results['params'], negative_cv_results['mean_test_score'])
}
log_likelyhoods_5 = [round(negative_mean_scores[(0.5, k)]) for k in n_topics]
log_likelyhoods_7 = [round(negative_mean_scores[(0.7, k)]) for k in n_topics]
log_likelyhoods_9 = [round(negative_mean_scores[(0.9, k)]) for k in n_topics]

# Show graph: one line per learning decay, log-likelihood against number of topics
plt.figure(figsize=(12, 8))
plt.plot(n_topics, log_likelyhoods_5, label='0.5')
plt.plot(n_topics, log_likelyhoods_7, label='0.7')
plt.plot(n_topics, log_likelyhoods_9, label='0.9')
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()

# Gensim

In [None]:
# from pprint import pprint

# # Gensim
# import gensim
# from gensim.models.ldamodel import LdaModel
# import gensim.corpora as corpora
# from gensim.utils import simple_preprocess
# from gensim.models import CoherenceModel

# # Plotting tools
# import pyLDAvis
# import pyLDAvis.gensim_models  # don't skip this

In [None]:
# data = hotel.positive_lemma.values.tolist()

In [None]:
# def sent_to_words(sentences):
#     for sentence in sentences:
#         yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

# data_words = list(sent_to_words(data))

# print(data_words[:1])

In [None]:
# # Create Dictionary
# id2word = corpora.Dictionary(data_words)

# # Create Corpus
# texts = data_words

# # Term Document Frequency
# corpus = [id2word.doc2bow(text) for text in texts]

# # View
# print(corpus[:1])

In [None]:
# id2word[0]

In [None]:
# Human readable format of corpus (term-frequency)
# [[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

In [None]:
# Build LDA model
# lda_model = LdaModel(corpus=corpus,id2word=id2word,
#                     num_topics=10, random_state=100)

In [None]:
# Print the Keyword in the 10 topics
# pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

In [None]:
# # Compute Perplexity
# print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# # Compute Coherence Score
# coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word)
# coherence_lda = coherence_model_lda.get_coherence()
# print('\nCoherence Score: ', coherence_lda)

In [None]:
# Visualize the topics
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
# vis