In [None]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [None]:
# Plotting tools
import pyLDAvis
#import pyLDAvis.gensim_models  # don't skip this

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.model_selection import GridSearchCV
from pprint import pprint

In [2]:
# Read the training set and the test set from disk into DataFrames
train_path, test_path = 'data/XYtr.csv', 'data/Xte.csv'
XYtr = pd.read_csv(train_path)
Xte = pd.read_csv(test_path)

In [3]:
# A missing description is replaced by the literal token 'None',
# applied identically to the training and test frames.
for frame in (XYtr, Xte):
    frame['description'] = frame['description'].fillna('None')

In [4]:
# Confirm that no missing descriptions remain after the fill step
train_missing = XYtr['description'].isna().sum()
test_missing = Xte['description'].isna().sum()
print('train description missing values: ', train_missing)
print('test description missing values: ', test_missing)

train description missing values:  0
test description missing values:  0


### Create a corpus

In [None]:
# Sanity check: vectorize one description to see the shape CountVectorizer produces.
# Scratch names (`sample_*`) are used on purpose so that running this cell, in any
# order, never overwrites the `corpus` / `vectorizer` objects used by the real pipeline.
sample_description = XYtr['description'][0]
print(len(sample_description.split()), 'whitespace-separated tokens in the first description')

# CountVectorizer expects an iterable of documents, not a bare string:
# https://stackoverflow.com/questions/49806790/iterable-over-raw-text-documents-expected-string-object-received
corpus_copy = [sample_description]
sample_vectorizer = CountVectorizer()
sample_matrix = sample_vectorizer.fit_transform(corpus_copy)

# (1 document, number of distinct tokens found in it)
sample_matrix.shape

### Create a corpus using training and test data

In [5]:
# Combined corpus: every training description first, then every test description
corpus = XYtr['description'].tolist() + Xte['description'].tolist()

In [6]:
# Total number of documents in the combined corpus
# (training descriptions followed by test descriptions; duplicates are counted)
len(corpus)

13828

### Create a Document-Word matrix

In [7]:
# https://www.geeksforgeeks.org/using-countvectorizer-to-extracting-features-from-text/
# CountVectorizer creates a matrix in which each unique word is represented by a column of the matrix,
# and each text sample from the document is a row in the matrix.
# The value of each cell is the count of the word in that particular text sample.

# Create a Vectorizer Object
# max_df=0.1: ignore tokens that appear in MORE than 10% of the documents (too common)
# min_df=2:   ignore tokens that appear in FEWER than 2 documents (i.e. in only one document)
vectorizer = CountVectorizer(max_df=0.1, min_df=2)

In [8]:
# Learn the vocabulary from the combined corpus and encode every document
# as a row of token counts (sparse document-word matrix)
corpus_vectorized = vectorizer.fit_transform(corpus)

In [9]:
# Vocabulary kept by the vectorizer, and how many tokens it contains
feature_names = vectorizer.get_feature_names_out()
print(feature_names)
print(len(feature_names))

['002n7' '00b0d' '00jhg' ... 'zztgg' 'zzvdf' 'zzw3j']
10566


In [10]:
# Shape of the sparse document-word matrix: (number of documents, vocabulary size)
corpus_vectorized.shape

(13828, 10566)

In [11]:
# Dense view of the count matrix, labelled with the vocabulary tokens as column names
count_array = corpus_vectorized.toarray()
corpus_df = pd.DataFrame(count_array, columns=vectorizer.get_feature_names_out())
corpus_df

Unnamed: 0,002n7,00b0d,00jhg,00ud9,00xck,01abs,01fnu,01jsj,01k0e,01nrz,...,zzhb3,zzht0,zzlz3,zznp1,zzns7,zzpvk,zzr1c,zztgg,zzvdf,zzw3j
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13823,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13824,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13825,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13826,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Check for matrix sparsity

In [12]:
# Expand the sparse count matrix into a dense one
corpus_dense = corpus_vectorized.todense()

# "Sparsicity" as reported here is the percentage of cells in the
# document-word matrix that hold a non-zero count
nonzero_cells = (corpus_dense > 0).sum()
print("Sparsicity: ", (nonzero_cells / corpus_dense.size) * 100, "%")

Sparsicity:  0.13566460028567626 %


### Select the best LDA model

In [13]:
# Hyper-parameter grid: number of topics x learning decay,
# with online variational Bayes and a fixed mini-batch size
search_params = {'n_components': [10, 15, 20, 25, 30],
                 'learning_decay': [.5, .7, .9],
                 'learning_method': ['online'],
                 'batch_size': [128]}

# Seed the estimator so every candidate is fitted reproducibly; without a
# random_state the selected parameters and scores can differ between runs.
# The seed matches the one used for the final model.
lda = LatentDirichletAllocation(random_state=100)

# Cross-validated search; with no explicit `scoring`, candidates are ranked by
# the estimator's own score method
model = GridSearchCV(lda, param_grid=search_params)

# Run the search on the document-word matrix
model.fit(corpus_vectorized)

GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'batch_size': [128], 'learning_decay': [0.5, 0.7, 0.9],
                         'learning_method': ['online'],
                         'n_components': [10, 15, 20, 25, 30]})

In [14]:
# Pull the winning estimator and its summary out of the finished grid search
best_lda_model = model.best_estimator_
best_params, best_score = model.best_params_, model.best_score_

# Parameters and mean cross-validated score of the selected candidate
print("Best Model's Params: ", best_params)
print("Best Log Likelihood Score: ", best_score)

# Perplexity of the selected model on the full document-word matrix
perplexity = best_lda_model.perplexity(corpus_vectorized)
print("Model Perplexity: ", perplexity)

Best Model's Params:  {'batch_size': 128, 'learning_decay': 0.9, 'learning_method': 'online', 'n_components': 10}
Best Log Likelihood Score:  -374253.16670947475
Model Perplexity:  1431.1616992744998


### Build an LDA model

In [15]:
# Number of topics for the final model.
# NOTE(review): hard-coded to the n_components value the grid search reported
# as best in the recorded output (10); update it if the search is re-run and
# selects a different value.
K = 10

In [17]:
# Settings for the final LDA model, gathered in one place
lda_settings = {
    'n_components': K,
    'learning_method': 'online',
    'learning_decay': 0.9,
    'random_state': 100,
    'batch_size': 128,
    'evaluate_every': -1,
    'n_jobs': -1,
}
lda_model = LatentDirichletAllocation(**lda_settings)

# Fit on the document-word matrix and keep the per-document topic distribution
lda_output = lda_model.fit_transform(corpus_vectorized)

In [18]:
# Shape of the document-topic matrix, followed by the fitted model's repr
print(lda_output.shape, lda_model, sep='\n')

(13828, 10)
LatentDirichletAllocation(learning_decay=0.9, learning_method='online',
                          n_jobs=-1, random_state=100)


In [19]:
# Document-topic matrix returned by fit_transform: one row per document, one column per topic
lda_output

array([[0.00555709, 0.00555559, 0.00556123, ..., 0.00555795, 0.94998592,
        0.00555582],
       [0.01666667, 0.01666667, 0.01666667, ..., 0.01666874, 0.68706151,
        0.01666667],
       [0.7       , 0.03333333, 0.03333333, ..., 0.03333333, 0.03333333,
        0.03333333],
       ...,
       [0.025     , 0.025     , 0.02500016, ..., 0.025     , 0.025     ,
        0.025     ],
       [0.00476227, 0.00476193, 0.00476195, ..., 0.00476203, 0.004762  ,
        0.00476203],
       [0.7       , 0.03333333, 0.03333333, ..., 0.03333333, 0.03333333,
        0.03333333]])

In [56]:
# Column names: one per topic actually present in `lda_output`.
# They are derived from the matrix itself rather than from the grid-search
# estimator, whose n_components need not equal the final model's.
topicnames = ["Topic" + str(i) for i in range(lda_output.shape[1])]

# The corpus was built as training descriptions followed by test descriptions,
# so the first len(XYtr) rows belong to the training set and the rest to the test set.
n_train = len(XYtr)
assert n_train + len(Xte) == lda_output.shape[0], "lda_output rows do not match train + test sizes"

# Make the pandas dataframes (topic weights rounded to 8 decimals)
train_document_topic = pd.DataFrame(np.round(lda_output[:n_train], 8), columns=topicnames)
test_document_topic = pd.DataFrame(np.round(lda_output[n_train:], 8), columns=topicnames)

In [57]:
# Topic weights for the training documents (one row per document)
train_document_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9
0,0.005557,0.005556,0.005561,0.005556,0.005558,0.005556,0.005556,0.005558,0.949986,0.005556
1,0.016667,0.016667,0.016667,0.179592,0.016678,0.016667,0.016667,0.016669,0.687062,0.016667
2,0.700000,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333
3,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.962499,0.004167
4,0.050000,0.050000,0.050000,0.550000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000
...,...,...,...,...,...,...,...,...,...,...
6909,0.115124,0.001961,0.001961,0.001961,0.001961,0.094700,0.001961,0.776450,0.001961,0.001961
6910,0.033333,0.699982,0.033333,0.033333,0.033351,0.033333,0.033333,0.033333,0.033333,0.033333
6911,0.003125,0.003125,0.003125,0.003125,0.003125,0.003125,0.003125,0.003125,0.971874,0.003125
6912,0.010000,0.510000,0.010001,0.164334,0.010001,0.010000,0.010000,0.010001,0.255661,0.010002


In [58]:
# Topic weights for the test documents (one row per document)
test_document_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9
0,0.009091,0.009091,0.009091,0.009091,0.009091,0.009091,0.009091,0.009091,0.009091,0.918180
1,0.050000,0.050000,0.050000,0.550000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000
2,0.128232,0.007693,0.186306,0.007693,0.007693,0.007695,0.007697,0.631605,0.007694,0.007692
3,0.003226,0.003227,0.796178,0.003226,0.003226,0.003226,0.003226,0.064850,0.046840,0.072775
4,0.012501,0.012500,0.012500,0.012500,0.887497,0.012500,0.012500,0.012501,0.012500,0.012500
...,...,...,...,...,...,...,...,...,...,...
6909,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.989024,0.001220,0.001220,0.001220
6910,0.002565,0.642044,0.182279,0.157726,0.002564,0.002564,0.002564,0.002564,0.002564,0.002564
6911,0.025000,0.025000,0.025000,0.025000,0.775000,0.025000,0.025000,0.025000,0.025000,0.025000
6912,0.004762,0.004762,0.004762,0.004762,0.957142,0.004762,0.004762,0.004762,0.004762,0.004762
