In [23]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [24]:
# Plotting tools
import pyLDAvis
#import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD

In [36]:
# Load CSV to Dataframe
XYtr = pd.read_csv('data/XYtr.csv')
Xte = pd.read_csv('data/Xte.csv')

In [37]:
# description: use the token None to mean no description
XYtr['description'] = XYtr['description'].fillna('None')
Xte['description'] = Xte['description'].fillna('None')

In [38]:
print('train description missing values: ', XYtr['description'].isnull().sum())
print('test description missing values: ', Xte['description'].isnull().sum())

train description missing values:  0
test description missing values:  0


### Create a corpus

In [18]:
# Create corpus using training and test data
corpus = list(description_train)+list(description_test)
vectorizer = CountVectorizer()
corpus = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names_out()

In [82]:
# Length of the list (documents in this case)
# https://stackoverflow.com/questions/12282232/how-do-i-count-unique-values-inside-a-list
len(corpus)

13828

In [None]:
# verifying the dimension of creating a corpus
corpus_copy = XYtr['description'][0]
corpus_copy
len(corpus_copy.split())
# https://stackoverflow.com/questions/49806790/iterable-over-raw-text-documents-expected-string-object-received
corpus_copy=[corpus_copy]
vectorizer = CountVectorizer()
corpus = vectorizer.fit_transform(corpus_copy)

In [122]:
# Create corpus using training and test data
K = 10
corpus = list(XYtr['description'])+list(Xte['description'])

In [123]:
# https://www.geeksforgeeks.org/using-countvectorizer-to-extracting-features-from-text/
# CountVectorizer creates a matrix in which each unique word is represented by a column of the matrix, 
# and each text sample from the document is a row in the matrix. 
# The value of each cell is nothing but the count of the word in that particular text sample.

# Create a Vectorizer Object
# remove tokens that appear in 10% of the documents
# remove unique tokens that appear in, at most, 2 documents
vectorizer = CountVectorizer(max_df=0.1, min_df=2)

In [124]:
# Encode the documents i.e., encoded vectors
corpus = vectorizer.fit_transform(corpus)

In [126]:
len(vectorizer.get_feature_names_out())

10566

In [127]:
# a sparse matrix of documents (row) vs number of unique words
corpus.shape

(13828, 10566)

In [13]:
# build lda model
lda = LatentDirichletAllocation(n_components = K)
lda.fit(corpus)

LatentDirichletAllocation()

In [14]:
topics = lda.transform(corpus)

In [15]:
topics

array([[0.00416699, 0.00416852, 0.31898932, ..., 0.15739918, 0.07343788,
        0.00416713],
       [0.01428634, 0.01428764, 0.01428686, ..., 0.01428603, 0.18196175,
        0.01428695],
       [0.0035718 , 0.9678538 , 0.00357164, ..., 0.00357155, 0.00357161,
        0.00357162],
       ...,
       [0.00303074, 0.00303087, 0.00303063, ..., 0.00303038, 0.00303093,
        0.0030311 ],
       [0.0032263 , 0.00322613, 0.880355  , ..., 0.0938335 , 0.00322687,
        0.00322614],
       [0.0035718 , 0.9678538 , 0.00357164, ..., 0.00357155, 0.00357161,
        0.00357162]])

In [16]:
topics.shape

(13828, 10)

In [17]:
XYtr.shape

(6914, 10)

In [28]:
test = pd.DataFrame.from_records(topics)

In [29]:
test.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.532121,0.004167,0.004168,0.076116,0.004168,0.18482,0.004167,0.004167,0.004168,0.181938
1,0.188687,0.014287,0.014286,0.014287,0.014286,0.014286,0.014286,0.014287,0.014286,0.697021
2,0.003572,0.003573,0.003572,0.003572,0.003572,0.003572,0.003572,0.967855,0.003572,0.003572


In [None]:
N = XYtr.shape[0]