In [1]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [2]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [17]:
# Load the training table and the test table from their CSV files.
TRAIN_CSV, TEST_CSV = 'data/XYtr.csv', 'data/Xte.csv'
XYtr = pd.read_csv(TRAIN_CSV)
Xte = pd.read_csv(TEST_CSV)

### Make corpus and vocab

In [18]:
# Corpus preparation settings.
# K is passed as n_components when the LDA model is built further down.
K = 10

# Replace missing descriptions with the literal placeholder "NAN" in both tables.
for frame in (XYtr, Xte):
    frame['description'] = frame['description'].fillna("NAN")

In [19]:
# Combined document list: training descriptions first, test descriptions after.
corpus = [*XYtr['description'], *Xte['description']]

In [20]:
# Number of documents in the combined list (training descriptions plus test descriptions).
# Note: len() counts every entry, duplicates included; the linked answer is about counting unique values.
# https://stackoverflow.com/questions/12282232/how-do-i-count-unique-values-inside-a-list
len(corpus)

13828

In [None]:
# Sanity check: vectorize a single training description and inspect the
# dimensions CountVectorizer produces for one document.
first_description = XYtr['description'].iloc[0]

# CountVectorizer expects an iterable of documents, so the single string is wrapped in a list.
# https://stackoverflow.com/questions/49806790/iterable-over-raw-text-documents-expected-string-object-received
corpus_copy = [first_description]

# Dedicated names are used so this check never overwrites `corpus` or
# `vectorizer`, which the cells below rely on.
sample_vectorizer = CountVectorizer()
sample_counts = sample_vectorizer.fit_transform(corpus_copy)

# (whitespace-separated token count, shape of the one-document count matrix)
len(first_description.split()), sample_counts.shape

In [21]:
# https://www.geeksforgeeks.org/using-countvectorizer-to-extracting-features-from-text/
# CountVectorizer builds a matrix with one row per document and one column per
# distinct word; each cell holds how often that word occurs in that document.
vectorizer = CountVectorizer()

# The raw documents are rebuilt from the DataFrames instead of being read from
# `corpus`: this cell rebinds `corpus` to the count matrix, so reading it here
# would feed the matrix back into fit_transform whenever the cell is re-run.
documents = [*XYtr['description'], *Xte['description']]

# Encode the documents (training descriptions first, test descriptions after).
corpus = vectorizer.fit_transform(documents)

In [33]:
# Vocabulary size: how many feature names the fitted vectorizer exposes.
vectorizer.get_feature_names_out().shape[0]

14381

In [22]:
# Dimensions of the sparse count matrix: (documents, distinct words).
np.shape(corpus)

(13828, 14381)

In [23]:
# Build and fit the LDA model with K topics on the document-term matrix.
# LDA fitting is randomized; a fixed random_state makes the topics (and the
# features exported from them) reproducible across kernel restarts.
LDA_SEED = 0
lda = LatentDirichletAllocation(n_components=K, random_state=LDA_SEED)
lda.fit(corpus)

LatentDirichletAllocation()

In [24]:
# Document-topic matrix: one row per document, one column per LDA component.
topics = lda.transform(X=corpus)

In [25]:
# Display the array returned by lda.transform (numpy abbreviates large arrays when printing).
topics

array([[0.53212086, 0.00416747, 0.00416771, ..., 0.00416705, 0.00416767,
        0.18193814],
       [0.18868668, 0.01428692, 0.01428627, ..., 0.01428716, 0.01428645,
        0.69702068],
       [0.00357163, 0.00357257, 0.00357162, ..., 0.96785461, 0.00357161,
        0.00357163],
       ...,
       [0.00303045, 0.97272447, 0.00303075, ..., 0.00303103, 0.00303116,
        0.00303046],
       [0.00322623, 0.00322687, 0.05735878, ..., 0.00322637, 0.00322625,
        0.00322657],
       [0.00357163, 0.00357257, 0.00357162, ..., 0.96785461, 0.00357161,
        0.00357163]])

In [26]:
# Dimensions of the document-topic matrix: (documents, topics).
np.shape(topics)

(13828, 10)

In [27]:
# Dimensions of the training table: (rows, columns).
(len(XYtr.index), len(XYtr.columns))

(6914, 10)

In [28]:
# Tabular view of the document-topic matrix. Despite its name, `test` holds a
# row for every document that was vectorized (training and test descriptions).
test = pd.DataFrame.from_records(data=topics)

In [29]:
# Preview the first three rows of the topic table.
test.iloc[:3]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.532121,0.004167,0.004168,0.076116,0.004168,0.18482,0.004167,0.004167,0.004168,0.181938
1,0.188687,0.014287,0.014286,0.014287,0.014286,0.014286,0.014286,0.014287,0.014286,0.697021
2,0.003572,0.003573,0.003572,0.003572,0.003572,0.003572,0.003572,0.967855,0.003572,0.003572


In [None]:
# Number of training rows; used below as the offset where test rows begin in `topics`.
N = len(XYtr.index)

In [None]:
def write_topic_features(path, ids, topic_rows, n_topics):
    """Write per-document topic weights to a CSV file.

    The header line is ``id,FT0000,FT0001,...``; every following line holds
    one document id and its first ``n_topics`` weights formatted with ``%f``.

    Args:
        path: Destination file path (overwritten if it already exists).
        ids: Iterable of document ids, in row order.
        topic_rows: Iterable of indexable weight rows, aligned with ``ids``.
        n_topics: Number of weight columns written per row.
    """
    # `with` guarantees the file is closed even if a write fails part-way.
    with open(path, 'w') as fp:
        fp.write('id')
        fp.write(''.join(',FT%04d' % k for k in range(n_topics)))
        fp.write('\n')
        for doc_id, weights in zip(ids, topic_rows):
            fp.write('%s' % doc_id)
            fp.write(''.join(',%f' % weights[k] for k in range(n_topics)))
            fp.write('\n')


# `topics` stacks the training rows first and the test rows after them, so
# each table's slice is sized from that table itself (the test table is not
# assumed to have the same number of rows as the training table).
n_train, n_test = len(XYtr), len(Xte)
assert topics.shape[0] == n_train + n_test, "topics must have one row per train/test document"

# Ids are taken in positional order, so a non-default DataFrame index is fine.
write_topic_features('data/XYtr_ft.csv', XYtr['id'], topics[:n_train], K)
write_topic_features('data/Xte_ft.csv', Xte['id'], topics[n_train:], K)