In [1]:
# Let's use SKLEARN library to model topic modelling with the dictionaries created
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import os

In [2]:
# Use this function to know what is inside the different folders
def update_folder_contents():
    manuals_pdf_folder = os.listdir(os.getcwd()+"/manuals_pdf")
    pdf_files = [f for f in manuals_pdf_folder if f.endswith('.pdf')]
    manuals_txt_folder = os.listdir(os.getcwd()+"/manuals_txt")
    txt_files = [f for f in manuals_txt_folder if f.endswith('.txt')]
    manuals_dict_folder = os.listdir(os.getcwd()+"/manuals_dict")
    dict_files = [f for f in manuals_dict_folder if f.endswith('.txt')]
    return pdf_files,txt_files,dict_files

In [3]:
# We need to build out doc_term_matrix : column are number of words, and rows are for a specific topic, 
# doc_term_matrix(j,i) is the occurences of a word i within the document j 
pdf_files, txt_files, dict_files = update_folder_contents()
my_words = []
for dict_file in dict_files : 
    with open('manuals_dict/'+dict_file,'r',encoding='utf-8') as filehandle:
        local_dict = eval(filehandle.readline())
        for word in local_dict: 
            if word not in my_words:
                my_words.append(word)
print("Matrix of size : "+str(len(dict_files))+","+str(len(my_words)))
doc_term_matrix = np.zeros((len(dict_files),len(my_words)))

Matrix of size : 68,5029


In [4]:
# We should remove the words that appear in less than 2 documents
# We should remove the words that appear in more that 80% of the documents
topic_index = 0
for dict_file in dict_files : 
    with open('manuals_dict/'+dict_file,'r',encoding='utf-8') as filehandle:
        local_dict = eval(filehandle.readline())
        for word in local_dict: 
            word_index = my_words.index(word)
            doc_term_matrix[topic_index][word_index] += 1
    topic_index += 1
# Let's see in how many topics a specific word appears : stored in words_appearance
words_appearance = []
for i in range(len(my_words)):
    words_appearance.append(0)
    for j in range (len(dict_files)):
        if doc_term_matrix[j][i] != 0:
            words_appearance[i] += 1
# How many words are not_used, how many are over_used ?
not_used = 0
over_used = 0
for word_cnt in words_appearance:
    if word_cnt < 2:
        not_used += 1
    elif word_cnt > 0.8*len(dict_files):
        over_used += 1 
print("There are "+str(not_used)+" not_used words")
print("There are "+str(over_used)+" over_used words")

There are 1786 not_used words
There are 10 over_used words


In [5]:
# Reduce the size of matrix by deleting the column (words) that are not used or over used : improves the learning
doc_term_matrix_2 = doc_term_matrix
for i in range(len(words_appearance)):
    if (words_appearance[len(words_appearance)-i-1] < 2 or words_appearance[len(words_appearance)-i-1] > 0.8*len(dict_files)):
        doc_term_matrix_2 = np.delete(doc_term_matrix_2,len(words_appearance)-i-1,axis=1)
print("Size of the new matrix of occurencies : "+str(doc_term_matrix_2.shape))
print("Size of the former matrix of occurencies : "+str(doc_term_matrix.shape))
# We are now ready to use topic modelling tools !

Size of the new matrix of occurencies : (68, 3233)
Size of the former matrix of occurencies : (68, 5029)


In [6]:
# Create the objct LDA as a topic modelling tool : n_components = 2 because we want two topics
LDA = LatentDirichletAllocation(n_components=2, random_state=42)
LDA.fit(doc_term_matrix_2)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=2, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [7]:
# let's havea look at the 20 most probable words for each topics : this will indicate how the topic selection is done
# If the lists are not consistent with the objective, then we went wrong on sorting the documents
for i in range (len(LDA.components_)):
    topic_words = LDA.components_[i]
    top_topic_word = topic_words.argsort()[-20:]
    top_20_words = []
    for num in top_topic_word:
        top_20_words.append(my_words[num])
    print(top_20_words)
    
# As we may see the topics are not consistent at all with our goal

# First, we may want to use a custom stop word, to avoid specific words that are not significant on clustering the topics
# See the Data_preprocessing_PDF_to_DICT notebook

# Or we may want to use pdf that are only including the setup and installation instructions and NO OTHERS
# I decided to try out a second test using only the pdf that include those

['included', 'full', 'indic', 'briefing', 'power', 'must', 'new', 'user', 'storage', 'led', 'unusable', 'below', 'icon', 'prompt', 'search', 'achievement', 'white', 'buy', 'unmatched', 'generation']
['restarts', 'content', 'screen', 'm', 'game', 'white', 'adapter', 'must', 'devices', 'central', 'calling', 'trigger', 'prompt', 'repeat', 'package', 'taa', 'user', 'cant', 'restrict', 'preview']


In [8]:
from sklearn.decomposition import NMF

In [9]:
nmf = NMF(n_components=2, random_state=42)
nmf.fit(doc_term_matrix_2)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=2, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [10]:
for i in range (len(nmf.components_)):
    topic_words = nmf.components_[i]
    top_topic_word = topic_words.argsort()[-20:]
    top_20_words_2 = []
    for num in top_topic_word:
        top_20_words_2.append(my_words[num])
    print(top_20_words_2)
    
# The vocabulary in associated with the topics are now a little more consistent with our goal.
# We might then think that sorting out the PDF, only including the setup and installation part might be required
# Plus, that will allow faster computations, and better learning

['sentence', 'calling', 'generation', 'storage', 'content', 'single', 'search', 'regulatory', 'white', 'user', 'override', 'video', 'wired', 'devices', 'ay', 'premium', 'unplugging', 'taa', 'restrict', 'cant']
['link', 'mainly', 'must', 'android', 'calling', 'adapter', 'led', 'search', 'white', 'pole', 'unresponsive', 'unusable', 'buy', 'neutral', 'august', 'icon', 'trigger', 'central', 'package', 'prompt']
