# Introduction to topic modeling with gensim and spaCy

In [None]:
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import spacy

import numpy as np
import pandas as pd
import os

In [None]:
# Load spaCy's small English pipeline.
nlp = spacy.load('en_core_web_sm')

# Intent: do not split words that contain an apostrophe. To that end, drop
# every tokenizer special-case rule whose key contains a straight or curly
# apostrophe character.
nlp.tokenizer.rules = {
    pattern: special_case
    for pattern, special_case in nlp.tokenizer.rules.items()
    if not any(mark in pattern for mark in ("'", "’", "‘"))
}

In [None]:
# Folder holding the corpus: every file in it is read as one UTF-8 text document.
path_to_folder = 'data/topic_models/'

In [None]:
# Collect the corpus files in a deterministic (sorted) order.
# Only regular files are kept: os.listdir() also returns sub-directories
# (for example Jupyter's .ipynb_checkpoints), and opening one of those as a
# text file later on would raise an error.
path_to_files = sorted(
    path
    for path in (os.path.join(path_to_folder, name) for name in os.listdir(path_to_folder))
    if os.path.isfile(path)
)

#### Preprocessing the corpus

In [None]:
# Extra stop words, removed in addition to spaCy's built-in list.
# Add as many entries as necessary (they are compared against lemmas).
extra_stop = [
    'mr',
    'ms',
    'mrs',
    'hon',
]

In [None]:
# Turn every file into a list of lemmas (one list per document).
tokenized_corpus = []

for my_file in path_to_files:

    # Read the whole file at once and flatten line breaks into spaces.
    # The file is closed as soon as it is read, before the slow NLP step.
    with open(my_file, encoding='utf-8') as f:
        raw_text = f.read().replace('\n', ' ')

    # Create the spaCy doc object with the text all in lowercase.
    doc = nlp(raw_text.lower())

    # Filter tokens and lemmatize. Whitespace-only tokens are dropped too:
    # replacing newlines with spaces can leave runs of spaces, which spaCy
    # emits as separate tokens that are neither stop words nor punctuation
    # and would otherwise end up in the vocabulary.
    lemmas = [
        word.lemma_
        for word in doc
        if not (word.is_stop or word.is_punct or word.is_space or word.like_num)
        and word.lemma_ not in extra_stop
    ]

    tokenized_corpus.append(lemmas)

#### Turning the corpus into bags of words

In [None]:
# Build the vocabulary that maps each token of the corpus to an integer id.
words_id = corpora.Dictionary(tokenized_corpus)

# Convert every tokenized document into its bag-of-words representation.
corpus = list(map(words_id.doc2bow, tokenized_corpus))

##### Checking coherence score

In [None]:
# Train one LDA model per candidate number of topics (k_init..k_final,
# inclusive) and print fit metrics to help choose an "optimal" number of topics.
k_init = 5
k_final = 15
for k in range(k_init, k_final + 1):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=words_id,
                                                num_topics=k,
                                                random_state=50,
                                                passes=20,
                                                per_word_topics=True)

    # log_perplexity() returns the per-word likelihood bound, not the
    # perplexity itself: for the bound, higher (closer to zero) is better.
    # Per the gensim documentation, perplexity = 2 ** (-bound); for that
    # value, lower is better. Note both are measured on the training corpus.
    per_lda = lda_model.log_perplexity(corpus)
    perplexity_lda = 2 ** (-per_lda)

    # c_v coherence score: higher is better.
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=tokenized_corpus,
                                         dictionary=words_id,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    print('k={} bound={:.4f} perplexity={:.2f} coherence={:.4f}'.format(
        k, per_lda, perplexity_lda, coherence_lda))

##### Running the LDA model for the number of topics with the highest coherence score

In [None]:
# Fit the LDA model with 15 topics on the bag-of-words corpus.
# random_state is fixed so that repeated runs start from the same seed.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=words_id,
    num_topics=15,
    passes=20,
    random_state=50,
    per_word_topics=True,
)

##### Topic composition

In [None]:
# Display up to 15 topics of the fitted model, with 10 words per topic.
lda_model.show_topics(num_words=10,num_topics=15)