# Topic modelling with Gensim : the LDA algorithm

LDA is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar.

### 1. Loading of the required libraries and vocabularies.
***



In [1]:
import gensim
import pandas as pd

import spacy
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

import nltk
import re
import pprint
import os
import sys


from gensim import corpora

import nltk
from nltk import sent_tokenize
from nltk.corpus import stopwords

##nltk.download('stopwords')
##stop_words = stopwords.words('english')

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

from pprint import pprint



### 2. Reading and pre-processing input data.
***

File _concepts_4_81.xlsx_ was created during the crawling and scraping process on the SE Glossary articles. In future versions it will be **replaced by direct access to the database**. 

We read this file and process the column _definition_. We drop duplicates (in the definition or the title) and then apply some cleansing steps: replace
anything that is not a letter or a number or a dot by a space, delete multiple spaces, etc. 


In [2]:



df = pd.read_excel ('concepts_4_81.xlsx') 

df = df.drop_duplicates(subset=["definition"])
df = df.dropna(axis=0,subset=["definition"])
df = df.drop_duplicates(subset=["title"])
df = df.dropna(axis=0,subset=["title"])

df.reset_index(drop=True, inplace=True)

df['definition'] = df['definition'].apply(lambda x: re.sub("[^a-z\\.A-Z]"," ",str(x)))   ## Remove anything that is not a letter or a number or a dot
df['definition'] = df['definition'].apply(lambda x: re.sub(" +"," ",str(x)))             ## Remove duplicate spaces    
df['definition'] = df['definition'].apply(lambda x: re.sub('^ +| +$', '',x))   ## remove start and end spaces
df['definition'] = df['definition'].apply(lambda x: re.sub(' , ',', ',x))   ## space-comma-space -> comma-space

df



Unnamed: 0,definition,categories,title
0,The abroad sector consists of all institutions...,"['Glossary', 'Science and technology glossary']",Abroad sector - R & D
1,An acidification gas or acid gas is a natural ...,"['Environment glossary', 'Glossary']",Acidification gas
2,The annual average growth rate abbreviated as ...,"['Glossary', 'Statistical concept']",Annual average growth rate (AAGR)
3,Accidents to one or more persons that are eith...,"['Glossary', 'Statistical indicator', 'Transpo...",Accidents to persons caused by rolling stock i...
4,Accrual recording is the recording of the valu...,"['Glossary', 'Short-term business statistics g...",Accrual recording
...,...,...,...
1281,The administrative source is the register of u...,"['Economy and finance glossary', 'Glossary', '...",Administrative source
1282,Activity limitation is a dimension of health d...,"['Glossary', 'Health glossary']",Activity limitation
1283,Activity rate is the percentage of active pers...,"['Economy and finance glossary', 'Glossary', '...",Activity rate
1284,An active enterprise is an enterprise that had...,"['Economy and finance glossary', 'Glossary', '...",Active enterprise


### 3. Pre-processing input data (cont).
***

Next we tokenize the texts - definitions in the articles, select tokens with minimum length 5, delete stop words and apply a simple pre-processing (convert to lowercase, drop accents). 

The result, _texts_ is a list with 1286 elements corresponding to the records in the dataframe _df_.

In [3]:
#Tokenize texts and clean-up text.
from gensim.parsing.preprocessing import remove_stopwords
def sent_to_words(sentences):
    for sentence in sentences:
        sentence = remove_stopwords(sentence) ## remove stop words
        tokens = gensim.utils.tokenize(sentence)
        sentence = [token for token in tokens if len(token) >= 5] ##minimum length = 5 
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # to lower + deacc=True removes punctuations
        
texts = list(sent_to_words(df['definition']))
print(texts)

len(texts)




1286

### 4. Creation of corpus and terms frequencies.
***

Next we create:
* a dictionary from _texts_ with name _id2word_. This has 7258 unique tokens. 
* a mapping with name _corpus_ of the texts into lists with tuples: (word id, frequency in each text)


In [25]:
#Create Dictionary
id2word = corpora.Dictionary(texts) #Gensim creates a unique id for each word in the document. 
print(id2word,'\n')

## The produced corpus shown above is a mapping of (word_id, word_frequency).

#Alternatively:
corpus = [id2word.doc2bow(text) for text in texts] #corpus package automatically creates a set of corpus reader instances that can be used to access the corpora in the NLTK data package.

print('First 10 texts:\n')
print(corpus[:10])
print('\nTotal texts: ',len(corpus))

#Human readable format of corpus (term-frequency)
#[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

Dictionary(7258 unique tokens: ['abroad', 'acquired', 'aircraft', 'borders', 'business']...) 

First 10 texts:

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1)], [(29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1)], [(36, 1), (37, 1), (38, 1), (39, 4), (40, 2), (41, 2), (42, 1), (43, 4), (44, 1), (45, 1), (46, 2), (47, 1), (48, 1), (49, 1), (50, 1), (51, 3), (52, 1), (53, 1), (54, 1), (55, 1), (56, 2), (57, 1), (58, 2), (59, 1), (60, 1), (61, 1), (62, 2), (63, 1), (64, 2), (65, 2)], [(28, 2), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 2), (76, 2), (77, 1), (78, 2)], [(62, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1)], [(2, 1), (13, 2), (21, 1), (33, 1), (49, 3), (66, 29), (69, 1), (71, 2), (75, 2), (76,

### 5. Dominant topic in each document.
***



In [6]:
#What is the Dominant topic and its percentage contribution in each document.

def format_topics_sentences(ldamodel=None, corpus=corpus):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [7]:
#Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [27]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus)
df_topic_sents_keywords.rename(columns = {0:'Text tokenized definition'}, inplace = True)
df_topic_sents_keywords


Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text tokenized definition
0,18.0,0.2072,"regions, level, urban, common, population, cri...","[abroad, sector, consists, institutions, indiv..."
1,12.0,0.3331,"crops, training, permanent, normally, arable, ...","[acidification, natural, mixture, containing, ..."
2,7.0,0.7678,"number, period, persons, labour, reference, ye...","[annual, average, growth, abbreviated, accurat..."
3,17.0,0.2573,"services, goods, including, example, defined, ...","[accidents, persons, railway, vehicle, object,..."
4,0.0,0.5887,"income, current, account, housing, assets, fin...","[accrual, recording, recording, value, purchas..."
...,...,...,...,...
1281,17.0,0.2850,"services, goods, including, example, defined, ...","[administrative, source, register, units, asso..."
1282,9.0,0.5595,"enterprise, enterprises, health, activities, s...","[activity, limitation, dimension, health, disa..."
1283,7.0,0.2973,"number, period, persons, labour, reference, ye...","[activity, percentage, active, persons, relati..."
1284,7.0,0.2229,"number, period, persons, labour, reference, ye...","[active, enterprise, enterprise, turnover, emp..."


### 6. Most representative document for each topic.
***

**Need to add the document id**


In [10]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)


In [28]:
#Find the most representative document for each topic
# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Topic Keywords", "Text tokenized definition"]

# Show
sent_topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Topic Keywords,Text tokenized definition
0,0.0,0.7399,"income, current, account, housing, assets, fin...","[distinguish, different, types, housing, slatt..."
1,1.0,0.6956,"accounts, financial, market, protection, econo...","[european, environmental, economic, accounts, ..."
2,2.0,0.665,"agricultural, holding, production, livestock, ...","[holder, agricultural, holding, natural, perso..."
3,3.0,0.7714,"european, member, union, states, economic, sta...","[european, council, refers, regular, meetings,..."
4,4.0,0.6889,"january, united, countries, states, currency, ...","[european, union, abbreviated, economic, polit..."
5,5.0,0.6128,"commission, surface, methods, region, refer, t...","[vineyards, refer, total, surface, vines, viti..."
6,6.0,0.4294,"industrial, plants, trees, dividing, density, ...","[grassland, areas, predominantly, covered, com..."
7,7.0,0.7718,"number, period, persons, labour, reference, ye...","[internet, number, people, accessing, internet..."
8,8.0,0.7597,"country, death, given, group, employed, countr...","[present, official, candidate, countries, memb..."
9,9.0,0.6079,"enterprise, enterprises, health, activities, s...","[minimum, european, health, module, general, q..."


### 7. The topics as a mix of keywords.
***

The above LDA model is built with 20 different topics where each topic is a combination of keywords and each keyword contributes with a certain weight to the topic.


In [12]:
#Print the Keyword in the 10 topics #FIRST VERSION

from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.081*"income" + 0.038*"current" + 0.035*"account" + 0.035*"housing" + '
  '0.031*"assets" + 0.030*"financial" + 0.026*"manure" + 0.022*"transfers" + '
  '0.020*"balance" + 0.018*"investment"'),
 (1,
  '0.050*"accounts" + 0.043*"financial" + 0.041*"market" + 0.036*"protection" '
  '+ 0.036*"economic" + 0.034*"national" + 0.027*"sector" + 0.026*"general" + '
  '0.026*"environmental" + 0.022*"economy"'),
 (2,
  '0.087*"agricultural" + 0.069*"holding" + 0.036*"production" + '
  '0.028*"livestock" + 0.026*"individuals" + 0.023*"family" + 0.020*"common" + '
  '0.019*"holdings" + 0.018*"holder" + 0.018*"buildings"'),
 (3,
  '0.131*"european" + 0.083*"member" + 0.074*"union" + 0.057*"states" + '
  '0.030*"economic" + 0.026*"state" + 0.022*"countries" + 0.021*"abbreviated" '
  '+ 0.019*"regulation" + 0.018*"policy"'),
 (4,
  '0.037*"january" + 0.036*"united" + 0.036*"countries" + 0.025*"states" + '
  '0.025*"currency" + 0.022*"germany" + 0.020*"nations" + 0.018*"agreement" + '
  '0.017

In [13]:
#Visualize the topics

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

In [14]:
#Topic distribution across documents

# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0.0,18.0,"regions, level, urban, common, population, cri...",69.0,0.0537
1.0,12.0,"crops, training, permanent, normally, arable, ...",82.0,0.0638
2.0,7.0,"number, period, persons, labour, reference, ye...",47.0,0.0365
3.0,17.0,"services, goods, including, example, defined, ...",107.0,0.0832
4.0,0.0,"income, current, account, housing, assets, fin...",31.0,0.0241
...,...,...,...,...
1281.0,17.0,"services, goods, including, example, defined, ...",,
1282.0,9.0,"enterprise, enterprises, health, activities, s...",,
1283.0,7.0,"number, period, persons, labour, reference, ye...",,
1284.0,7.0,"number, period, persons, labour, reference, ye...",,
