<a href="https://colab.research.google.com/github/Moshekwa/Text-Analytucs-Topic-Modelling-for-Power-BI/blob/main/Topic_Modeling_Using_Gensim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyLDAvis



In [3]:
# Run in python console
#!pip install nltk
import nltk; nltk.download('stopwords')

import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
#!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('all')

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Read In Kiva Dataset
df = pd.read_csv('/content/kiva.csv')
print(df.shape)
df

(6818, 7)


Unnamed: 0,country,en,gender,loan_amount,nonpayment,sector,status
0,Dominican Republic,"""Banco Esperanza"" is a group of 10 women looki...",F,1225,partner,Retail,0
1,Dominican Republic,"""Caminemos Hacia Adelante"" or ""Walking Forward...",F,1975,lender,Clothing,0
2,Dominican Republic,"""Creciendo Por La Union"" is a group of 10 peop...",F,2175,partner,Clothing,0
3,Dominican Republic,"""Cristo Vive"" (""Christ lives"" is a group of 10...",F,1425,partner,Clothing,0
4,Dominican Republic,"""Cristo Vive"" is a large group of 35 people, 2...",F,4025,partner,Food,0
...,...,...,...,...,...,...,...
6813,Kenya,Zipporah Wanjiku Wambu is 34 years old and a m...,F,550,lender,Food,1
6814,Kenya,Zirah Ateso Achara is 38 years old she is marr...,F,1000,lender,Food,1
6815,Kenya,Zuleah Amoit is a member in one of the active ...,F,325,lender,Retail,1
6816,Kenya,Zuleya is an active member of PEMCI. She has s...,F,450,partner,Retail,0



# Clean the data

In [None]:

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(text):
    stop_free = ' '.join([word for word in text.lower().split() if word not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ' '.join([lemma.lemmatize(word) for word in punc_free.split()])
    return normalized.split()

# DataFrame after cleaning

In [20]:
df['en_clean']=df['en'].apply(clean)

In [21]:
df

Unnamed: 0,country,en,gender,loan_amount,nonpayment,sector,status,en_clean
0,Dominican Republic,"""Banco Esperanza"" is a group of 10 women looki...",F,1225,partner,Retail,0,"[banco, esperanza, group, 10, woman, looking, ..."
1,Dominican Republic,"""Caminemos Hacia Adelante"" or ""Walking Forward...",F,1975,lender,Clothing,0,"[caminemos, hacia, adelante, walking, forward,..."
2,Dominican Republic,"""Creciendo Por La Union"" is a group of 10 peop...",F,2175,partner,Clothing,0,"[creciendo, por, la, union, group, 10, people,..."
3,Dominican Republic,"""Cristo Vive"" (""Christ lives"" is a group of 10...",F,1425,partner,Clothing,0,"[cristo, vive, christ, life, group, 10, woman,..."
4,Dominican Republic,"""Cristo Vive"" is a large group of 35 people, 2...",F,4025,partner,Food,0,"[cristo, vive, large, group, 35, people, 20, h..."
...,...,...,...,...,...,...,...,...
6813,Kenya,Zipporah Wanjiku Wambu is 34 years old and a m...,F,550,lender,Food,1,"[zipporah, wanjiku, wambu, 34, year, old, moth..."
6814,Kenya,Zirah Ateso Achara is 38 years old she is marr...,F,1000,lender,Food,1,"[zirah, ateso, achara, 38, year, old, married,..."
6815,Kenya,Zuleah Amoit is a member in one of the active ...,F,325,lender,Retail,1,"[zuleah, amoit, member, one, active, group, pe..."
6816,Kenya,Zuleya is an active member of PEMCI. She has s...,F,450,partner,Retail,0,"[zuleya, active, member, pemci, successfully, ..."


# Create Dictionary from the articles

In [22]:
dictionary = corpora.Dictionary(df['en_clean'])
#Total number of non-zeroes in the BOW matrix (sum of the number of unique words per document over the entire corpus).
print(dictionary.num_nnz)

532277


# Create Document Term Matrix

In [24]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df['en_clean'] ]
print(len(doc_term_matrix))

6818


# Creating LDA Model

In [26]:
lda = gensim.models.ldamodel.LdaModel

# Fit LDA model on the dataset

In [38]:
num_topics=5
% time ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

CPU times: user 2min 52s, sys: 1.29 s, total: 2min 54s
Wall time: 2min 52s


# Saving the model

In [42]:
ldamodel.save('lda.pkl')

#Printing Model Topics

In [39]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.021*"rice" + 0.018*"farmer" + 0.013*"land" + 0.013*"p" + 0.013*"loan" + 0.011*"sector" + 0.010*"baba" + 0.010*"de" + 0.010*"also" + 0.010*"area"'),
 (1,
  '0.028*"business" + 0.021*"p" + 0.018*"year" + 0.014*"loan" + 0.010*"work" + 0.010*"child" + 0.010*"home" + 0.010*"kiva" + 0.009*"sell" + 0.009*"product"'),
 (2,
  '0.024*"loan" + 0.022*"business" + 0.012*"child" + 0.011*"sell" + 0.010*"group" + 0.009*"community" + 0.009*"year" + 0.009*"small" + 0.008*"woman" + 0.008*"clothing"'),
 (3,
  '0.027*"business" + 0.019*"loan" + 0.013*"child" + 0.013*"u" + 0.011*"school" + 0.010*"year" + 0.010*"family" + 0.009*"able" + 0.008*"stock" + 0.007*"buy"'),
 (4,
  '0.025*"loan" + 0.022*"group" + 0.019*"child" + 0.015*"member" + 0.013*"school" + 0.013*"year" + 0.011*"business" + 0.010*"woman" + 0.010*"married" + 0.010*"dairy"')]

# Topic Model Visualization

In [40]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
