# Topic modeling with gensim

In [31]:
import os
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel
from gensim.models.phrases import Phrases
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models

In [35]:
# Tokens removed from every document before modeling.
# NOTE(review): 'zNUM' and 'questionmark' look like placeholders inserted by an
# upstream preprocessing step -- confirm against the lemmatization code.
excluded_tokens = {'zNUM', 'questionmark'}

# Load the mails; the `text_lem` column holds one whitespace-separated string
# of tokens per mail.
mails = pd.read_json('data/mails_lemmatized.json')

# Tokenize each mail. `split()` with no argument splits on any run of
# whitespace and never yields empty strings, whereas `split(' ')` produces ''
# tokens for repeated spaces or empty documents; those would end up as a bogus
# entry in the vocabulary built later.
texts = [
    [token for token in doc.split() if token not in excluded_tokens]
    for doc in mails['text_lem']
]

  and should_run_async(code)


In [36]:
# Learn frequent two-word collocations from the tokenized mails and merge each
# detected pair into a single token (the two words are joined with "_").
bigram = Phrases(texts)

# A single pass is sufficient. The original cell applied the same model a
# second time; that pass cannot build longer phrases because the model only
# knows the word pairs counted during training. To obtain three-word phrases,
# train a second Phrases model on the merged output instead.
#
# NOTE: this rebinds `texts`, so re-running this cell on its own trains the
# model on already-merged tokens. Re-run from the loading cell to reset.
texts = [bigram[doc] for doc in texts]

  and should_run_async(code)


In [37]:
# Quick sanity check: show the token list of the first document.
first_doc = texts[0]
print(first_doc)

['1ere', 'relecture', 'gt_conso', 'yohan', 'temps', 'clair', 'mettre', 'decibel', 'evolution', 'faire', 'expres', 'mettre', 'evolution', 'correctif', 'cdt_nadege', 'plonquet_responsable', 'domaine_decisionnel', 'souscription', 'direction', 'projet_support', 'metier', 'direction', 'technique_assurance']


  and should_run_async(code)


In [38]:
# Build the vocabulary (token <-> integer id mapping) from all documents.
dictionnary = Dictionary(texts)

# Encode every document as a bag of words: a list of (token_id, count) pairs.
corpus = list(map(dictionnary.doc2bow, texts))

# Inspect the encoding of the second document.
print(corpus[1])

  and should_run_async(code)


[(9, 3), (20, 1), (21, 1), (22, 1), (23, 1), (24, 3), (25, 1), (26, 1), (27, 1), (28, 1), (29, 4), (30, 1), (31, 1), (32, 1), (33, 3), (34, 3), (35, 1), (36, 1), (37, 1), (38, 2), (39, 5), (40, 1), (41, 1), (42, 1), (43, 2), (44, 1), (45, 1), (46, 1), (47, 2), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 5), (66, 1), (67, 1), (68, 2), (69, 1), (70, 1), (71, 1), (72, 2), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 3), (84, 2), (85, 1), (86, 1), (87, 2), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1)]


## Latent Semantic Indexing

In [39]:
# Fit a 10-topic Latent Semantic Indexing model on the bag-of-words corpus,
# using the dictionary to map token ids back to words.
lsi_model = LsiModel(corpus, id2word=dictionnary, num_topics=10)

# Display the first five topics as weighted word combinations.
lsi_model.show_topics(5)

  and should_run_async(code)


[(0,
  '0.697*"tr_dtm" + 0.258*"cr" + 0.232*"prdt_corr" + 0.178*"direction" + 0.171*"periode_annee" + 0.165*"dtm_crdp" + 0.165*"periode_CD" + 0.147*"mois_arrete" + 0.146*"vision_and" + 0.131*"then_when"'),
 (1,
  '0.418*"direction" + -0.337*"tr_dtm" + 0.244*"bonjour" + 0.225*"decibel" + 0.202*"re" + 0.195*"pouvoir" + 0.168*"donnee" + 0.153*"metier" + 0.147*"technique_assurance" + 0.134*"faire"'),
 (2,
  '-0.524*"ok" + 0.364*"direction" + 0.208*"technique_assurance" + -0.181*"code" + -0.163*"ligne" + -0.155*"table" + 0.140*"projet_support" + 0.128*"re" + -0.122*"hm" + -0.115*"contrat"'),
 (3,
  '0.649*"ok" + 0.301*"direction" + -0.164*"donnee" + 0.157*"projet_support" + 0.148*"technique_assurance" + 0.147*"metier" + -0.145*"code" + -0.140*"contrat" + 0.132*"dpec_den" + -0.129*"ligne"'),
 (4,
  '0.411*"cr" + 0.321*"donnee" + 0.212*"ok" + -0.189*"code" + -0.188*"ligne" + 0.185*"decibel" + -0.169*"direction" + -0.146*"acte" + 0.138*"tribox" + -0.138*"metier"')]

## Hierarchical Dirichlet Process

In [40]:
# Fit a Hierarchical Dirichlet Process model; unlike LSI/LDA the number of
# topics is not passed in by the caller.
# `random_state` is fixed so that Restart & Run All reproduces the same topics;
# without it every run yields a different result. The value 42 is arbitrary.
hdp_model = HdpModel(corpus=corpus, id2word=dictionnary, random_state=42)

# Display only the first five of the returned topics.
hdp_model.show_topics()[:5]

  and should_run_async(code)


[(0,
  '0.024*direction + 0.014*bonjour + 0.014*re + 0.011*decibel + 0.010*pouvoir + 0.009*technique_assurance + 0.008*cr + 0.008*metier + 0.007*faire + 0.007*projet_support + 0.007*donnee + 0.006*cordialement + 0.005*bien + 0.005*souscription + 0.004*harmonie_mutuel + 0.004*probleme + 0.004*cas + 0.004*contrat + 0.004*tribox + 0.004*ligne'),
 (1,
  '0.016*direction + 0.012*bonjour + 0.011*re + 0.010*donnee + 0.008*pouvoir + 0.007*decibel + 0.006*cr + 0.005*operation_transformation + 0.005*corps + 0.005*cordialement + 0.004*faire + 0.004*bureau_pierre + 0.004*metier + 0.004*cordialement_erwan + 0.004*cotisation + 0.004*synergie + 0.004*table + 0.004*pax_support + 0.003*questel_dec + 0.003*probleme'),
 (2,
  '0.012*tr_dtm + 0.012*direction + 0.009*re + 0.008*cr + 0.008*bonjour + 0.005*donnee + 0.005*decibel + 0.005*code + 0.005*pouvoir + 0.005*ligne + 0.005*faire + 0.004*technique_assurance + 0.004*periode + 0.004*prdt_corr + 0.004*metier + 0.003*projet_support + 0.003*cordialement + 0.

## Latent Dirichlet Allocation

In [41]:
# Fit a 10-topic Latent Dirichlet Allocation model on the bag-of-words corpus.
# LDA training is stochastic: `random_state` is fixed so that the topics (and
# the visualization built from this model) are reproducible across kernel
# restarts. The value 42 is arbitrary.
lda_model = LdaModel(
    corpus=corpus,
    num_topics=10,
    id2word=dictionnary,
    random_state=42,
)

# Display five of the fitted topics with their top words.
lda_model.show_topics(5)

  and should_run_async(code)


[(2,
  '0.022*"harmonie_mutuel" + 0.019*"code" + 0.012*"pouvoir" + 0.012*"bonjour" + 0.011*"horizon" + 0.010*"commande" + 0.010*"ligne" + 0.008*"reporting" + 0.008*"chable_brice" + 0.007*"savoir"'),
 (3,
  '0.021*"cordialement" + 0.018*"bonjour" + 0.017*"re" + 0.016*"direction" + 0.013*"bien" + 0.012*"cse" + 0.010*"stephan_cabrol" + 0.010*"elu_cse" + 0.009*"groupe_vyv" + 0.009*"tr_dtm"'),
 (0,
  '0.029*"direction" + 0.026*"harmonie_mutuel" + 0.017*"technique_assurance" + 0.017*"entreprise" + 0.015*"cr" + 0.015*"re" + 0.014*"bonjour" + 0.011*"decibel" + 0.010*"faire" + 0.009*"pouvoir"'),
 (1,
  '0.038*"essentiel_information" + 0.036*"harmonie_mutuel" + 0.018*"cliquer_ici" + 0.016*"voir_navigateur" + 0.015*"collaborateur" + 0.015*"rh" + 0.014*"demarche" + 0.011*"espace" + 0.011*"travail" + 0.011*"minute"'),
 (7,
  '0.027*"pouvoir_emmener" + 0.027*"savoir_contenu" + 0.027*"courriel_provenir" + 0.027*"malveillant_ouvrir" + 0.027*"site_web" + 0.027*"cliquer_lien" + 0.027*"reconnaitre_expedi

## Topic Visualization

In [42]:
# Make pyLDAvis render its output inline in the notebook.
pyLDAvis.enable_notebook()

# Prepare the interactive view of the LDA model from the corpus and
# dictionary; as the last expression of the cell it is displayed.
lda_vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionnary)
lda_vis

  and should_run_async(code)
