Importing Required Libraries

In [1]:
import string
import pandas as pd

import fr_core_news_md
import de_core_news_md

from stop_words import get_stop_words
from nltk.corpus import stopwords

import gensim
from gensim.models import CoherenceModel
from gensim.models import Phrases

import pyLDAvis
import pyLDAvis.gensim_models


  from .autonotebook import tqdm as notebook_tqdm


Preparing files and NLP - Choose appropriate language

In [2]:
# FR
nlp = fr_core_news_md.load(disable=['parser', 'ner'])
stop_words = set(get_stop_words('french')) | set(stopwords.words('french'))
df = pd.read_csv('AdsFullFR3.csv')

In [None]:
# DE
nlp = de_core_news_md.load(disable=['parser', 'ner'])
stop_words = set(get_stop_words('german')) | set(stopwords.words('german'))
df = pd.read_csv('AdsFullDE3.csv')

Clean Text

In [3]:
def clean_text(text):
  """
  Cleans the text by removing unnecessary punctuation symbols
  """
  delete_dict = {sp_char: '' for sp_char in string.punctuation}
  delete_dict[' '] =' '
  table = str.maketrans(delete_dict)
  text1 = text.translate(table)
  textArr= text1.split()
  text2 = ' '.join([w for w in textArr if ( not w.isdigit() and
                                           ( not w.isdigit() and len(w)>3))])
  return text2.lower()

In [4]:
df['Description'] = df['Description'].apply(clean_text)

Remove Stopwords

In [5]:
def remove_stopwords(text):
  """
  Removes stopwords from a parameter text
  """
  textArr = text.split(' ')
  rem_text = " ".join([i for i in textArr if i not in stop_words])
  return rem_text

In [6]:
df['Description'] = df['Description'].apply(remove_stopwords)

Text to list

In [7]:
text_list = df['Description'].tolist()

In [8]:
print('Test list: ',text_list[2])

Test list:  opérations compagnie assurance décès entière participation fgt0 bénéfices réalises assurance entière têtes assurance temporaire assurance survie assurances mixtes assurances rentes viagères immédiates têtes rentes viagères capitaux différés rentes viagères différées capitaux différés dots enfants plus amples renseignements remise prospectus gratis sadresser lagentgénéral sclirœll stphilippe maison wirtgen luxembourgeois assurés auprès nationale déjà trois réparitions bénfices équivalant versements


Lemmatization

In [9]:
def lemmatization(texts,allowed_postags=['NOUN', 'ADJ', 'VERB']):
	output = []
	for sent in texts:
		doc = nlp(sent)
		output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
	return output

In [11]:
tokenized_ads = lemmatization(text_list)

In [12]:
bigram = Phrases(tokenized_ads, min_count=25)
for idx in range(len(tokenized_ads)):
    for token in bigram[tokenized_ads[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            tokenized_ads[idx].append(token)
print('List[2]: ',tokenized_ads[2])

List[2]:  ['opération', 'compagnie', 'assurance', 'décès', 'entier', 'participation', 'fgt0', 'bénéfice', 'réaliser', 'assurance', 'entier', 'tête', 'assurance', 'temporaire', 'assurance', 'survi', 'assurance', 'mixte', 'assurance', 'rente', 'viager', 'immédiat', 'tête', 'rente', 'viager', 'capital', 'différé', 'rente', 'viager', 'différé', 'capital', 'différé', 'enfant', 'ample', 'renseignement', 'remettre', 'prospectus', 'grati', 'sadresser', 'lagentgénéral', 'maison', 'luxembourgeois', 'assurer', 'national', 'réparitier', 'bénfice', 'équivaloir', 'versement', 'opération_compagnie', 'assurance_décès', 'entier_participation', 'bénéfice_réaliser', 'assurance_entier', 'tête_assurance', 'temporaire_assurance', 'survi_assurance', 'mixte_assurance', 'rente_viager', 'immédiat_tête', 'rente_viager', 'capital_différé', 'rente_viager', 'différé_capital', 'différé_enfant', 'ample_renseignement', 'remettre_prospectus', 'sadresser_lagentgénéral', 'luxembourgeois_assurer', 'équivaloir_versement']


Document Term Frequency conversion

In [13]:
id2word = gensim.corpora.Dictionary(tokenized_ads)
id2word.filter_extremes(no_below=20, no_above=0.5)
corpus = [id2word.doc2bow(rev) for rev in tokenized_ads]

Creating an LDA Object from the Gensim Library

In [14]:
LDA = gensim.models.ldamodel.LdaModel

Initialising LDA model as lda_model (5 topics as default)

In [15]:
lda_model = LDA(corpus=corpus, id2word=id2word,
                num_topics=10, random_state=100,
                chunksize=1000, passes=100,iterations=250)

In [16]:
# Print top topics
lda_model.print_topics()
print('Top topics:',lda_model.top_topics(corpus))

Top topics: [([(0.009035212, 'santé'), (0.008049848, 'nerf'), (0.007857089, 'tasse'), (0.007690646, 'souffrance'), (0.0069398563, 'arabica'), (0.006016322, 'digestion'), (0.0056558293, 'repas'), (0.005421685, 'lestomac'), (0.005110824, 'gastralgie'), (0.005079033, 'remède'), (0.004633199, 'mauvais'), (0.0045205425, 'irritation'), (0.004210777, 'toux'), (0.0042038257, 'guérir'), (0.0040918067, 'tête'), (0.0040906905, 'rein'), (0.004004669, 'revalenta'), (0.003969617, 'constipation'), (0.0039694216, 'névralgie'), (0.003906678, 'opérer')], -0.49738264854062136), ([(0.016621964, 'maison'), (0.01520073, 'vente'), (0.013605085, 'lieu'), (0.012844302, 'notair'), (0.01006228, 'situer'), (0.008962641, 'heure'), (0.008806027, 'jardin'), (0.0082636, 'are'), (0.0076418323, 'terre'), (0.007637101, 'sieur'), (0.0072462545, 'public'), (0.007005537, 'vendre'), (0.006822117, 'relever'), (0.006545693, 'soussigner'), (0.0064850836, 'notaire'), (0.0064360257, 'requête'), (0.006103685, 'sadresser'), (0.005

Perplexity

In [17]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus)) 


Perplexity:  -8.118164415367433


Coherence

In [18]:
coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=tokenized_ads, dictionary=id2word ,
                                     coherence='u_mass', processes=0)
coherence_lda = coherence_model_lda.get_coherence()

In [19]:
print('Calculating coherence...')
print('Coherence: ', coherence_lda)

Calculating coherence...
Coherence:  -1.699912817358276


Visualising the Data

In [20]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

  default_term_info = default_term_info.sort_values(


In [21]:
# Optional feature to save graph as html file
pyLDAvis.save_html(vis, 'lda_fr.html')