Importing Required Libraries

In [None]:
import string
import pandas as pd

import fr_core_news_md
import de_core_news_md

from stop_words import get_stop_words
from nltk.corpus import stopwords

import gensim
from gensim.models import CoherenceModel
from gensim.models import Phrases

import pyLDAvis
import pyLDAvis.gensim_models


Preparing files and NLP - run exactly ONE of the two cells below (FR or DE), matching the language of the dataset

In [None]:
# FR: spaCy pipeline without parser/NER, union of two French stop-word lists,
# and the ads read from AdsFullFR3.csv
nlp = fr_core_news_md.load(disable=['parser', 'ner'])
stop_words = set(get_stop_words('french')).union(stopwords.words('french'))
df = pd.read_csv('AdsFullFR3.csv')

In [None]:
# DE: spaCy pipeline without parser/NER, union of two German stop-word lists,
# and the ads read from AdsFullDE3.csv
nlp = de_core_news_md.load(disable=['parser', 'ner'])
stop_words = set(get_stop_words('german')).union(stopwords.words('german'))
df = pd.read_csv('AdsFullDE3.csv')

Clean Text

In [None]:
def clean_text(text):
  """
  Cleans the text by removing punctuation, numbers and very short words

  text: raw description; any non-string value (e.g. the NaN pandas uses
        for an empty CSV cell, or None) is treated as an empty description
  Returns the remaining words, lower-cased and joined by single spaces
  """
  # Empty CSV cells arrive as float NaN, which has no .translate method.
  if not isinstance(text, str):
    return ''
  # Punctuation is deleted (not replaced by a space), so "l'offre" -> "loffre".
  table = str.maketrans('', '', string.punctuation)
  words = text.translate(table).split()
  # Keep only words longer than 3 characters that are not pure digit strings.
  kept = [w for w in words if len(w) > 3 and not w.isdigit()]
  return ' '.join(kept).lower()

In [None]:
# Clean every ad description element-wise, replacing the column in place.
df['Description'] = df['Description'].map(clean_text)

Remove Stopwords

In [None]:
def remove_stopwords(text):
  """
  Drops every word of `text` that appears in the global `stop_words` set

  The text is split on single spaces and the surviving words are joined
  back together with single spaces.
  """
  kept_words = []
  for word in text.split(' '):
    if word in stop_words:
      continue
    kept_words.append(word)
  return " ".join(kept_words)

In [None]:
# Strip stop words from every cleaned description, replacing the column in place.
df['Description'] = df['Description'].map(remove_stopwords)

Text to list

In [None]:
# Plain Python list of the preprocessed descriptions, one string per ad.
text_list = df['Description'].to_list()

In [None]:
# Spot-check the preprocessing on the third description.
sample_text = text_list[2]
print('Test list: ', sample_text)

Lemmatization

In [None]:
def lemmatization(texts,allowed_postags=('NOUN', 'ADJ', 'VERB')):
  """
  Lemmatizes each text and keeps only tokens with an allowed part-of-speech

  texts: iterable of strings, one per document
  allowed_postags: collection of coarse POS tags (compared to token.pos_);
                   the default is an immutable tuple rather than a shared list
  Returns a list holding one list of lemmas per input text, in input order
  """
  # nlp.pipe streams the texts through the global spaCy pipeline in batches,
  # instead of invoking the pipeline separately for every document.
  return [[token.lemma_ for token in doc if token.pos_ in allowed_postags]
          for doc in nlp.pipe(texts)]

In [None]:
# One list of lemmas per ad, keeping the default POS filter.
tokenized_ads = lemmatization(texts=text_list)

In [None]:
# Learn phrases from the lemmatized ads, then add each ad's detected
# phrases (tokens joined by '_') to that ad's own token list.
bigram = Phrases(tokenized_ads, min_count=25)
for ad_tokens in tokenized_ads:
    # Collect the phrases first: the list being scanned is the one extended.
    detected = [tok for tok in bigram[ad_tokens] if '_' in tok]
    ad_tokens.extend(detected)
print('List[2]: ', tokenized_ads[2])

Document Term Frequency conversion

In [None]:
# Build the vocabulary, prune it with filter_extremes, then encode
# every ad as a bag-of-words vector.
id2word = gensim.corpora.Dictionary(tokenized_ads)
id2word.filter_extremes(no_below=20, no_above=0.5)
corpus = list(map(id2word.doc2bow, tokenized_ads))

Creating an LDA Object from the Gensim Library

In [None]:
# Short alias for gensim's LDA model class (the same class that
# gensim.models.ldamodel defines).
LDA = gensim.models.LdaModel

Initialising LDA model as lda_model (10 topics)

In [None]:
# Train the topic model on the bag-of-words corpus.
lda_model = LDA(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,  # fixed seed
    chunksize=1000,
    passes=100,
    iterations=250,
)

In [None]:
# Print top topics
# print_topics() returns (topic_id, formatted terms) pairs. As a bare expression
# in the middle of a cell that result was discarded, so print each topic.
for topic_id, topic_terms in lda_model.print_topics():
    print('Topic', topic_id, '->', topic_terms)
print('Top topics:',lda_model.top_topics(corpus))

Perplexity

In [None]:
# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself; gensim defines the perplexity as 2 ** (-bound).
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

Coherence

In [None]:
# Score the trained model with the 'u_mass' coherence measure.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=tokenized_ads,
    dictionary=id2word,
    coherence='u_mass',
    processes=0,
)
coherence_lda = coherence_model_lda.get_coherence()

In [None]:
# The score was already computed by get_coherence() in the preceding cell, so
# the 'Calculating coherence...' message (printed after the fact) is dropped.
print('Coherence: ', coherence_lda)

Visualising the Data

In [None]:
# Render the interactive topic visualisation inline in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)
vis

In [None]:
# Optional feature to save graph as html file
# The file is named after the loaded spaCy pipeline's language code (nlp.lang),
# so a German run is saved as 'lda_de.html' instead of reusing 'lda_fr.html'.
pyLDAvis.save_html(vis, 'lda_{}.html'.format(nlp.lang))