Importing Required Libraries

In [2]:
import string
import pandas as pd

import fr_core_news_md
import de_core_news_md

from stop_words import get_stop_words
from nltk.corpus import stopwords

import gensim
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models


Preparing files and NLP - Choose appropriate language

In [None]:
# FR
nlp = fr_core_news_md.load(disable=['parser', 'ner'])
stop_words = set(get_stop_words('french')) | set(stopwords.words('french'))
df = pd.read_csv('AdsFullFR3.csv')

In [3]:
# DE
nlp = de_core_news_md.load(disable=['parser', 'ner'])
stop_words = set(get_stop_words('german')) | set(stopwords.words('german'))
df = pd.read_csv('AdsFullDE3.csv')

Clean Text

In [4]:
def clean_text(text):
  """
  Cleans the text by removing unnecessary punctuation symbols
  """
  delete_dict = {sp_char: '' for sp_char in string.punctuation}
  delete_dict[' '] =' '
  table = str.maketrans(delete_dict)
  text1 = text.translate(table)
  textArr= text1.split()
  text2 = ' '.join([w for w in textArr if ( not w.isdigit() and
                                           ( not w.isdigit() and len(w)>3))])
  return text2.lower()

In [5]:
df['Description'] = df['Description'].apply(clean_text)

Remove Stopwords

In [6]:
def remove_stopwords(text):
  """
  Removes stopwords from a parameter text
  """
  textArr = text.split(' ')
  rem_text = " ".join([i for i in textArr if i not in stop_words])
  return rem_text

In [7]:
df['Description'] = df['Description'].apply(remove_stopwords)

Text to list

In [8]:
text_list = df['Description'].tolist()

In [10]:
print('Test list: ',text_list[2])

Test list:  rmorkaniine michel fünck luxemburg aaüi fabrikpreise jftjf gr°s3lierbogthimis3lier8ogthimi luxemburg saargemünder ceramischen produkt« plat kaminröhren pavés rayétrottoirsteine ixxii «100 mosaikplatten farben gattungen ixxaii ltoflfl agtpôt harmornauren artxtxll srlrlr rumine platten fenstertabletten tischplatten ♦♦♦i tißchplatten pcllpr grösse gerber «chüne auswahl kaminen stehen fertig ixixll aufgerichtet einsicht stets vorräthig schie itxti spühlsteiüe dimensionen echte rheinische itttl tuffsteine wände portlandcement engüshe älr«n ss«rieungsröhre mosaikplatten erster qualität franken meter fabrikpreise grosse niederlage dachschiefern dachziegeln dachbrettern zlegelwaaren spfihlstefne ttresham ltofc compagnie angkùe elfte ttlttö dassurances cmjn siege compagnie jetcrp lmdm gemelndvowlngen snccnrsale9 nachbenannten tagen läßt royale bruxelles gemeindeverwaltung paar principes rationnels lesquels sont basées dinstag opérationsj april vor» importance fonds réserve mittags h

Lemmatization

In [11]:
def lemmatization(texts,allowed_postags=['NOUN', 'ADJ', 'VERB']):
	output = []
	for sent in texts:
		doc = nlp(sent)
		output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
	return output

In [12]:
tokenized_ads = lemmatization(text_list)

In [13]:
print('List[2]: ',tokenized_ads[2])

List[2]:  ['Plat', 'kaminröhr', 'Pavés', 'rayétrottoirstein', 'Ixxii', 'mosaikplatter', 'Farbe', 'gattung', 'Ixxaii', 'Harmornaur', 'Platt', 'Fenstertablett', 'Tischplatt', 'Tißchplatte', 'Grösse', 'chün', 'Auswahl', 'Kamine', 'stehen', 'aufgerichten', 'Einsicht', 'Schie', 'Spühlsteiüe', 'dimension', 'echt', 'rheinisch', 'Itttl', 'wände', 'Portlandcement', 'Rieungsröhre', 'mosaikplatten', 'erster', 'Qualität', 'Meter', 'Fabrikpreise', 'grosse', 'Niederlage', 'dachschiefern', 'dachziegeln', 'dachbrettern', 'zlegelwaaren', 'Spfihlstefne', 'Angkùe', 'elfter', 'Ttlttö', 'gemelndvowlng', 'snccnrsale9', 'Nachbenannt', 'Tag', 'lassen', 'Gemeindeverwaltung', 'Dinstag', 'Opérationsj', 'April', 'Fonds', 'Réserve', 'Holzschlage', 'Section', 'Distrikt', 'soin', 'Placement', 'Prudent', 'eich', 'Labstention', 'Spéculation', 'buchen', 'stèr', 'Scheit', 'Sadresser', 'Lagent', 'Gêné', 'oos', 'Späne', 'Donnerstag', 'industriell', 'ell', 'Oäli', 'birken', 'anonym', 'lich', 'Scheit', 'assemblée', 'heurer'

Document Term Frequency conversion

In [14]:
id2word = gensim.corpora.Dictionary(tokenized_ads)
corpus = [id2word.doc2bow(rev) for rev in tokenized_ads]

Creating an LDA Object from the Gensim Library

In [22]:
LDA = gensim.models.ldamodel.LdaModel

Initialising LDA model as lda_model (5 topics as default)

In [23]:
lda_model = LDA(corpus=corpus, id2word=id2word,
                num_topics=30, random_state=100,
                chunksize=1000, passes=5,iterations=50)

In [24]:
# Print top topics
lda_model.print_topics()
print('Top topics:',lda_model.top_topics(corpus))

Top topics: [([(0.027721558, 'Versteigerung'), (0.027026415, 'liegen'), (0.01794845, 'gelegen'), (0.016921815, 'Notar'), (0.0156244645, 'nennen'), (0.0151801, 'Wohnhaus'), (0.014910145, 'lassen'), (0.013338241, 'Garten'), (0.013078363, 'versteigern'), (0.011342371, 'Juni'), (0.011076798, 'finden'), (0.0107118, 'weisen'), (0.010675868, 'Herr'), (0.009092328, 'Haus'), (0.009067201, 'Scheune'), (0.009026517, 'Bann'), (0.008710405, 'Liebhaber'), (0.00826074, 'Wiese'), (0.008244326, 'Straße'), (0.007332339, 'Hand')], -1.4771247503635099), ([(0.016078927, 'Tasse'), (0.013810466, 'Jahr'), (0.009752105, 'Pfund'), (0.008931341, 'Mittel'), (0.008817666, 'machen'), (0.008768772, 'Gebrauch'), (0.008679193, 'leiden'), (0.008641425, 'Gesundheit'), (0.008471957, 'Magen'), (0.008173695, 'erbrechen'), (0.0075803944, 'Krankheit'), (0.0072912276, 'gut'), (0.0064047766, 'herstellen'), (0.006135784, 'husten'), (0.0061138757, 'Kraft'), (0.0060338713, 'kosten'), (0.0059487526, 'tassen'), (0.005762683, 'heile

Perplexity

In [25]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus)) 


Perplexity:  -24.838705621070137


Coherence

In [26]:
coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=tokenized_ads, dictionary=id2word ,
                                     coherence='u_mass', processes=4)

In [27]:
print('Calculating coherence...')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence: ', coherence_lda)

Calculating coherence...
Coherence:  -9.941766852430115


Visualising the Data

In [2]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

NameError: name 'pyLDAvis' is not defined