# LDA topic modelling

In [None]:
# importing packages
import os
import pandas as pd
import ast
import pickle 

#Importing packages for LDA
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore

In [None]:
# Set the working directory to the project folder. The path is absolute and
# machine-specific (Windows); the relative 'topic model\...' paths used below
# for loading the data and saving the models are resolved against it.
os.chdir(r'C:\Users\maril\Documents\20-21 KU\block 4\DM\twitter')

### Load data

The hSBM found the following number of topics in each of the datasets:
* German dataset: 60 topics (note: random sample of 20,000 tweets, seed=40, n_min=2)
* Polish dataset: 66 topics (note: random sample of 20,000 tweets, seed=40, n_min=2)
* Danish dataset: 17 topics (note: full sample, n_min=0)

In [None]:
# Load the German dataframe from CSV, print its (rows, columns) shape and
# show the first three rows as the cell output.
de = pd.read_csv(r'topic model\de_hsbm_data.csv')
print(de.shape)
de.head(3)

In [None]:
# Load the Danish dataframe from CSV, print its (rows, columns) shape and
# show the first three rows as the cell output.
da = pd.read_csv(r'topic model\da_hsbm_data.csv')
print(da.shape)
da.head(3)

In [None]:
# Load the Polish dataframe from CSV, print its (rows, columns) shape and
# show the first three rows as the cell output.
pl = pd.read_csv(r'topic model\pl_hsbm_data.csv')
print(pl.shape)
pl.head(3)

In [None]:
# function to turn the tokenized list into a readable format
def string_list(text):
    """Convert the string representation of a list into an actual list.

    Parameters
    ----------
    text : str or list
        The literal text of a Python list (e.g. "['a', 'b']"), or a list
        that has already been converted.

    Returns
    -------
    list
        The value parsed from `text`; a list passed in is returned unchanged.

    Raises
    ------
    ValueError, SyntaxError
        If `text` is neither a list nor a string holding a valid Python
        literal.
    """
    # Already converted (e.g. the apply cell was run a second time):
    # ast.literal_eval raises on a list, so hand it back untouched.
    if isinstance(text, list):
        return text

    # we transform the string representation of the list into an actual list
    return ast.literal_eval(text)

In [None]:
# Parse the 'lemma_uni_bi' column of each dataframe: every cell is passed
# through string_list, turning its string form into an actual Python list.
de['lemma_uni_bi'] = de['lemma_uni_bi'].apply(string_list)
da['lemma_uni_bi'] = da['lemma_uni_bi'].apply(string_list)
pl['lemma_uni_bi'] = pl['lemma_uni_bi'].apply(string_list)

For the German and Polish data: subsample to match the hSBM subsamples (20,000 tweets each, seed=40). The Danish data is used in full.

In [None]:
# Draw a random sample of 20,000 rows from the German and the Polish data,
# with a fixed seed (random_state=40) so the same rows are drawn on every run.
# The Danish dataframe is not sampled here.
de = de.sample(n=20000, random_state=40)
pl = pl.sample(n=20000, random_state=40)

### Create corpus

In [None]:
# Create an id2word dictionary for each language

# Each dictionary is built from the column holding the unigram and bigram tokens
de_id2word = Dictionary(de['lemma_uni_bi']) 
da_id2word = Dictionary(da['lemma_uni_bi'])
pl_id2word = Dictionary(pl['lemma_uni_bi'])

# Print the vocabulary size (number of entries) of each dictionary
print('German data:', len(de_id2word))
print('Danish data:', len(da_id2word))
print('Polish data:', len(pl_id2word))

In [None]:
# Use filter_extremes to prune each vocabulary in place.
# no_below is the minimum number of documents a token must appear in: 2 for the
# German and Polish data, 0 (no lower cut-off) for the Danish data. These values
# match the n_min settings listed for the hSBM runs in the "Load data" notes.
# no_above=1 means a token may appear in up to 100% of the documents, i.e. no
# token is removed for being too frequent.
# NOTE(review): filter_extremes also has a keep_n argument that is left at its
# default here -- check the gensim docs to confirm it does not cap the vocabulary.
de_id2word.filter_extremes(no_below=2, no_above=1)
da_id2word.filter_extremes(no_below=0, no_above=1)
pl_id2word.filter_extremes(no_below=2, no_above=1)

# Print the vocabulary size of each dictionary after filtering
print('German data:', len(de_id2word))
print('Danish data:', len(da_id2word))
print('Polish data:', len(pl_id2word))

In [None]:
# Build the corpus for each language: every document's token list is passed
# through the doc2bow method of that language's dictionary.
de_corpus = list(map(de_id2word.doc2bow, de['lemma_uni_bi']))
da_corpus = list(map(da_id2word.doc2bow, da['lemma_uni_bi']))
pl_corpus = list(map(pl_id2word.doc2bow, pl['lemma_uni_bi']))

### Run model

In [None]:
# Specify the number of topics for each language's LDA model.
# NOTE(review): the "Load data" notes report 17 topics for the Danish hSBM,
# but da_topics is set to 21 -- confirm which of the two values is correct.
de_topics = 60
da_topics = 21
pl_topics = 66

In [None]:
# Fit one LDA model per language, using the topic counts set above.
# NOTE(review): no random_state is passed, so results may differ between
# runs -- confirm whether a fixed seed is wanted.
de_lda_model = LdaMulticore(
    corpus=de_corpus,
    id2word=de_id2word,
    num_topics=de_topics,
    passes=1,
    iterations=50,
)
da_lda_model = LdaMulticore(
    corpus=da_corpus,
    id2word=da_id2word,
    num_topics=da_topics,
    passes=1,
    iterations=50,
)
pl_lda_model = LdaMulticore(
    corpus=pl_corpus,
    id2word=pl_id2word,
    num_topics=pl_topics,
    passes=1,
    iterations=50,
)

In [None]:
# Save each fitted model to disk with pickle.
# Every file is opened in a `with` block so its handle is flushed and closed
# as soon as the dump finishes, instead of being left open.
with open(r'topic model\de_lda_sample20_nmin2.sav', 'wb') as f:
    pickle.dump(de_lda_model, f)
with open(r'topic model\da_lda_all_nmin0.sav', 'wb') as f:
    pickle.dump(da_lda_model, f)
with open(r'topic model\pl_lda_sample20_nmin2.sav', 'wb') as f:
    pickle.dump(pl_lda_model, f)