In [1]:
import sqlite3
import pandas as pd
from gensim import corpora, models
import re
import numpy as np



In [2]:
conn = sqlite3.connect('2012_to_2017_deputat_speakers.db')
query = "SELECT * FROM filtered_rows"  
filtered_rows = pd.read_sql(query, conn)
conn.close()


In [3]:
class TransformDataForLDA:
    """
    This class provides methods for handling and transforming features in data
    for Latent Dirichlet Allocation (LDA).
    """

    def __init__(self,filtered_rows=filtered_rows):
        """
        Initializes the TransformDataForLDA class.
        """
        import nltk
        from nltk.corpus import stopwords 
        from nltk.stem import WordNetLemmatizer
        import string
        self.df=filtered_rows
        # Load NLTK resources
        nltk.download('wordnet')
        nltk.download('stopwords')
        nltk.download('punkt')
        self.lemmatizer = WordNetLemmatizer()
        # Define stopwords and punctuation
        self.stop_words = set(stopwords.words('dutch'))
        self.punctuation = set(string.punctuation)
        self.words_to_remove =list(set(self.df['speaker_name'].str.title().unique())|
                                    set(self.df['speaker_name'].str.lower().unique())|
                                    set(self.df['party'].str.title().unique())|
                                    set(self.df['party'].str.lower().unique())|
                                    set(['we', 'heer', 'wij', 'mensen', 'voorzitter', 'minister', 'gaan', 'mevrouw', 
                                         'voorzitter', 'wet', 'kamer', 'gepubliceerd'])    
                                         )
        # self.words_to_remove = {
        #     'jank': ,
        #     'speaker_name': list(self.df['speaker_name'].str.title().unique()),
        #     'speaker_name_l': list(self.df['speaker_name'].str.lower().unique()),
        #     'party': list(self.df['party'].str.title().unique()),
        #     'party_l': list(self.df['party'].str.lower().unique())}
        print(self.words_to_remove )
        self.bigram_phraser = None
        self.trigram_phraser = None

    def preprocess_text(self, text):
        """
        Preprocesses the text data by tokenizing, lowercasing, removing stopwords, and removing punctuation.

        Args:
        - text (str): The text data to preprocess.

        Returns:
        - list: A list of preprocessed tokens.
        """
        from nltk.tokenize import word_tokenize
        from nltk.corpus import wordnet
        # Removing punctuation
        text = self.strip_punctuation(text)
        # Text tokenization
        tokens = word_tokenize(text) 
        # Casting to lowercase
        tokens = [token.lower() for token in tokens]
        # Removing stop words, punctuation marks and unnecessary words
        tokens = [token for token in tokens if token not in self.stop_words and token not in self.punctuation and token not in self.words_to_remove]
        # Lemmatization of words
        tokens = [self.lemmatizer.lemmatize(token, wordnet.VERB) for token in tokens]
        # Removing numbers
        tokens = [token for token in tokens if not token.isdigit()]
        # Conversion to bigrams and trigrams
        tokens = self.bigram_phraser[tokens]
        tokens = self.trigram_phraser[tokens]
        return tokens

    

    def strip_punctuation(self, text):
        """
        Strips punctuation marks from the text.

        Args:
        - text (str): The text data to strip punctuation from.

        Returns:
        - str: The text data without punctuation marks.
        """
        return re.sub(r'[^\w\s]', '', text)
    

    def fit_phrasers(self, sentences):
         from gensim.models.phrases import Phrases, Phraser
         # Creating a bigram model
         bigram = Phrases(sentences, min_count=5, threshold=10)
         self.bigram_phraser = Phraser(bigram)
         
         # Creating a trigram model using an already created bigram model
         trigram = Phrases(self.bigram_phraser[sentences], min_count=5, threshold=10)
         self.trigram_phraser = Phraser(trigram)



In [4]:
transformer = TransformDataForLDA()
transformer.fit_phrasers(filtered_rows['speech'])
filtered_rows['processed_speech'] = filtered_rows['speech'].apply(transformer.preprocess_text)
print(filtered_rows['processed_speech'].head())


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\elper\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elper\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\elper\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['Ahmed Marcouch', 'André Bosman', 'Rens Raemakers', 'Sgp', 'Jesse Klaver', 'Malik Azmani', 'tjeerd van dekken', 'wouter koolmees', 'dion graus', 'uri rosenthal', 'henk leenders', 'Leendert De Lange', 'gerd leers', 'Amma Asante', 'Anouchka Van Miltenburg', 'Tjitske Siderius', 'onafhankelijk', 'Marianne Thieme', 'Frank Wassenberg', 'Arno Rutte', 'ockje tellegen', 'yasemin cegerek', 'angelien eijsink', 'gabriëlle popken', 'ronald vuijk', 'pieter omtzigt', 'jan de wit', 'gidi markuszower', 'arno rutte', 'henk kamp', 'Bram Van Ojik', 'Bente Becker', 'raymond de roon', 'cem lacin', 'Alfonso Boekhoudt', 'otwin van dijk', 'cda', 'Han Ten Broeke', 'tjeenk willink', 'Arie Slob', 'henk bleker', 'maarten hijink', 'Kees Verhoeven', 'jeroen recourt', 'reinette klever', 'martin bosma', 'leendert de lange', 'norbert klein', 'Lammert Van Raan', 'sophia theodora marianne hermans', 'bnl', 'cu', 'jetta klijnsma', 'Betty De Boer', 'Ronald Plasterk', 'Sietse Fritsma', 'Lodewijk Asscher', 'Machiel De Graaf'

In [5]:
# Creation of a dictionary and corpus
dictionary = corpora.Dictionary(filtered_rows['processed_speech'])
dictionary.filter_extremes(no_below=3,no_above=0.4)
corpus = [dictionary.doc2bow(text) for text in filtered_rows['processed_speech']]
import pickle
import os
directory = 'lda_models'
if not os.path.exists(directory):
    os.makedirs(directory)

with open(os.path.join(directory, 'corpus'), 'wb') as f:
    pickle.dump(corpus, f)

# Model training LDA 30 topics
lda_model = models.LdaModel(corpus=corpus,
                             num_topics=30,
                               id2word=dictionary,
                                 passes=15,
                                 random_state=10)

lda_model.save(os.path.join(directory, 'ldamodel_30'))
np.save(os.path.join(directory, 'ldamodel_30'), lda_model.expElogbeta)

topics=lda_model.show_topics(num_topics=30,
                             num_words=100,
                             formatted=False
                             )
with open(os.path.join(directory, 'lda_30_topics'),'wb')as f:
    pickle.dump(topics,f)


In [6]:
# # Model training LDA 50 topics
# lda_model = models.LdaModel(corpus=corpus,
#                              num_topics=50,
#                                id2word=dictionary,
#                                  passes=15,
#                                  random_state=10)

# lda_model.save(os.path.join(directory, 'ldamodel_50'))
# np.save('expElogbeta_50.npy', lda_model.expElogbeta)

# topics=lda_model.show_topics(num_topics=50,
#                              num_words=100,
#                              formatted=False
#                              )
# with open(os.path.join(directory, 'lda_50_topics'),'wb')as f:
#     pickle.dump(topics,f)


In [7]:
# # Model training LDA 10 topics
# lda_model = models.LdaModel(corpus=corpus,
#                              num_topics=10,
#                                id2word=dictionary,
#                                  passes=15,
#                                  random_state=10)

# lda_model.save(os.path.join(directory, 'ldamodel_10'))
# np.save('expElogbeta_10.npy', lda_model.expElogbeta)

# topics=lda_model.show_topics(num_topics=10,
#                              num_words=100,
#                              formatted=False
#                              )
# with open(os.path.join(directory, 'lda_10_topics'),'wb')as f:
#     pickle.dump(topics,f)
