In [None]:
# Install openpyxl to open excel file
# !pip install openpyxl
# Install spacy for nlp task
#!pip install spacy
#!pip install pyLDAvis
#!python -m pip install -U gensim
#!pip install nltk

# Topic modeling of _De avond is ongemak_ by Marieke Lucas Rijneveld

In this notebook, part 2 (Deel II) of Marieke Lucas Rijneveld's novel _De avond is ongemak_ is analyzed using topic modeling. This notebook is part of a three-part analysis of the novel, which was created in the course of the master thesis ''Naar de overkant: Motieven en topics in Marieke Lucas Rijnevelds _De avond is ongemak_'', written in 2021-2022 at the University of Vienna (degree program: Nederlandistik). All comments are written in English. 

## 1. Create dataframe

### 1.1. Load the required libraries and packages

In [1]:
import os # Supports interaction with the operating system
import pandas as pd # Package to manipulate data structures
import re # Enables working with regular expressions

### 1.2. Convert text file into a dataframe
In a first step the raw text file is converted into a data frame, a two-dimensional table format, for further processing and subsequent analysis.


In [2]:
# Splits .txt file into chunks (sentences) and saves them row after row in a pandas dataframe
string = [] 

with open("data/texts/De_avond_is_ongemak_Deel_II.txt", "r", encoding='utf8') as f:
    full_text = f.read()
    for l in re.split(r"(\.)", full_text):
        if l != ".":
            string.append(l + "\n")
df = pd.DataFrame(string)

In [3]:
# .head() outputs the first 5 rows of the created data frame called "df"
df.head()

Unnamed: 0,0
0,DEEL II\n\n\n\n\n\n\t\t \t\t\t-\n\n\n\n\n\n1\n...
1,Ik vind die groenige bloemknopjes vies\n
2,En als je er een tussen duim en wijsvinger ka...
3,Met een stokje por ik in het mollige achterli...
4,"Er loopt een zwarte streep over zijn rug, hij..."


In [4]:
# Assign column name "text"
df.columns =['text']
df.head()

Unnamed: 0,text
0,DEEL II\n\n\n\n\n\n\t\t \t\t\t-\n\n\n\n\n\n1\n...
1,Ik vind die groenige bloemknopjes vies\n
2,En als je er een tussen duim en wijsvinger ka...
3,Met een stokje por ik in het mollige achterli...
4,"Er loopt een zwarte streep over zijn rug, hij..."


In [5]:
# Save data frame as .csv file for intermediate manual verification steps
df.to_csv('data/DAIO_Deel_II_sentences.csv', encoding='utf8', sep=';')

## 2. Text preprocessing and tokenization

### 2.1. Load the required libraries and packages

In [6]:
# Import libraries and packages
import re # for regular expressions
import gensim # for topic modeling
from gensim.parsing.preprocessing import STOPWORDS # package for stop word processing

import nltk # natural language toolkit
from nltk.tag import PerceptronTagger # part-of-speech tagging package
from nltk.corpus import alpino as alp # dutch corpus for PerceptronTagger
nltk.download('alpino') # download dutch corpus
from nltk.stem import WordNetLemmatizer # for lemmatization
from nltk.corpus import stopwords # package for stop word processing
from nltk.stem.snowball import DutchStemmer # for stemming

import numpy as np # for working with numerical data

import codecs # codec module

import simplemma # for lemmatization

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\isabe\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\isabe\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### 2.2. Create stop word list
A stop word list contains words wich are unnecessary/responsible for negative influence on the generation of topics.

In [8]:
# Open stopword list for Dutch stopwords
stopword_file = open('data/stopwords_nl.txt', 'r')

In [9]:
# Save stop word file as variable
stopwords = stopword_file.read()

In [10]:
# Print stopword list
print(stopwords)

aan,
aangaande,
aangezien,
achte,
achter,
achterna,
af,
afgelopen,
al,
aldaar,
aldus,
alhoewel,
alias,
alle,
allebei,
alleen,
alles,
als,
alsnog,
altijd,
altoos,
ander,
andere,
anders,
anderszins,
beetje,
behalve,
behoudens,
beide,
beiden,
ben,
beneden,
bent,
bepaald,
betreffende,
bij,
bijna,
bijv,
binnen,
binnenin,
blijkbaar,
blijken,
boven,
bovenal,
bovendien,
bovengenoemd,
bovenstaand,
bovenvermeld,
buiten,
bv,
daar,
daardoor,
daarheen,
daarin,
daarna,
daarnet,
daarom,
daarop,
daaruit,
daarvanlangs,
dan,
dat,
de,
deden,
deed,
der,
derde,
derhalve,
dertig,
deze,
dhr,
die,
dikwijls,
dit,
doch,
doe,
doen,
doet,
door,
doorgaand,
drie,
duizend,
dus,
echter,
een,
eens,
eer,
eerdat,
eerder,
eerlang,
eerst,
eerste,
eigen,
eigenlijk,
elk,
elke,
en,
enig,
enige,
enigszins,
enkel,
er,
erdoor,
erg,
ergens,
etc,
etcetera,
even,
eveneens,
evenwel,
gauw,
ge,
gedurende,
geen,
gehad,
gekund,
geleden,
gelijk,
gemoeten,
gemogen,
genoeg,
geweest,
gewoon,
gewoonweg,
haar,
haarzelf,
had,
hadden,
hare,
he

In [11]:
# Split stop word string into a list
stop_words = stopwords.split(',\n')

In [12]:
# Close stopword file
stopword_file.close()

In [13]:
# Display list with stopwords for words wich are unnecessary/responsible for negative influence on the generation of topics
stop_words

['aan',
 'aangaande',
 'aangezien',
 'achte',
 'achter',
 'achterna',
 'af',
 'afgelopen',
 'al',
 'aldaar',
 'aldus',
 'alhoewel',
 'alias',
 'alle',
 'allebei',
 'alleen',
 'alles',
 'als',
 'alsnog',
 'altijd',
 'altoos',
 'ander',
 'andere',
 'anders',
 'anderszins',
 'beetje',
 'behalve',
 'behoudens',
 'beide',
 'beiden',
 'ben',
 'beneden',
 'bent',
 'bepaald',
 'betreffende',
 'bij',
 'bijna',
 'bijv',
 'binnen',
 'binnenin',
 'blijkbaar',
 'blijken',
 'boven',
 'bovenal',
 'bovendien',
 'bovengenoemd',
 'bovenstaand',
 'bovenvermeld',
 'buiten',
 'bv',
 'daar',
 'daardoor',
 'daarheen',
 'daarin',
 'daarna',
 'daarnet',
 'daarom',
 'daarop',
 'daaruit',
 'daarvanlangs',
 'dan',
 'dat',
 'de',
 'deden',
 'deed',
 'der',
 'derde',
 'derhalve',
 'dertig',
 'deze',
 'dhr',
 'die',
 'dikwijls',
 'dit',
 'doch',
 'doe',
 'doen',
 'doet',
 'door',
 'doorgaand',
 'drie',
 'duizend',
 'dus',
 'echter',
 'een',
 'eens',
 'eer',
 'eerdat',
 'eerder',
 'eerlang',
 'eerst',
 'eerste',
 'ei

### 2.2. Stop word removing, lemmatization and tokenization processes

In [15]:
# Loads the fitting lemmatization data for the dutch language
langdata = simplemma.load_data('nl')

In [16]:
# Creates a function to lemmatize a token
def lemmatize(token):
    return simplemma.lemmatize(token, langdata)

In [17]:
# Creates a function to remove stopwords, tokenize and lemmatize the given text
def tokenize(text):
    """Returns tokenized representation of words in lemma form excluding stopwords"""
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in stop_words \
                and len(token) > 2:  # drops words with less than 3 characters
            result.append(lemmatize(token))
    return result

### 2.3. Part-of-speech tagging

In [19]:
# Train the PoS tagger on the dutch "alp" corpus
training_corpus = list(alp.tagged_sents()) 
tagger = PerceptronTagger(load=True)
tagger.train(training_corpus)

In [20]:
# Display classes from the tagger function for possible exclusion of word groups
sorted(tagger.classes)

['#',
 '$',
 "''",
 '(',
 ')',
 ',',
 '.',
 ':',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 '``',
 'adj',
 'adv',
 'comp',
 'comparative',
 'det',
 'fixed',
 'name',
 'noun',
 'num',
 'part',
 'pp',
 'prep',
 'pron',
 'punct',
 'tag',
 'verb',
 'vg']

### 2.4. Merging of the created processes

In [None]:
# Variable 'punctuation' with special characters
punctuation = '!”$%&\’()*+,-./:;<=>?[\\]^_`{|}~•@'

In [21]:
# Combines the previously functions into one process
# Apply the following: lowercase, strip punctuation, extra spaces or numbers in text

def preprocess_text(text):
    text = text.lower()  # lower case
    text = re.sub('[' + punctuation + ']+', ' ', text)  # strip punctuation
    text = re.sub('\s+', ' ', text)  # remove double spacing
    text = re.sub('([0-9]+)', '', text)  # remove numbers
    text_token_list = tokenize(text)  # apply lemmatization and tokenization
    pos_comment = tagger.tag(text_token_list) #apply PoS tagging
    text_pos=[word[0] for word in pos_comment if word[1] in ['noun', 'adj']] # remove every class but nouns and adjectives
    text = ' '.join([w for w in text_pos]) # add remaining words
    return text

In [22]:
# Function to read in, clean, preprocess and tokenize text in dataframe column and save tokens in a new column
def tokenize_text(df):
    df['tokens'] = df.text.apply(preprocess_text) # creates new column and saves the functions output into
    num_texts = len(df)
    print('Complete. Number of text chunks that have been cleaned and tokenized: {}'.format(num_texts))
    return df

In [23]:
# Remove string control characters [\\t, \\n, \\r, \t, \n, \r] from data frame strings
df = df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=False)
df.head()

Unnamed: 0,text
0,DEEL II -1Van dichtbij lijken de wratten op de...
1,Ik vind die groenige bloemknopjes vies
2,En als je er een tussen duim en wijsvinger ka...
3,Met een stokje por ik in het mollige achterli...
4,"Er loopt een zwarte streep over zijn rug, hij..."


### 2.5. Application of all functions on the data frame

In [24]:
# Apply tokenize_text on data frame and display cleaned and tokenized number of text chunks
texts_df  = tokenize_text(df)

Complete. Number of text chunks that have been cleaned and tokenized: 2576


In [25]:
# .head() outputs the first 5 rows of the created data frame called "texts_df"
texts_df.head()

Unnamed: 0,text,tokens
0,DEEL II -1Van dichtbij lijken de wratten op de...,wrat pad kappertje
1,Ik vind die groenige bloemknopjes vies,groenig bloemknopjes vies
2,En als je er een tussen duim en wijsvinger ka...,wijsvinger kapotknijpt gifklier pad zurig goed
3,Met een stokje por ik in het mollige achterli...,stok porren mollig achterlijf
4,"Er loopt een zwarte streep over zijn rug, hij...",zwarte strepen rug


In [27]:
# Save new dataframe as .csv file
texts_df.to_csv('data/DAIO_Deel_II_preprocessed_df_all_noun_adj.csv', index=False, header=True, sep=';', encoding='utf-8')

## 3. Topic modeling

In [28]:
# Import libraries and packages
import spacy # NLP library

import gensim.corpora as corpora # for creating a corpora
from gensim.models import CoherenceModel # for creating the coherence model 

from pprint import pprint # provides capability to "pretty-print" data structures

import pyLDAvis.gensim_models # pyLDAvis gensim models
import pickle # for serializing and de-serializing a object structure
import pyLDAvis # data visualization tool

### 3.1. Preprocess tokens

In [29]:
# Assign new variable to data frame
preprocessed_texts = texts_df

In [30]:
# .head() outputs the first 5 rows of the created data frame called "preprocessed_texts"
preprocessed_texts.head()

Unnamed: 0,text,tokens
0,DEEL II -1Van dichtbij lijken de wratten op de...,wrat pad kappertje
1,Ik vind die groenige bloemknopjes vies,groenig bloemknopjes vies
2,En als je er een tussen duim en wijsvinger ka...,wijsvinger kapotknijpt gifklier pad zurig goed
3,Met een stokje por ik in het mollige achterli...,stok porren mollig achterlijf
4,"Er loopt een zwarte streep over zijn rug, hij...",zwarte strepen rug


In [31]:
# Display info of preprocessed df
preprocessed_texts.info

<bound method DataFrame.info of                                                    text  \
0     DEEL II -1Van dichtbij lijken de wratten op de...   
1                Ik vind die groenige bloemknopjes vies   
2      En als je er een tussen duim en wijsvinger ka...   
3      Met een stokje por ik in het mollige achterli...   
4      Er loopt een zwarte streep over zijn rug, hij...   
...                                                 ...   
2571   En heel even hoop ik dat vader naar me toe ko...   
2572  Als ik buitenkom zie ik Obbe zich van zijn weg...   
2573   Hij gooit hem in het protestvuur, dat van oud...   
2574   Konden we onze lichamen maar zo van onszelf o...   
2575                                                      

                                                 tokens  
0                                    wrat pad kappertje  
1                             groenig bloemknopjes vies  
2        wijsvinger kapotknijpt gifklier pad zurig goed  
3                         s

In [32]:
# Create a object which contains the tokens of the preprocessed_texts data frame
texts = preprocessed_texts['tokens']

In [33]:
# .head() outputs the first 5 rows of the object called "texts"
texts.head()

0                                wrat pad kappertje
1                         groenig bloemknopjes vies
2    wijsvinger kapotknijpt gifklier pad zurig goed
3                     stok porren mollig achterlijf
4                                zwarte strepen rug
Name: tokens, dtype: object

In [34]:
# Save texts object values to list
data = texts.values.tolist()

In [35]:
# Create function to convert data into a list of tokens
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence)))

In [36]:
# Create list of lists
data_words = list(sent_to_words(data))

In [37]:
# Output data_words
data_words

[['wrat', 'pad', 'kappertje'],
 ['groenig', 'bloemknopjes', 'vies'],
 ['wijsvinger', 'kapotknijpt', 'gifklier', 'pad', 'zurig', 'goed'],
 ['stok', 'porren', 'mollig', 'achterlijf'],
 ['zwarte', 'strepen', 'rug'],
 ['hard',
  'ruwen',
  'huiden',
  'stok',
  'plooi',
  'buik',
  'warm',
  'asfalt',
  'opwarmen',
  'zonnestraal',
  'voorjaar',
  'slijmerig',
  'beestje',
  'heerlijk'],
 [],
 ['lampion', 'uitdelen', 'gereformeerde', 'kerk'],
 [],
 ['gods', 'woord', 'lamp', 'levenspad', 'dominee', 'renkema', 'kind'],
 ['achten', 'uur', 'kaars', 'helft'],
 ['woord', 'straal'],
 ['lampion', 'pad', 'zwemvlies'],
 ['geboren',
  'reiger',
  'eraf',
  'snoepen',
  'vader',
  'raar',
  'benen',
  'erven',
  'slepen',
  'zwaar',
  'zandslurf',
  'kuilgrasberg'],
 ['limonade', 'milky'],
 ['idee', 'milky', 'plek', 'buurten', 'buik'],
 ['limonade',
  'niesen',
  'tuffen',
  'milky',
  'moutnoga',
  'witten',
  'slaan',
  'gezicht',
  'ziek'],
 [],
 ['milky'],
 ['strepen', 'rug', 'fluisteren', 'pad'],

### 3.2. Dictionary and corpus for the LDA Model

In [39]:
# Creates dictionary which contains the number of times a word appears in "data_words"
id2word = corpora.Dictionary(data_words)

count = 0
for k, v in id2word.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 kappertje
1 pad
2 wrat
3 bloemknopjes
4 groenig
5 vies
6 gifklier
7 goed
8 kapotknijpt
9 wijsvinger
10 zurig


In [40]:
# Filter out tokens that appear in less than 5 documents, more than 0.5 documents, and only the top 10000 tokens
id2word.filter_extremes(no_below=5, no_above=0.5, keep_n=10000)

In [41]:
# Create corpus
texts = data_words

In [42]:
# Convert document into bag-of-words format
corpus = [id2word.doc2bow(text) for text in texts]

### 3.3. Create LDA model

In [43]:
# Variable for number of topics
num_topics = 5

In [44]:
# Build LDA model with pre defined corpus and dictionary
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)# Print the Keyword in the topics



In [45]:
# Print the most significant topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.053*"hanna" + 0.042*"moeder" + 0.021*"goed" + 0.021*"vraag" + '
  '0.019*"lang" + 0.016*"obbe" + 0.016*"nieuw" + 0.015*"stuk" + 0.013*"mooi" + '
  '0.013*"juffrouw"'),
 (1,
  '0.028*"groot" + 0.023*"moeder" + 0.022*"goed" + 0.019*"bed" + 0.018*"vader" '
  '+ 0.018*"mond" + 0.016*"vaak" + 0.015*"obbe" + 0.013*"één" + 0.013*"buik"'),
 (2,
  '0.040*"vader" + 0.031*"hoofd" + 0.024*"matthies" + 0.022*"obbe" + '
  '0.020*"moeder" + 0.019*"godin" + 0.015*"doden" + 0.015*"groene" + '
  '0.014*"naasten" + 0.011*"hard"'),
 (3,
  '0.097*"moeder" + 0.094*"vader" + 0.022*"goed" + 0.020*"dag" + 0.017*"lang" '
  '+ 0.015*"mens" + 0.015*"jassen" + 0.011*"snel" + 0.011*"zwarte" + '
  '0.011*"pad"'),
 (4,
  '0.041*"elkaar" + 0.030*"koe" + 0.025*"vader" + 0.023*"bang" + 0.021*"obbe" '
  '+ 0.013*"belle" + 0.012*"pad" + 0.011*"open" + 0.011*"hoofd" + '
  '0.011*"groot"')]


### 3.4. Evaluate model

In [47]:
# Compute Coherence Score (higher is better)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', round(coherence_lda, 2))

Coherence Score:  0.51


### 3.5. Visualize LDA model

In [49]:
# Enable display of prepared model data
pyLDAvis.enable_notebook()

# Sets the file path
LDAvis_data_filepath = os.path.join('results/ldavis_prepared_Deel_II_'+str(num_topics))

In [50]:
# Save the computed LDA data to file path
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [51]:
# Load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

In [None]:
# Save final topic model as html file
pyLDAvis.save_html(LDAvis_prepared, 'results/ldavis_prepared_Deel_II_'+ str(num_topics) +'.html')

In [52]:
# Display the final topic model in a dashboard
LDAvis_prepared