# NLP for `de grote word file`

This is the start of a series in which I try to visualise my wordfile in clusters of meaning.

This notebook focuses on topic modelling with LDA (Latent Dirichlet Allocation).

In [None]:
import re
import numpy as np
import pandas as pd
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

# Import the docx as a list of dicts

In [None]:
from docx import Document

# Every finished chunk is a dict {'heading': <Heading 1 text>, 'text': <body text>}.
output = []

# Chunk currently being filled; stays an empty dict until the first heading is seen.
chunk = {}

current_heading = None
doc = Document("D:/OneDrive/_Schrijverij/Boek/die ene grote word-file.docx")
for paragraph in doc.paragraphs:
    if paragraph.style.name == 'Heading 1':
        # A new heading closes the running chunk. Only chunks that actually
        # collected text are stored: a bare `if chunk:` is always true once the
        # dict has keys, which used to push empty-text entries into `output`.
        if chunk.get('text'):
            output.append(chunk)

        # Start a new chunk under the new heading
        current_heading = paragraph.text
        chunk = {'heading': current_heading, 'text': ""}

    elif current_heading is not None:
        # Paragraphs before the first heading are ignored (current_heading is None).
        paragraph_text = paragraph.text.strip()
        if paragraph_text:
            chunk['text'] += paragraph_text + "\n"
        elif chunk['text']:
            # A blank paragraph ends the running chunk; the next chunk keeps
            # the same heading. Consecutive blank paragraphs add nothing.
            output.append(chunk)
            chunk = {'heading': current_heading, 'text': ""}

# Append the last chunk (if it holds any text)
if chunk.get('text'):
    output.append(chunk)

# print(output)

# Turn it into a df

In [None]:
# Build a DataFrame from the parsed chunks; each dict in `output` becomes one row.
df_grote_word_doc = pd.DataFrame(output)

# Persist the raw chunks as JSON. The relative path resolves against the
# current working directory of the notebook.
df_grote_word_doc.to_json('NLP for grote word-file.json')

In [None]:
# Reload the chunks from the JSON copy stored in the repository's data folder.
# NOTE(review): this absolute path is not the relative path the file was written
# to in the previous cell — confirm the file is copied there before running.
df_grote_word_doc = pd.read_json(r'C:\Users\johan\Documents\GitHub\sandbox\NLP\data\NLP for grote word-file.json')
# Bare expression: displays the DataFrame as the cell output.
df_grote_word_doc

In [None]:
# Drop the rows whose text column is an empty string
df_grote_word_doc = df_grote_word_doc[df_grote_word_doc['text'] != '']

# Drop rows with identical text, keeping the last occurrence of each
# (author's note: 22 duplicates in this data set)
df_grote_word_doc = df_grote_word_doc.drop_duplicates(subset='text', keep='last')
# Summary statistics of the cleaned frame, shown as the cell output
df_grote_word_doc.describe()

In [None]:
# Save the cleaned frame as a second JSON version (path relative to the working directory).
df_grote_word_doc.to_json('NLP for grote word-filev2.json')

In [None]:
# Reload the cleaned (v2) data from the repository's data folder.
# NOTE(review): absolute path, unlike the relative path used when saving — confirm the file location.
df_grote_word_doc = pd.read_json(r'C:\Users\johan\Documents\GitHub\sandbox\NLP\data\NLP for grote word-filev2.json')
# Show the first rows as the cell output
df_grote_word_doc.head()

## Turn it into a list

In [None]:
def sent_to_words(texts):
    """Lazily tokenize each text in *texts*.

    Every item is converted with ``str()`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True`` and ``min_len=3``.
    Yields one list of tokens per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_text), deacc=True, min_len=3)
        for raw_text in texts
    )

# Convert the text column to a plain Python list
data = df_grote_word_doc.text.values.tolist()
# Tokenize every text; list() consumes the generator returned by sent_to_words
data_words = list(sent_to_words(data))
# data_words

## Identify stopwords

In [None]:
from nltk.corpus import stopwords

# Start from NLTK's Dutch and English stopword lists.
stop_words = stopwords.words('dutch') + stopwords.words('english') 
# Hand-picked extra words that turned out to be uninformative for the topics.
# The entry 'zoal' used to be written 'zoal,' (comma inside the string), which
# could never match a tokenized word.
# NOTE(review): 'één' only matches tokens that were not de-accented; confirm it is still needed.
more_words = ['vooral', 'gaan', 'één', 'value', 'part', 'use', 'blijven', 'waarbij', 'stuk', 'wanneer', 'much', 'kennen', 'always', 'tegelijk', 'however', 'geven', 'nooit', 'weg', 'vaak', 'soort', 'wellicht', 'leggen', 'steken', 'leven', 'zoal', 'waar', 'allemaal', 'net', 'eigen', 'stefaf', 'vallen', 'zaak', 'feit', 'waaruit', 'zelfs', 'year', 'echter', 'zien', 'come', 'willen', 'spreken', 'straf', 'lijken', 'staan', 'even', 'hoog', 'pas', 'liggen', 'waarom', 'helemaal', 'situatie', 'waaraan', 'zitten', 'take', 'waarin', 'often', 'wel', 'maken', 'nieuw', 'waarop', 'plots', 'say', 'goed', 'way', 'terug', 'mogelijk', 'many', 'daarom', 'omwille', 'leren', 'nemen', 'kijken', 'waarde', 'gebruiken', 'iphone', 'eerder', 'weer', 'zoeken', 'dienen', 'alleen', 'houden', 'see', 'well', 'good', 'deel', 'find', 'misschien', 'make', 'vinden', 'also', 'manier', 'natuurlijk', 'laten', 'louter', 'komen', 'stellen', 'ergens', 'live', 'ver', 'daarentegen', 'facebook', 'steeds', 'time', 'need', 'enkel', 'new', 'nodig', 'vormen', 'halen', 'duidelijk', 'zeggen', 'camera', 'krijgen', 'brengen', 'eigenlijk', 'proberen', 'gewoon', 'heel', 'zeer', 'telkens', 'look', 'eerst', 'belangrijk', 'nochtans', 'waarmee', 'lang', 'zeker']
# De-duplicate the extra words, then merge everything into one set for O(1) lookups.
more_words = list(set(more_words))
stop_words = set(stop_words + more_words)
# Number of distinct stopwords, shown as the cell output
len(stop_words)

In [None]:
import logging
# Configure root logging at ERROR level so lower-severity messages are suppressed.
logging.basicConfig(level=logging.ERROR)

# N-gram and lemmatizer

In [None]:
from langid import set_languages, classify

# Restrict langid's language identification to Dutch ('nl') and English ('en').
set_languages(['nl', 'en'])

In [None]:
# check language, lemmatize, remove stopwords
def process_lang(mails, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Detect the language per three-word chunk, lemmatize, and drop stopwords.

    NOTE: a later cell in this notebook defines a Dutch-only function with the
    same name; whichever cell ran last is the one in effect.

    Args:
        mails: iterable of documents (token lists or strings); each one is
            re-tokenized with ``simple_preprocess(str(doc))``.
        allowed_postags: spaCy coarse POS tags to keep after lemmatization.

    Returns:
        A list with one list of lemmas per input document, with the
        module-level ``stop_words`` removed before and after lemmatization.
    """
    mails = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in mails]

    nlp_en = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
    nlp_nl = spacy.load("nl_core_news_sm", disable=['parser', 'ner'])

    texts_out = []

    for mail in mails:
        mail_text = []
        # Classify the language on chunks of three words. Stepping over the
        # whole document (the last chunk may hold one or two words) keeps the
        # trailing tokens that a `len(mail) - 2` bound would silently drop.
        for i in range(0, len(mail), 3):
            term = " ".join(mail[i:i + 3])
            lang, _ = classify(term)
            # Anything not classified as Dutch goes through the English pipeline.
            nlp = nlp_nl if lang == 'nl' else nlp_en
            mail_text.extend(nlp(term))

        texts_out.append([token.lemma_ for token in mail_text if token.pos_ in allowed_postags])

    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]
    return texts_out

In [None]:
# DUTCH only, lemmatize, remove stopwords
def process_lang(mails, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Keep only Dutch three-word chunks, lemmatize them, and drop stopwords.

    The language is detected per chunk of three words; chunks that are not
    classified as Dutch ('nl') are discarded entirely.

    Args:
        mails: iterable of documents (token lists or strings); each one is
            re-tokenized with ``simple_preprocess(str(doc))``.
        allowed_postags: spaCy coarse POS tags to keep after lemmatization.

    Returns:
        A list with one list of Dutch lemmas per input document, with the
        module-level ``stop_words`` removed before and after lemmatization.
    """
    mails = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in mails]

    nlp_nl = spacy.load("nl_core_news_sm", disable=['parser', 'ner'])

    texts_out = []

    for mail in mails:
        mail_text = []
        # Step over the whole document (the last chunk may hold one or two
        # words) so trailing tokens are not dropped, as they were with a
        # `len(mail) - 2` bound.
        for i in range(0, len(mail), 3):
            term = " ".join(mail[i:i + 3])
            lang, _ = classify(term)
            if lang != 'nl':
                # Non-Dutch chunk: skipped on purpose in this variant.
                continue
            mail_text.extend(nlp_nl(term))

        texts_out.append([token.lemma_ for token in mail_text if token.pos_ in allowed_postags])

    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]
    return texts_out

In [None]:
# Build the bigram and trigram models
# The bigram model is trained on the tokenized texts; the trigram model is
# trained on the bigram-transformed texts, so it can join a detected bigram
# with a neighbouring word.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=30) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=30)  

# Wrap the trained models in Phraser objects, which are used to transform documents below.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def process_ngrams(texts, stop_words=stop_words):
    """Remove stopwords, then form bigrams and trigrams.

    Args:
        texts: iterable of documents (token lists or strings); each one is
            re-tokenized with ``simple_preprocess(str(doc))``.
        stop_words: collection of words to drop; defaults to the module-level
            ``stop_words`` as it was when this function was defined.

    Returns:
        A list with one token list per document, where detected phrases are
        joined into single tokens by ``bigram_mod`` and ``trigram_mod``.
    """
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    # Apply the bigram model once and feed its output to the trigram model.
    # Running the bigram model a second time on already-joined text is redundant.
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [None]:
# this does ALL the preprocessing (slow)
# Note: data_words is rebound to the lemmatized output, so re-running this cell
# processes the already-processed tokens again.
data_words = process_lang(data_words)
# Final model input: lemmatized tokens with bigrams/trigrams joined
data_ready = process_ngrams(data_words)

## Make a dictionary and corpus

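The dictionary assigns an integer id to each unique token (word) that occurs in the lists; ids are handed out document by document, in alphabetical order within each document. The corpus then represents every document as a bag of words of (token id, count) pairs.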

In [None]:
from gensim import corpora
from gensim import models
from gensim.models import CoherenceModel

# Map every unique token in the preprocessed documents to an integer id
id2word  = corpora.Dictionary(data_ready)
# Bag-of-words representation: one list of (token id, count) pairs per document
corpus = [id2word.doc2bow(text) for text in data_ready]

# Print a short summary of the dictionary
print(id2word)

In [None]:
# List the 100 most frequent tokens (the counts are discarded) as the cell output
[i for i,j in id2word.most_common(100)] # check for uninformative words

## Run model

In [None]:
# Train a 4-topic LDA model on the bag-of-words corpus.
# random_state fixes the seed; alpha='auto' and eta='auto' let gensim learn the
# priors from the data instead of using fixed values.
lda_model = models.LdaModel(corpus=corpus,
                            id2word=id2word,
                            num_topics=4, 
                            random_state=75,
                            chunksize=350, # 20 is too low and 800 too high
                            passes=35,
                            iterations=10,
                            eta='auto', 
                            alpha='auto', 
                            eval_every=5,
                            per_word_topics=True
                            )

# Score the trained model with the c_v coherence measure, computed on the tokenized texts
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_ready, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

print('Coherence Score: ', coherence_lda)

In [None]:
# Coherence of each topic individually, shown as the cell output
coherence_model_lda.get_coherence_per_topic()

This helped a lot in choosing a better number of topics.

In [None]:
import matplotlib.pyplot as plt

def compute_coherence_values(start, limit, step):
    """Train one LDA model per candidate topic count and collect its c_v coherence.

    The candidate topic counts are ``range(start, limit, step)``. The function
    reads the module-level ``corpus``, ``id2word`` and ``data_ready``.

    Returns:
        A list of coherence scores, one per candidate topic count, in the
        same order as the range.
    """
    def _coherence_for(topic_count):
        # Fit a multicore LDA model with this many topics and score it.
        candidate = models.LdaMulticore(
            corpus=corpus,
            id2word=id2word,
            num_topics=topic_count,
            chunksize=300,
        )
        scorer = CoherenceModel(
            model=candidate,
            texts=data_ready,
            dictionary=id2word,
            coherence='c_v',
        )
        return scorer.get_coherence()

    return [_coherence_for(topic_count) for topic_count in range(start, limit, step)]

# Candidate topic counts: range(start, limit, step) -> 2, 6, 10, 14, 18, 22
start=2
limit=25
step=4
coherence_values = compute_coherence_values(start, limit, step)
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels must be a sequence of strings. A bare string such as
# ("coherence_values") is iterated character by character, so the legend
# would show only "c".
plt.legend(["coherence_values"], loc='best')
plt.show()

In [None]:
# Print each candidate topic count next to its coherence score (rounded to 4 decimals)
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))