In [None]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from gensim import corpora
from gensim import models
from gensim.models import CoherenceModel
from langid import set_languages, classify
set_languages(['nl', 'en'])

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from bokeh.plotting import figure, show, output_notebook
from bokeh.io import push_notebook
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Location of the JSON file that is loaded into `df_grote_word_doc`.
# NOTE(review): this is an absolute path on one specific machine — adjust before running elsewhere.
GROTE_WORD_JSON_PATH = r'C:\Users\johan\Documents\GitHub\sandbox\NLP\data\NLP for grote word-filev2.json'
df_grote_word_doc = pd.read_json(GROTE_WORD_JSON_PATH)

# stopwords

In [None]:
from nltk.corpus import stopwords

# Base list: NLTK's Dutch and English stop words concatenated.
stop_words = stopwords.words('dutch') + stopwords.words('english')

# Additional words that are treated as stop words on top of the NLTK lists.
# NOTE(review): 'zoal,' carries a comma inside the string and 'stefaf' looks like a
# typo — both are kept unchanged here; confirm the intended spelling.
more_words = [
    'vooral', 'gaan', 'één', 'value', 'part', 'use', 'blijven', 'waarbij',
    'stuk', 'wanneer', 'much', 'kennen', 'always', 'tegelijk', 'however', 'geven',
    'nooit', 'weg', 'vaak', 'soort', 'wellicht', 'leggen', 'steken', 'leven',
    'zoal,', 'waar', 'allemaal', 'net', 'eigen', 'stefaf', 'vallen', 'zaak',
    'feit', 'waaruit', 'zelfs', 'year', 'echter', 'zien', 'come', 'willen',
    'spreken', 'straf', 'lijken', 'staan', 'even', 'hoog', 'pas', 'liggen',
    'waarom', 'helemaal', 'situatie', 'waaraan', 'zitten', 'take', 'waarin', 'often',
    'wel', 'maken', 'nieuw', 'waarop', 'plots', 'say', 'goed', 'way',
    'terug', 'mogelijk', 'many', 'daarom', 'omwille', 'leren', 'nemen', 'kijken',
    'waarde', 'gebruiken', 'iphone', 'eerder', 'weer', 'zoeken', 'dienen', 'alleen',
    'houden', 'see', 'well', 'good', 'deel', 'find', 'misschien', 'make',
    'vinden', 'also', 'manier', 'natuurlijk', 'laten', 'louter', 'komen', 'stellen',
    'ergens', 'live', 'ver', 'daarentegen', 'facebook', 'steeds', 'time', 'need',
    'enkel', 'new', 'nodig', 'vormen', 'halen', 'duidelijk', 'zeggen', 'camera',
    'krijgen', 'brengen', 'eigenlijk', 'proberen', 'gewoon', 'heel', 'zeer', 'telkens',
    'look', 'eerst', 'belangrijk', 'nochtans', 'waarmee', 'lang', 'zeker',
]

# Merge both lists and drop duplicates. Sorting makes the resulting order the same
# on every kernel start (plain set iteration order of strings is not stable
# between Python processes).
stop_words = sorted(set(stop_words + more_words))
len(stop_words)

In [None]:
def stopwords_preprocess(stop_words):
    """Tokenise every stop word with gensim's ``simple_preprocess``.

    Each entry is stripped of surrounding whitespace, cast to ``str`` and passed
    to ``gensim.utils.simple_preprocess`` with ``deacc=True``. Entries for which
    the tokeniser returns nothing are skipped.

    Args:
        stop_words: iterable of stop-word strings.

    Yields:
        The non-empty token list produced for one stop word.
    """
    tokenised = (
        gensim.utils.simple_preprocess(str(word.strip()), deacc=True)
        for word in stop_words
    )
    for tokens in tokenised:
        if not tokens:
            continue
        yield tokens
            
# Add 'zoals' and run every stop word through `stopwords_preprocess`, re-joining
# the tokens of each entry with a single space.
# A new list is built instead of using `stop_words += [...]` so that re-running
# this cell does not append another 'zoals'. The set drops duplicates that the
# normalisation can create (different spellings collapsing to the same token);
# sorting keeps the order stable between runs.
stop_words = sorted({
    ' '.join(tokens)
    for tokens in stopwords_preprocess(stop_words + ['zoals'])
})

# Sanity check: the Dutch word 'en' should be part of the final list.
'en' in stop_words

# preprocessing

In [None]:
# Contents of the `text` column as a plain Python list, one entry per row.
data = df_grote_word_doc["text"].values.tolist()

def sent_to_words(texts):
    """Yield each text as a space-joined string of its ``simple_preprocess`` tokens.

    Every item is cast to ``str`` and tokenised with
    ``gensim.utils.simple_preprocess(..., deacc=True, min_len=3)``; the tokens are
    then joined with a single space. Exactly one string is yielded per input
    item, which is the empty string when the tokeniser returns no tokens.

    Args:
        texts: iterable of raw documents.

    Yields:
        One cleaned, space-joined string per input item.
    """
    tokenise = gensim.utils.simple_preprocess
    yield from (
        ' '.join(tokenise(str(raw), deacc=True, min_len=3))
        for raw in texts
    )

# Materialise the generator so the cleaned documents can be counted and reused.
data_words = [*sent_to_words(data)]
len(data_words)

> The vector approach needs only minimal preprocessing.

In [None]:
# nl only
def sent_to_words_nl(texts):
    """Yield only the texts that ``classify`` labels as Dutch.

    Each text is passed unchanged to ``classify`` and yielded as-is when the
    returned language code is ``'nl'``. Despite the name, whole texts are
    yielded, not individual words.

    Items that are not strings (e.g. ``None`` or NaN) and empty or
    whitespace-only strings are skipped without being classified, because they
    contain nothing a language could be detected from.

    Args:
        texts: iterable of raw document texts.

    Yields:
        The original text of every document classified as ``'nl'``.
    """
    for text in texts:
        # Guard against missing values and blank documents before classifying.
        if not isinstance(text, str) or not text.strip():
            continue
        lang, _ = classify(text)
        if lang == 'nl':
            yield text
            
# Raw texts from the `text` column, then only those detected as Dutch.
data = df_grote_word_doc["text"].values.tolist()

data_words_nl = [*sent_to_words_nl(data)]

# document vectors

In [None]:
# !python -m spacy download nl_core_news_md

In [None]:
# Name of the spaCy pipeline to load; it has to be installed in the environment first.
SPACY_MODEL_NAME = 'nl_core_news_md'
nlp = spacy.load(SPACY_MODEL_NAME)

In [None]:
# Parse every Dutch document with spaCy. `nlp.pipe` streams the texts through the
# pipeline in batches and yields one Doc per text, in input order — the same
# result as calling `nlp(text)` in a loop, without the per-call overhead.
spacy_list = list(nlp.pipe(data_words_nl))

In [None]:
# One vector per parsed document, read from each Doc's `.vector` attribute.
doc_vectors = [parsed.vector for parsed in spacy_list]

In [None]:
# Text of every parsed document, in the order of `spacy_list`.
sentences = [parsed.text for parsed in spacy_list]

In [None]:
# Number of clusters requested from KMeans.
true_k = 35

# `random_state` fixes the centroid initialisation so that the cluster labels are
# reproducible; without it each run can produce a different clustering.
model = KMeans(
    n_clusters=true_k,
    init="k-means++",
    max_iter=300,
    n_init='auto',
    random_state=20,
)

model.fit(doc_vectors)

```py
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
```

The direct interpretation of these feature indices might not be straightforward for `doc_vectors`.

The same applies to the feature names from tf-idf.

In [None]:
# Cluster assignments stored on the fitted KMeans model. `np.asarray` returns an
# existing ndarray unchanged; it makes explicit that `labels` is used as an array.
labels = np.asarray(model.labels_)

In [None]:
# Settings for the t-SNE projection of the document vectors onto two dimensions.
tsne_params = dict(
    n_components=2,
    random_state=20,
    learning_rate='auto',  # 10 is small and 200 is large (but fast)
    angle=.99,
    init='pca',  # alternative that was tried: init="random"
    perplexity=40,
    early_exaggeration=70,
)
tsne_model = TSNE(**tsne_params)

low_dim_data = tsne_model.fit_transform(np.array(doc_vectors))

In [None]:
from bokeh.models import ColumnDataSource, HoverTool, TapTool, CustomJS, Div
from bokeh.layouts import column

output_notebook()

# Qualitative palette used to colour the clusters. Tableau (10) + Pastel1 (9) +
# Set1 (9) give only 28 colours, fewer than the 35 clusters, which makes several
# clusters share a colour. Dark2 (8) and Set2 (8) are appended so that every
# cluster gets its own colour.
mycolors = np.array(
    list(mcolors.TABLEAU_COLORS.values())
    + [mcolors.to_hex(c) for c in plt.cm.Pastel1.colors]
    + [mcolors.to_hex(c) for c in plt.cm.Set1.colors]
    + [mcolors.to_hex(c) for c in plt.cm.Dark2.colors]
    + [mcolors.to_hex(c) for c in plt.cm.Set2.colors]
)
if true_k > len(mycolors):
    # The modulo below still works, but colours are then reused across clusters.
    warnings.warn(
        f"{true_k} clusters but only {len(mycolors)} colours: some clusters will share a colour"
    )

# One row per document: 2-D t-SNE position, cluster colour, a 250-character
# preview for the hover tooltip and the full text for the panel below the plot.
source = ColumnDataSource(data=dict(
    x=low_dim_data[:, 0],
    y=low_dim_data[:, 1],
    colors=mycolors[labels % len(mycolors)],
    onderwerp=[s[:250] for s in sentences],
    description=sentences,
    # keywords = keyword_list
))

hover = HoverTool()
hover.tooltips = [("onderwerp", "@onderwerp"),
                #   ("keywords", "@keywords")
                  ]

# `render_as_text=True` shows the document verbatim; by default a Div interprets
# its text as HTML, so characters such as '<' or '&' in a document would be
# treated as markup.
description_div = Div(text="", width=1200, height=400, render_as_text=True)

# On tap, copy the full text of the first selected point into the Div.
callback = CustomJS(args=dict(source=source, div=description_div), code="""
    const indices = source.selected.indices;
    if (indices.length == 0)
        return;
    const desc = source.data['description'][indices[0]];
    div.text = desc;
""")
tap_tool = TapTool(callback=callback)

plot = figure(tools="wheel_zoom, reset", title=f"t-SNE Clustering of {true_k} KMeans Topics", width=1200, height=800)
plot.add_tools(hover)
plot.add_tools(tap_tool)

plot.scatter('x', 'y', source=source, color='colors', size=8)

layout = column(plot, description_div)

show(layout)

# Similarity scores

Experimenting with similarity scores in spaCy.

In [None]:
# Compare the vectors of two single-word documents parsed by the loaded pipeline.
first_doc, second_doc = nlp("dom"), nlp("stupid")
first_doc.similarity(second_doc)

In [None]:
# Parse every non-empty entry of `data`. `nlp.pipe` processes the texts in
# batches and yields one Doc per text in input order — the same result as calling
# `nlp(d)` per document, without the per-call overhead.
nlp_data = list(nlp.pipe(d for d in data if d))

In [None]:
from itertools import combinations

# Score every unordered pair of parsed documents and order the
# (doc, doc, score) triples from most to least similar.
# The number of pairs grows quadratically with the number of documents.
similarities = sorted(
    (
        (doc_a, doc_b, doc_a.similarity(doc_b))
        for doc_a, doc_b in combinations(nlp_data, 2)
    ),
    key=lambda triple: triple[2],
    reverse=True,
)

In [None]:
# Keep the first 50 pairs whose score is below 0.98, in the order of
# `similarities` (sorted from highest to lowest score).
# NOTE(review): the 0.98 cut-off presumably drops (near-)identical documents — confirm.
# The loop stops as soon as 50 pairs are collected instead of first building the
# complete filtered list, which can be very large.
SIMILARITY_CUTOFF = .98
MAX_PAIRS = 50

similarities_v2 = []
for sim, sam, score in similarities:
    if len(similarities_v2) >= MAX_PAIRS:
        break
    if score < SIMILARITY_CUTOFF:
        similarities_v2.append((sim, sam, score))

In [None]:
# Display entries 5 up to (not including) 15 of the filtered pairs.
preview_window = slice(5, 15)
similarities_v2[preview_window]

In [None]:
import sys

# Size in bytes of the `similarities_v2` list object itself.
# NOTE(review): sys.getsizeof is shallow — it does not count the tuples or the
# spaCy Docs the list refers to, so this understates the real memory use.
sys.getsizeof(similarities_v2) # for scale: 1 GB = 1 * 10^9 bytes