# HEXIS NLP and ML Notebook

This notebook contains all of the experiments that James Zhao performed for the Hexis project in Spring 2021. 
The first cell imports the required libraries and defines the helper functions used for LDA preprocessing and plotting. 

In [None]:
%matplotlib widget
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from gensim import corpora, models
from scipy.spatial import distance
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc


import nltk
import json
import os

# Seed NumPy's global random number generator so runs are repeatable.
np.random.seed(2018)
# Download the NLTK 'wordnet' resource (presumably needed by WordNetLemmatizer below).
nltk.download('wordnet')
# Shared English Snowball stemmer used by lemmatize_stemming().
stemmer = SnowballStemmer("english")

def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the lemma.

    The stemming step uses the module-level Snowball ``stemmer``.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def preprocess(text):
    """Tokenize ``text`` and return its normalized tokens as a list.

    Tokens come from gensim's ``simple_preprocess``. A token is kept only if
    it is longer than three characters and is not a gensim stopword; each
    kept token is passed through ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]

def plot(matrix):
    """Show ``matrix`` (any array-like) as an image on the current figure."""
    image = np.array(matrix)
    plt.imshow(image)
    plt.show()
    
def plot_matrixes(matrixes, labels, cols, rows):
    """Show several matrices as images on a ``rows`` x ``cols`` grid.

    Parameters
    ----------
    matrixes : sequence of 2-D array-likes
        Images to draw, one per grid cell, in row-major order.
    labels : sequence of str
        Title for each matrix; titles are truncated to 15 characters.
    cols, rows : int
        Grid dimensions. When ``cols * rows`` is 1 a single axes is created
        and only ``matrixes[0]`` is drawn.

    Grid cells without a corresponding matrix are left blank instead of
    raising ``IndexError``, and the axis decorations are hidden on every
    panel (not only on the last one).
    """
    if cols * rows > 1:
        fig, axes = plt.subplots(rows, cols, figsize=(5, rows * 4 * 0.4))
        for i, ax in enumerate(axes.flatten()):
            # Hide ticks/frames everywhere, including unused cells.
            ax.axis('off')
            if i >= len(matrixes):
                continue
            ax.imshow(matrixes[i])
            # Slicing is a no-op for labels shorter than 15 characters.
            ax.set_title(labels[i][:15])
        plt.subplots_adjust(wspace=0, hspace=0)
    else:
        fig, ax = plt.subplots()
        ax.imshow(matrixes[0])
        ax.set_title(labels[0][:15])
        ax.axis('off')

    plt.margins(x=0, y=0)
    plt.tight_layout()
    plt.show()

# Data Loading and Model Creation. 

This cell loads data based on a root directory. Classes are assigned according to the subdirectories of the root directory, and this cell loads each file in each subdirectory. The following cell will run the LDA model with the given parameters and print out each respective topic with their respective top-10 words and weights. 

In [None]:
import os

# root directory of dataset
root_dir = "./corpus_05_30"

# leaf directories of the dataset and their class names (innermost sub-directory)
dirs = []
classes = []

# obtain all classes
for w_root, w_dirs, w_files in os.walk(root_dir):
    # Sorting in place fixes the traversal order, which is otherwise
    # filesystem-dependent; class indices then stay stable between runs.
    w_dirs.sort()
    if len(w_dirs) == 0:
        dirs.append(w_root)
        classes.append(os.path.basename(w_root))

# os.walk silently yields nothing for a missing root; fail early with a clear message.
if not dirs:
    raise FileNotFoundError(f"no class directories found under {root_dir!r}")
print(f"classes: {classes}")

# Knob for restricting the run to a subset of the class directories;
# by default every directory is used.
dir_index = range(len(dirs))
dirs = [dirs[i] for i in dir_index]

# raw data of files
files = []
# file names of documents
file_names = []
# labels of each document (name of the containing directory)
labels = []

for directory in dirs:
    path = directory
    # Sorted so that document order (and every downstream split) is reproducible.
    for file in sorted(os.listdir(path)):
        with open(os.path.join(path, file), "r", encoding='utf-8') as infile:
            files.append(infile.read())
            file_names.append(file)
            labels.append(os.path.basename(path))

# apply processing to all docs
processed_docs = [preprocess(i) for i in files]

# token <-> id mapping and bag-of-words representation of every document
dictionary = gensim.corpora.Dictionary(processed_docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Tf-idf dictionaries/corpuses, not used b/c of worse performance
# tfidf = models.TfidfModel(bow_corpus)
# corpus_tfidf = tfidf[bow_corpus]

print(f"length of corpus: {len(processed_docs)}")

In [None]:
# Train the LDA topic model. Despite the variable name, it is fitted on the
# plain bag-of-words corpus (the tf-idf variant is commented out above).
lda_model_tfidf = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=10,
    workers=4,
    minimum_probability=0,
)

# Print every topic together with its formatted top words and weights.
for topic_id, topic_terms in lda_model_tfidf.print_topics(-1):
    print(f'Topic: {topic_id} Word: {topic_terms}')

# Applying LDA model to each document

This cell applies the LDA model to each document, turning each one into an n-length vector, where n is the number of chosen topics. It also identifies the dominant topic of each document, as well as the document that is most representative of each of the n topics. 

In [None]:
# Apply the LDA model to all documents. The transformed corpus is lazy and
# re-runs inference on every iteration, so materialize it once; the dominant
# topics and the dense matrix below then come from the same inference pass.
doc_topics = list(lda_model_tfidf[bow_corpus])
num_topics = lda_model_tfidf.num_topics

# Keyword string of each topic, computed once instead of once per document.
topic_keyword_cache = {
    topic_id: ", ".join(word for word, prop in lda_model_tfidf.show_topic(topic_id))
    for topic_id in range(num_topics)
}

# Most significant topic of each document:
# (document index, topic id, topic proportion, topic keywords).
dominant_topics = []
for i, row_list in enumerate(doc_topics):
    topic_num, prop_topic = max(row_list, key=lambda x: x[1])
    dominant_topics.append((i, topic_num, prop_topic, topic_keyword_cache[topic_num]))

# Dense (n_documents, n_topics) matrix of topic proportions. Filling by topic
# id keeps the columns aligned even if a row omits a very low-probability topic.
results = np.zeros((len(doc_topics), num_topics))
for doc_idx, row_list in enumerate(doc_topics):
    for topic_id, prob in row_list:
        results[doc_idx, topic_id] = prob

# L2-normalized topic vectors
norms = np.linalg.norm(results, axis=1)
normalized = results / norms.reshape(-1, 1)

# Most representative document of each topic: among the documents whose
# dominant topic is i, the one with the HIGHEST proportion of that topic.
# None when no document has topic i as its dominant topic.
most_representative = []
for i in range(num_topics):
    docs = [item for item in dominant_topics if item[1] == i]
    most_representative.append(max(docs, key=lambda x: x[2]) if docs else None)
print(most_representative)

# Visualizations

The following cells provide useful visualizations. The first constructs word clouds of each of the topics, the next one provides 
more specific details of each topic using an LDA topic visualization library, the third provides a visualization of the distribution of 
document lengths, and the fourth provides a visualization of each topic's word weight and respective word frequency. 

In [None]:
# wordclouds
# Only WordCloud is imported: also importing wordcloud's STOPWORDS would
# shadow the gensim STOPWORDS imported in the first cell.
from wordcloud import WordCloud
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]

# One reusable cloud object. color_func reads the loop variable `i` of the
# plotting loop below at generation time, so each topic gets a single color.
cloud = WordCloud(stopwords=gensim.parsing.preprocessing.STOPWORDS,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i % len(cols)],
                  prefer_horizontal=1.0)

# -1 requests every topic of the model, in topic-id order.
topics = lda_model_tfidf.show_topics(num_topics=-1, formatted=False)

# Three clouds per row, with as many rows as the number of topics requires.
n_cols = 3
n_rows = (len(topics) + n_cols - 1) // n_cols
fig, axes = plt.subplots(n_rows, n_cols, figsize=(7, 7),
                         sharex=True, sharey=True, squeeze=False)

for i, ax in enumerate(axes.flatten()):
    # Hide ticks/frames on every panel, including the unused trailing ones.
    ax.axis('off')
    if i >= len(topics):
        continue
    topic_id, topic_terms = topics[i]
    cloud.generate_from_frequencies(dict(topic_terms), max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(topic_id), fontdict=dict(size=16))


plt.subplots_adjust(wspace=0, hspace=0)
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
# Interactive topic explorer built with pyLDAvis from the trained model,
# the bag-of-words corpus and the dictionary.
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

pyLDAvis.enable_notebook()

lda_vis = gensimvis.prepare(lda_model_tfidf, bow_corpus, dictionary)
# Final bare expression so the notebook renders the visualization.
lda_vis



In [None]:
# Number of preprocessed tokens kept for each document.
doc_lens = [len(d) for d in processed_docs]

# Histogram of the document lengths.
fig = plt.figure(figsize=(5,4))
ax = fig.add_subplot(111)
plt.grid()
ax.hist(doc_lens, bins = 100, color='navy')

# Summary statistics written in axes coordinates, stacked bottom-up:
# (vertical position, caption, value).
summary_stats = [
    (0.1, "Mean   : ", np.mean(doc_lens)),
    (0.15, "Median : ", np.median(doc_lens)),
    (0.2, "Stdev   : ", np.std(doc_lens)),
    (0.25, "1%ile    : ", np.quantile(doc_lens, q=0.01)),
    (0.3, "99%ile  : ", np.quantile(doc_lens, q=0.99)),
]
for y_pos, caption, stat_value in summary_stats:
    ax.text(0.8, y_pos, caption + str(round(stat_value)), transform=ax.transAxes)

ax.set_ylabel('Number of Documents')
ax.set_xlabel('Document Word Count')
plt.tick_params(size=16)
plt.title('Distribution of Document Word Counts', fontdict=dict(size=12))
plt.show()

In [None]:
# word counts of topic keywords
# For every topic, plot the corpus-wide count of each of its top words (wide
# bars, left axis) next to the word's weight in the topic (narrow purple bars,
# right axis).
from collections import Counter
# show_topics is called with its defaults; the 5x2 grid and the xticks 0..9
# below assume it yields 10 topics with 10 words each — TODO confirm.
topics = lda_model_tfidf.show_topics(formatted=False)
# Flatten all preprocessed documents into one token list and count tokens.
data_flat = [w for w_list in processed_docs for w in w_list]
counter = Counter(data_flat)

# Rows of [word, topic id, weight in topic, corpus-wide count].
out = []
for i, topic in topics:
    for word, weight in topic:
        out.append([word, i , weight, int(counter[word])])


fig, axes = plt.subplots(5, 2, figsize=(8, 15), sharey=True, dpi=100)
# `mcolors` is not imported in this cell; it comes from the word-cloud cell.
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    # Rows belonging to topic i. The rows mix strings and numbers, so the
    # numeric columns are cast back explicitly with astype() below.
    selected = np.array([item for item in out if item[1] == i])
    xax = np.arange(len(selected))
    counts = selected[:,3].astype(int)
    weights = selected[:,2].astype(float)
    
    ax.bar(x=xax, height=counts, color=cols[i], width=0.6, alpha=0.6, label='Word Count')
    # Second y-axis sharing the same x positions, used for the weights.
    ax_twin = ax.twinx()
    ax_twin.bar(x=xax, height=weights, color='purple', width=0.25, label='Weights')
    ax.set_ylabel('Word Count', color=cols[i])
    ax_twin.set_ylabel('Weights', color='purple')
    ax_twin.set_ylim(0, 0.1);
    ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=16)
    ax.tick_params(axis='y', left=True)
    ax.set_xticks(ticks=np.arange(0, 10, 1))
    ax.set_xticklabels(labels=selected[:,0], rotation=90)
    
#     ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=2)    
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)    
plt.show()


In [None]:
import random

# Number of topic columns, taken from the data instead of hard-coding 10.
n_topics = results.shape[1]

# Seeded, private RNG so the printed sample is the same on every run. The
# sample size is capped so that corpora with fewer than 10 documents do not
# raise ValueError.
sample_rng = random.Random(2018)
sample = sample_rng.sample(list(zip(file_names, results)), min(10, len(results)))
sel = [item[0] for item in sample]
sel_res = [item[1] for item in sample]

# Table of topic proportions for the sampled documents; proportions at or
# below 1e-3 are shown as dashes.
max_len = max([len(title) for title in sel])
print(f"{'Title' : <{max_len}} " + " ".join(f"T{i:<4d}" for i in range(n_topics)))
for fil, res in zip(sel, sel_res):
    cells = " ".join(f"{weight:0.3f}" if weight > 1e-3 else "-----" for weight in res)
    print(f"{fil.ljust(max_len)[:max_len]} " + cells)


# Corpus size and number of documents per class.
print(f"# samples: {len(results)}, # classes: {len(classes)}")
for item in classes:
    print(f"Class: {item} Count: {labels.count(item)}")
    

# stopword_list = list(gensim.parsing.preprocessing.STOPWORDS)
# for i in range(5):
#     print(f"{' '.join(stopword_list[i*5:i*5 + 5])} ")
    
# sentence = "Me and Joe carefully walked to the police station."
# print("Original: " + sentence)
# sentence = gensim.utils.simple_preprocess(sentence)
# print("Tokenized: " + str(sentence))
# sentence = [word for word in sentence if word not in gensim.parsing.preprocessing.STOPWORDS and len(word) > 3]
# print("Removed Stopwords: " + str(sentence))
# sentence = [lemmatize_stemming(word) for word in sentence]
# print("Lemmatized: " + str(sentence))

# Theme exploration based on Topic Vectors

The following cell attempts to find some structure between the themes/labels based on the topics of the documents belonging to each theme. 
First, a covariance matrix of the topic vectors is constructed for each theme, and then the cosine distance between each pair of covariance matrices
shows which themes tend to have similar topic activations. This distance matrix is also plotted. The following cell constructs
a dendrogram using hierarchical clustering to find themes that tend to be similar.

In [None]:
# Topic covariance matrix for each theme: covariance between topic
# activations over the documents labelled with that theme.
cov_matrixes = []
for theme in classes:
    member_rows = [k for k, doc_label in enumerate(labels) if doc_label == theme]
    cov_matrixes.append(np.cov(results[member_rows].T))

plot_matrixes(cov_matrixes, classes, 3, 4)

# Pairwise cosine distance between the flattened covariance matrices.
flat_covs = [matrix.reshape(-1) for matrix in cov_matrixes]
cos = [[distance.cosine(a, b) for b in flat_covs] for a in flat_covs]
plot_matrixes([np.array(cos)], ["cossims"], 1, 1)


In [None]:
# Hierarchical clustering of the themes, drawn as a dendrogram.
# `cos` must be the theme-by-theme cosine-distance matrix from the previous
# cell. The later "Distance Matrixes" cell rebinds `cos` to a
# document-by-document matrix, so run this cell before that one.
fig = plt.figure(figsize=(5, 5))
# NOTE(review): scipy's linkage documents a 2-D input as observation vectors,
# not as a square distance matrix — confirm that clustering the rows of `cos`
# is intended (as opposed to passing distance.squareform(cos)).
dend = shc.dendrogram(shc.linkage(cos, method='ward'), labels=classes, leaf_rotation=-90, leaf_font_size=8)
# Leave room at the bottom for the rotated leaf labels.
fig.subplots_adjust(bottom=0.25)
plt.show()

from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree



# Distance Matrixes

The following cell is used for computing distances between topic vectors for each document. Due to the computation being n^2, only run
if a few documents are present. 

In [None]:
# Pairwise distances between the topic vectors of all documents, as
# (n_documents, n_documents) arrays. cdist does the n^2 comparisons in
# compiled code instead of nested Python loops.
# NOTE: this rebinds `cos`, which the theme dendrogram cell also uses for the
# theme-by-theme matrix; re-run the theme cell before re-running the dendrogram.
jensen_shannon = distance.cdist(results, results, metric='jensenshannon')
cos = distance.cdist(results, results, metric='cosine')

In [None]:
# Show the document-by-document distance matrices computed in the previous
# cell: cosine first, then Jensen-Shannon.
plot(cos)
plot(jensen_shannon)

# KMeans

The following cells attempt to run the K-means algorithm to identify clusters among the topic vectors. 

In [None]:
from sklearn.cluster import KMeans

# K-means with 4 clusters on the L2-normalized topic vectors; the seed is
# fixed so the clustering is repeatable.
kmeans = KMeans(
    init="random",
    n_clusters=4,
    n_init=10,
    max_iter=300,
    random_state=42,
)
kmeans.fit(normalized)

In [None]:
# Assign every document to its nearest cluster center.
# Broadcasting (n_docs, 1, n_topics) against (1, n_clusters, n_topics) gives
# the offset of each document from each center for any number of clusters,
# without repeating the data by a hard-coded cluster count.
centers = kmeans.cluster_centers_
offsets = normalized[:, np.newaxis, :] - centers[np.newaxis, :, :]
# Euclidean distance of each document to each center: (n_docs, n_clusters).
distances = np.linalg.norm(offsets, axis=2)
# Index of the closest center for each document.
mins = np.argmin(distances, axis=1)
print(mins)


# KNN

The following cells construct a KNN model to classify the documents based on their normalized topic vectors.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split # Import train_test_split function

# Hold out 30% of the documents for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    results,
    labels,
    test_size=0.3,
    random_state=1,
)

# 5-nearest-neighbour classifier on the topic vectors.
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train,y_train)



In [None]:
from sklearn.model_selection import train_test_split

# Predict the theme of every held-out document and report accuracy.
predicted = model.predict(X_test)
predicted_index = np.array([classes.index(i) for i in predicted])

# The split shuffles the documents, so file_names[i] is NOT the i-th test
# document. Re-derive the test-set file names with the same test_size and
# random_state as the KNN split, which selects the same rows. Keep these
# arguments in sync with the cell that creates X_test / y_test.
_, test_file_names = train_test_split(file_names, test_size=0.3, random_state=1)
assert len(test_file_names) == len(y_test), "file-name split is out of sync with X_test"

successes = np.zeros(len(classes))
totals = np.zeros(len(classes))
for i, (file, label) in enumerate(zip(test_file_names, y_test)):
    label_index = classes.index(label)
    if label_index == predicted_index[i]:
        successes[label_index] += 1
    totals[label_index] += 1
    print(f"i:{i:02} file:{file[:30]}{' '*(30 - len(file))} label:{label_index} pred:{predicted_index[i]}")

# Per-theme accuracy; themes with no test documents are reported as nan
# without triggering a divide-by-zero warning.
theme_accuracy = np.divide(successes, totals,
                           out=np.full(len(classes), np.nan), where=totals > 0)
print(f"Theme-wise accuracy: {theme_accuracy}")
print(f"Overall accuracy: {np.sum(successes)/np.sum(totals)}")

In [None]:
from sklearn.decomposition import PCA

# Project the normalized topic vectors onto their first three principal
# components. PCA is fitted with documents as samples (rows) and topics as
# features (columns), so results_pca has shape (n_documents, 3).
pca = PCA(n_components=3)
results_pca = pca.fit_transform(normalized)

# Principal axes in topic space, one column per component: (n_topics, 3).
principalComponents = pca.components_.T

In [None]:
from matplotlib import pyplot as plt

# Tableau-20 palette as 0-255 RGB triples.
tableau20_rgb = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
                 (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
                 (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
                 (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
                 (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
# Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.
tableau20 = [(r / 255., g / 255., b / 255.) for r, g, b in tableau20_rgb]

mapping = np.array(tableau20)

# Color every document by its true theme
# (alternative: color by the KNN prediction with mapping[predicted_index]).
indexes = np.array([classes.index(item) for item in labels])
colors = mapping[indexes]

# 3-D scatter of the documents in the PCA-projected space.
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter3D(results_pca[:, 0], results_pca[:, 1], results_pca[:, 2], c=colors);
plt.show()

# Decision Tree

The following cells contain the experiment involving the decision tree.

In [None]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn import tree

# 70% training and 30% test split of the per-document topic vectors.
# The features are `results`; the name `result` is not defined at notebook scope.
X_train, X_test, y_train, y_test = train_test_split(results, labels, test_size=0.3, random_state=1)

# Shallow tree (depth 4) so the fitted rules stay readable when visualized.
clf = DecisionTreeClassifier(max_depth=4)
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)


print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# for idx, topic in lda_model_tfidf.print_topics(-1):
#     print('Topic: {} Word: {}'.format(idx, topic))
# text_representation = tree.export_text(clf)
# print(text_representation)

# Print classes
# for (idx, item) in enumerate(classes):
#     print(idx, item)

# fig = plt.figure(figsize=(200, 100))
# _ = tree.plot_tree(clf, 
#                    feature_names=["Theme " + str(i) for i in range(len(lda_model_tfidf.print_topics(-1)))],  
#                    class_names=classes,
#                    filled=True, fontsize=14)
# fig.savefig("decistion_tree.png", dpi=100)

import graphviz

# One feature per LDA topic, in column order of the topic vectors.
feature_names = ["Theme " + str(i) for i in range(lda_model_tfidf.num_topics)]

# DOT data
# class_names must follow the classifier's own class order (clf.classes_);
# the `classes` list is in directory-walk order and would mislabel the nodes.
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=feature_names,
                                class_names=[str(name) for name in clf.classes_],
                                filled=True)

# Draw graph: write dt_viz.png, then end the cell with the graph object so
# that the notebook renders it inline.
graph = graphviz.Source(dot_data, format="png")
graph.render("dt_viz")
graph



# Multiclass SVM

The following cells include a classification experiment using an SVM.

In [None]:
from sklearn import svm, datasets
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# 70/30 split of the topic vectors, then an RBF-kernel SVM classifier.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    results, labels, test_size=0.3, random_state=1)
model = svm.SVC(kernel="rbf")
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Print true vs. predicted theme for each test document, then the accuracy.
for expected_theme, predicted_theme in zip(y_test, y_pred):
    print(f"test: {expected_theme} | pred: {predicted_theme}")
acc = accuracy_score(y_test, y_pred)
print(acc)


# Naive Bayes 

The following involves a classification test using a Naive Bayes Classifier (partly to also get probability vectors of themes for each document)

In [None]:
from sklearn.naive_bayes import GaussianNB

data = results
# Integer-encode every label as its position in `classes`.
encoded_labels = list(map(classes.index, labels))

# 70/30 split; `train_test_split` and `metrics` come from earlier cells.
X_train, X_test, y_train, y_test = train_test_split(
    results, encoded_labels, test_size=0.3, random_state=109)

gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Despite its name, y_pred holds the per-class probability vectors of the
# test documents, not hard predictions.
y_pred = gnb.predict_proba(X_test)
nb_accuracy = metrics.accuracy_score(y_test, gnb.predict(X_test))
print("Accuracy:", nb_accuracy)