In [None]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from gensim import corpora, models
from scipy.spatial import distance
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc


import nltk
import json
import os

# Shared English Snowball stemmer; lemmatize_stemming() reads this global.
stemmer = SnowballStemmer(language="english")

# Download the WordNet corpus through nltk (used together with WordNetLemmatizer below).
nltk.download('wordnet')

# Fix NumPy's global RNG seed for the rest of the notebook.
np.random.seed(2018)

def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the lemma.

    The stemming step uses the module-level Snowball ``stemmer``.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def preprocess(text):
    """Tokenize ``text`` and return its kept tokens, lemmatized and stemmed.

    Tokens come from gensim's ``simple_preprocess``. A token is kept only when
    it is longer than three characters and is not in gensim's ``STOPWORDS``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

def plot(matrix):
    """Show ``matrix`` (anything ``np.array`` accepts) as an image."""
    grid = np.array(matrix)
    plt.imshow(grid)
    plt.show()

In [None]:
# Read every document under ./corpus/<directory>/; the directory name is the
# document's class label.
files = []
file_names = []
dirs = ["Basic Computer Skills", "Internet Skills",
        "Microsoft Digital Literacy Course", "Microsoft Files", "Microsoft Office Videos"]
# Edit dir_index to restrict the run to a subset of the directories above.
dir_index = [0, 1, 2, 3, 4]
dirs = [dirs[i] for i in dir_index]
labels = []

for directory in dirs:
    path = f"./corpus/{directory}"
    # os.listdir() returns entries in arbitrary order. Sorting keeps document
    # indices (and everything derived from them) stable between runs.
    for file in sorted(os.listdir(path)):
        full_path = os.path.join(path, file)
        # Skip sub-directories, which open() cannot read.
        if not os.path.isfile(full_path):
            continue
        with open(full_path, "r", encoding='utf-8') as infile:
            files.append(infile.read())
        file_names.append(file)
        labels.append(directory)

# files, file_names and labels are parallel lists: index k refers to the same document.
processed_docs = [preprocess(document) for document in files]

# Build the token dictionary, then convert each token list with doc2bow.
dictionary = corpora.Dictionary(processed_docs)
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

# Fit a TF-IDF model on the bag-of-words corpus and apply it to that corpus.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [None]:
# Spot-check the preprocessing output for one document.
sample_index = 34
print(processed_docs[sample_index])

In [None]:
# Uncomment to list every file name with its index:
# for i in range(len(file_names)):
#     print(f"i: {i}, file: {file_names[i]}")
label_count = len(labels)
print(label_count, labels)

In [None]:
# Train a 100-topic LDA model on the TF-IDF corpus, then print every topic.
lda_settings = dict(num_topics=100, id2word=dictionary, passes=25,
                    workers=4, minimum_probability=0)
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, **lda_settings)

for topic_id, topic_terms in lda_model_tfidf.print_topics(-1):
    print(f'Topic: {topic_id} Word: {topic_terms}')

In [None]:
# Document-topic matrix: one row per document, one column per LDA topic.
# Each document gets the same TF-IDF weighting the model was trained on
# (corpus_tfidf); the original queried the model with raw bag-of-words counts.
results = np.zeros((len(processed_docs), lda_model_tfidf.num_topics))
for doc_idx, tokens in enumerate(processed_docs):
    doc_tfidf = tfidf[dictionary.doc2bow(tokens)]
    # Place each probability by topic id instead of by position, so a topic
    # missing from the model's answer stays 0.0 and the matrix stays rectangular.
    for topic_id, probability in lda_model_tfidf[doc_tfidf]:
        results[doc_idx, topic_id] = probability


In [None]:
# Theme Analysis
topics = np.array(lda_model_tfidf.print_topics(-1))

# This cell overwrites `results` with the filtered matrix at the end. Running it
# a second time would index `topics` with column numbers of the already
# filtered matrix, so stop early rather than mislabel topics.
assert results.shape[1] == len(topics), (
    "results no longer has one column per topic; re-run the topic-inference cell first")

# Mean weight of every topic across all documents.
means = np.mean(results, axis=0)

print(means)

# Keep only the topics whose mean weight reaches the threshold.
threshold = 1e-4
relevant_indexes = np.flatnonzero(means >= threshold)
print(relevant_indexes)

filtered = results[:,relevant_indexes]
selected_topics = topics[relevant_indexes][:, 1]

# Peek at one surviving topic; skip the peek when fewer topics survive
# instead of raising IndexError.
sample_topic = 6
if sample_topic < len(selected_topics):
    print(selected_topics[sample_topic])
print(filtered)

results = filtered

In [None]:
# print(results)
# Scale every row of `results` to unit Euclidean (L2) length.
norms = np.linalg.norm(results, axis=1)
normalized = results / norms[:, np.newaxis]

# Distance Matrices

In [None]:
# Pairwise distances between all rows of `results`, stored as nested lists
# where entry [i][j] compares row i with row j.
jensen_shannon = [
    [distance.jensenshannon(row_a, row_b) for row_b in results]
    for row_a in results
]
cos = [
    [distance.cosine(row_a, row_b) for row_b in results]
    for row_a in results
]

In [None]:
# Show the cosine matrix first, then the Jensen-Shannon matrix.
for distance_matrix in (cos, jensen_shannon):
    plot(distance_matrix)

# KMeans

In [None]:
from sklearn.cluster import KMeans

# Cluster the unit-length document vectors into four groups.
kmeans_options = {
    "init": "random",
    "n_clusters": 4,
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}
kmeans = KMeans(**kmeans_options)
kmeans.fit(normalized)

In [None]:
# Assign every document to its nearest k-means centroid (Euclidean distance).
centers = kmeans.cluster_centers_
# Repeat each document once per centroid. The count is taken from the fitted
# model rather than a literal 4, so it cannot drift from n_clusters.
repeated = np.repeat(np.expand_dims(normalized, axis=1), repeats=len(centers), axis=1)
offsets = repeated - centers
distances = np.linalg.norm(offsets, axis=2)
# Index of the closest centroid for each document.
mins = np.argmin(distances, axis=1)
print(mins)


# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbour classifier fitted on the unit-length vectors, with the
# source directory of each document as its class label.
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X=normalized, y=labels)

In [None]:
# Per-class and overall accuracy of the KNN classifier.
# NOTE: the documents scored here are the ones the classifier was fitted on,
# so these figures are training accuracy, not held-out accuracy.

# Predict on `normalized`, the same unit-length vectors passed to model.fit();
# the un-normalised `results` live on a different scale.
predicted = model.predict(normalized)

# Directory name -> class index, built once instead of repeated dirs.index() scans.
label_to_index = {name: idx for idx, name in enumerate(dirs)}
predicted_index = np.array([label_to_index[name] for name in predicted])

successes = np.zeros(len(dirs))
totals = np.zeros(len(dirs))
for i, (file, label) in enumerate(zip(file_names, labels)):
    true_index = label_to_index[label]
    if true_index == predicted_index[i]:
        successes[true_index] += 1
    totals[true_index] += 1
    # File name is truncated/padded to 30 characters so the columns line up.
    print(f"i:{i:02} file:{file[:30]:<30} label:{true_index} pred:{predicted_index[i]}")
print(successes/totals)
print(np.sum(successes)/np.sum(totals))

In [None]:
# PCA by hand: project the document vectors onto the three directions of
# greatest variance.
covariance_matrix = np.cov(normalized.T)

# A covariance matrix is symmetric, so use eigh: it returns real eigenvalues.
# np.linalg.eig gives no ordering guarantee, so taking "the first three"
# eigenvectors from it does not select the largest components.
eigen_values, eigen_vectors = np.linalg.eigh(covariance_matrix)

# Reorder so the largest eigenvalue (and its eigenvector column) comes first.
order = np.argsort(eigen_values)[::-1]
eigen_values = eigen_values[order]
eigen_vectors = eigen_vectors[:, order]

# Eigenvectors are the columns; keep the three leading ones.
projection_matrix = eigen_vectors[:, :3]
print(eigen_values)

# Project the same (centred) data the covariance was computed from, rather
# than the un-normalised `results`.
results_pca = (normalized - normalized.mean(axis=0)).dot(projection_matrix)

In [None]:
%matplotlib widget
from matplotlib import pyplot as plt
from mpl_toolkits import mplot3d
mapping = np.array(["blue", "red", "yellow", "green", "orange"])
colors = mapping[predicted_index]
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.scatter3D(results_pca[:, 0], results_pca[:, 1], results_pca[:, 2], c=colors);
# plt.scatter(results_pca[:, 0], results_pca[:, 1], c=colors)
plt.show()

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn import tree

# Hold out 30% of the documents for testing; the split is fixed by random_state.
X_train, X_test, y_train, y_test = train_test_split(results, labels, test_size=0.3, random_state=1) # 70% training and 30% test

clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)

# Predictions on the held-out documents; y_pred keeps this meaning for the
# rest of the cell.
y_pred = clf.predict(X_test)
print("Test set (actual | predicted):")
for actual, guess in zip(y_test, y_pred):
    print(f"{actual} | {guess}")

# Training-set predictions get their own name. Reusing y_pred here made the
# accuracy call below compare y_test against predictions for X_train, which
# have a different length.
y_pred_train = clf.predict(X_train)
print("Training set (actual | predicted):")
for actual, guess in zip(y_train, y_pred_train):
    print(f"{actual} | {guess}")

# Held-out accuracy.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Text dump of the fitted tree's decision rules.
text_representation = tree.export_text(clf)
print(text_representation)


# Multiclass SVM

In [None]:
from sklearn import svm, datasets
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# 70/30 train/test split of the topic vectors, fixed by random_state.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    results, labels, test_size=0.3, random_state=1)

# RBF-kernel support vector classifier.
model = svm.SVC(kernel="rbf")
model.fit(X_train, y_train)

# Score on the held-out documents and list every (actual, predicted) pair.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
for actual, guess in zip(y_test, y_pred):
    print(f"test: {actual} | pred: {guess}")
print(acc)
