## 0.0 - Imports

In [25]:
#from DTM_Pipeline import Pipeline
import pickle
import gensim
import os
import time
import pylab as plt
import numpy as np
import re
import pandas as pd
import seaborn as sb
import itertools
from gensim.models.wrappers.dtmmodel import DtmModel
from gensim.models import CoherenceModel
from DTM_Pipeline import get_time_seq
from sklearn import preprocessing

dtm_home = os.environ.get('DTM_HOME', "dtm-master")
dtm_path = os.path.join(dtm_home, 'bin', 'dtm-darwin64') if dtm_home else None
#% matplotlib inline

In [3]:
_MODELS_DIR = "saved_models/"
key="Y02E_10_20"
dict_file = "{}.dict".format(key)
corpus_file = "{}.mm".format(key)
data_file = '../Data/{}.csv'.format(key)
min_slice_size=200

In [4]:
# pull pre-processed data subset
# get corpus
corpus = gensim.corpora.MmCorpus(os.path.join(_MODELS_DIR, corpus_file))

# get dictionary
dictionary = gensim.corpora.Dictionary.load(os.path.join(_MODELS_DIR, dict_file))

# get approved ids
time_seq, approved_ids = get_time_seq(data_file, min_slice_size)

In [5]:
flat_ids = list(itertools.chain(*approved_ids))

In [6]:
# get the cpc subclass that each document belongs to as 'true labels'
data = pd.read_csv(data_file)
true_labels = []
for i in range(len(data)):
    if data["appln_id"].values[i] in flat_ids:
        true_labels.append(float(data["cpc_class_symbol"].values[i][9:]))

In [7]:
np.unique(true_labels, return_counts = True)

(array([  20.,   22.,   28.,  223.,  226.]),
 array([ 791, 1475,  777, 1975, 1382]))

In [8]:
# create training/validation/testing splits.
corp_idx = range(len(corpus))
train_idx = corp_idx[:400]  # [:4800]
val_idx = corp_idx[400:600] # [4800:6000]
test_idx = corp_idx[600:800] # [6000:]

train_ts = [200,200]
val_ts = [200]
test_ts = [200]

train = gensim.utils.SlicedCorpus(corpus, train_idx)
validate = gensim.utils.SlicedCorpus(corpus, val_idx)
test = gensim.utils.SlicedCorpus(corpus, test_idx)

train_labels = true_labels[:400]
validate_labels = true_labels[400:600]
test_labels = true_labels[600:800]

In [9]:
# train DIM, DTM, full LDA, last-year-only LDA

In [10]:
nt = 5
# Static LDA
lda = gensim.models.LdaModel(train, id2word=dictionary,num_topics=nt)



In [11]:
# DTM
dtm = DtmModel(dtm_path,train,train_ts,num_topics=nt,id2word=dictionary,initialize_lda=True)

In [12]:
# DIM
dim = DtmModel(dtm_path,train,train_ts,num_topics=nt,model="fixed",id2word=dictionary,initialize_lda=True)

In [13]:
topics = dim.show_topics(topics=2,times=2, topn=10)
#topics

In [57]:
ivd = {v: k for k, v in dictionary.token2id.iteritems()}
def doc2text(doc, ivd):
    return list(itertools.chain(*[[ivd[word[0]]]*int(word[1]) for word in doc]))
test = doc2text(corpus[0],ivd)


In [98]:
# get topic probability distribution for each document 
raw_lda_doc_dists = np.array(lda[train])
dtm_doc_dists = dtm.gamma_
dim_doc_dists = dim.gamma_

In [99]:
lda_doc_dists = []
for doc in raw_lda_doc_dists:
    row = {key:0 for key in range(nt)}
    for j in range(len(doc)):
        row[doc[j][0]] = doc[j][1]
    lda_doc_dists.append(row.values())

In [102]:
sum(lda_doc_dists[0]) # not sure why document topic percentages don't add up to one exactly...

0.99052895520350792

In [103]:
sum(dtm_doc_dists[0])

1.0

In [104]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [105]:
from sklearn.linear_model import LogisticRegression
from sklearn.lda import LDA
from sklearn.qda import QDA
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

models = {
    "LR": LogisticRegression(), 
    "LDA": LDA(), 
    "QDA": QDA(), 
    "SGD": SGDClassifier(loss='log'),
    "ASGD": SGDClassifier(average=True, loss='log'),
    "Perceptron": Perceptron(),
    #"MLP": MLPClassifier(algorithm='l-bfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1),
    "Passive-Aggressive I": PassiveAggressiveClassifier(loss='hinge',
                                                 C=1.0),
    "Passive-Aggressive II": PassiveAggressiveClassifier(loss='squared_hinge',
                                                  C=1.0),
    "KNN": KNeighborsClassifier(3),
    "Lin. SVC": SVC(kernel="linear", C=0.025),
    "RBF SVM": SVC(gamma=2, C=1, probability=True),
    "Decision Tree": DecisionTreeClassifier(max_depth=5),
    "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "Adaboost": AdaBoostClassifier(),
    "Gaussian NB": GaussianNB()}

In [106]:
def evaluate(x,y,models):
    le = preprocessing.LabelEncoder()
    le.fit(true_labels)

    s_idx = int(np.floor(.8*len(x)))
    x_tr = x[:s_idx]
    x_te = x[s_idx:]

    y_tr = le.transform(np.array(y[:s_idx]))
    y_te = le.transform(np.array(y[s_idx:]))

    preds = {}
    for key in models.keys():
        models[key].fit(x_tr, y_tr)
        preds[key] = models[key].predict(x_te)
        print(key, accuracy_score(y_te, preds[key]))
    

In [107]:
evaluate(dtm_doc_dists, train_labels, models)

('KNN', 0.5)
('Decision Tree', 0.4375)
('Passive-Aggressive I', 0.375)
('RBF SVM', 0.42499999999999999)
('Lin. SVC', 0.42499999999999999)
('Gaussian NB', 0.40000000000000002)
('LDA', 0.48749999999999999)
('ASGD', 0.52500000000000002)
('QDA', 0.375)
('Random Forest', 0.5)
('LR', 0.47499999999999998)
('Passive-Aggressive II', 0.36249999999999999)
('Perceptron', 0.375)
('SGD', 0.48749999999999999)
('Adaboost', 0.36249999999999999)


In [108]:
evaluate(dim_doc_dists, train_labels, models)

('KNN', 0.51249999999999996)
('Decision Tree', 0.46250000000000002)
('Passive-Aggressive I', 0.52500000000000002)
('RBF SVM', 0.61250000000000004)
('Lin. SVC', 0.34999999999999998)
('Gaussian NB', 0.47499999999999998)
('LDA', 0.55000000000000004)
('ASGD', 0.55000000000000004)
('QDA', 0.42499999999999999)
('Random Forest', 0.5625)
('LR', 0.53749999999999998)
('Passive-Aggressive II', 0.5)
('Perceptron', 0.42499999999999999)
('SGD', 0.3125)
('Adaboost', 0.38750000000000001)


In [109]:
evaluate(lda_doc_dists, train_labels, models)

('KNN', 0.36249999999999999)
('Decision Tree', 0.41249999999999998)
('Passive-Aggressive I', 0.45000000000000001)
('RBF SVM', 0.40000000000000002)
('Lin. SVC', 0.34999999999999998)
('Gaussian NB', 0.21249999999999999)
('LDA', 0.40000000000000002)
('ASGD', 0.36249999999999999)
('QDA', 0.012500000000000001)
('Random Forest', 0.34999999999999998)
('LR', 0.36249999999999999)
('Passive-Aggressive II', 0.34999999999999998)
('Perceptron', 0.375)
('SGD', 0.28749999999999998)
('Adaboost', 0.32500000000000001)


In [177]:
def plot_confusion_matrix(cm, tnames, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(tnames))
    plt.xticks(tick_marks, tnames, rotation=45)
    plt.yticks(tick_marks, tnames)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
print(classification_report(class_test_Y, preds))

cm = confusion_matrix(class_test_Y, preds)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Normalized confusion matrix')
plt.figure()
plot_confusion_matrix(cm_normalized,[0,1,2,3,4], title='Normalized confusion matrix')
plt.show()

In [145]:
# get validation coherence and tune for peak coherence. Record stats.
# get likelihoods of docs in the test set.
# get cluster quality 

# run with number of topics equal to number of CPC labels
# report classification rates
