In [276]:
import json
import os
import pandas as pd
import collections
import numpy as np
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn import decomposition
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import sklearn
from gensim.models import doc2vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier


In [178]:
doc2cat = {}
t = '''C1: Airline Safety
C2: Amphertamine
C3: China and Spy Plan and Captives 
C4: Hoof and Mouth Desease
C5: Iran Nuclear
C6: Korea and Nuclear Capability 
C7: Mortrage Rates
C8: Ocean and Pollution
C9: Satanic Cult
C10: Store Irene
C11: Volcano
C12: Saddam Hussein
C13: Kim Jong-un
C14: Predictive Analytics 
C15: Irma & Harvey'''.split("\n")

In [179]:
for i in t:
    temp = i.split(":")
    doc2cat[temp[0]] = temp[1][1:]

In [194]:
#my implementation of KNN, for Q1
euc = lambda x,y:np.sqrt(np.sum((np.array(x)-np.array(y))**2)) 
def my_KNN(train,test, k):
    pred = [];
    for key in test:
        case = test[key]
        temp = {}
        for tr in train:
            temp[tr] = euc(case,train[tr])
        first_k = sorted(temp, key=temp.__getitem__)[:k]
        result = collections.defaultdict(int)
        for doc in first_k:
            result[doc.split('/')[0]]+=temp[doc] 
        temp_pred = sorted(result, key=result.__getitem__)[0]
        pred.append(temp_pred)
    
    return pred
        
        

In [201]:
#Sklearn's implementation of KNN, for Q3
def KNN(train,test,k):
    X_train = list(train.values())
    y_train = [i.split('/')[0] for i in train.keys()]
    X_test = list(test.values())
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train, y_train) 
    return neigh.predict(X_test)

In [None]:
def KNN(train,test,k):
    X_train = list(train.values())
    y_train = [i.split('/')[0] for i in train.keys()]
    X_test = list(test.values())
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train, y_train) 
    return neigh.predict(X_test)

In [259]:
#sklearn's svm
def SVM(train,test):
    X_train = list(train.values())
    y_train = [i.split('/')[0] for i in train.keys()]
    X_test = list(test.values())
    clf = svm.LinearSVC()
    clf.fit(X_train, y_train)
    return clf.predict(X_test)

In [302]:
#sklearn's random forest
def rdf(train,test):
    X_train = list(train.values())
    y_train = [i.split('/')[0] for i in train.keys()]
    X_test = list(test.values())
    clf = RandomForestClassifier(n_estimators=66,criterion='entropy', min_samples_split=3)
    clf.fit(X_train,y_train)
    return clf.predict(X_test)
    
 

In [306]:
#ensemble the above three models
def ensemble(one,two,three):
    ret = []
    for index,y1 in enumerate(one):
        pred = collections.defaultdict(int)
        pred[two[index]]+=1;
        pred[three[index]]+=1;
        pred[y1]+=1;
        ret.append(sorted(pred, key=pred.__getitem__)[0])
        
    return ret
        

In [94]:
#tokenize txt files
texts = []
keys = []
for filename in os.listdir(os.getcwd()+'/DataSet'):
    if filename.startswith("C"):
        add = os.getcwd()+'/DataSet/'+filename;
        for txt in os.listdir(add):
            if txt.endswith("txt"):
                with open(add+"//"+txt, 'r',encoding='iso-8859-15') as myfile:
                        key = filename+"/"+txt;
                        data=myfile.read()
                        tokens = TaggedDocument(words=nltk.word_tokenize(data), tags=[key])   
                        texts.append(tokens)
                        keys.append(key)

print("All text files are inputed")

All text files are inputed


In [95]:
#params for doc2vec
sz = 11
wd = 6
min_c = 2
epochs = 60

In [213]:
#using doc2vec to convert tokenized document into vectors, this method is strictly better 
#than TF-IDF in my opinion, and is widely used in industry nowadays.
model = doc2vec.Doc2Vec(vector_size=sz,window = wd, min_count=min_c)
model.build_vocab(texts) 
model.train(texts, total_examples=model.corpus_count, epochs=epochs)

In [230]:
#the best k I find for KNN was 3 
k = 3
kf = KFold(n_splits=10, shuffle=True, random_state=66)
data = {}
keys = np.array(keys)
for key in keys:
    data[key] = model.docvecs[key]

In [308]:
#K-fold to evaluate performance, for Q2
#f1 = 0;
acc_knn = 0;
acc_svm = 0;
acc_rdf = 0;
acc_ensemble = 0
for train_index, test_index in kf.split(keys):
    train = {}
    test = {}
    for key in keys[train_index]:
        train[key] = data[key]
    for key in keys[test_index]:
        test[key] = data[key]
    y_true = [i.split('/')[0] for i in test.keys()]


    #prediction with my KNN
    #y_pred_my = my_KNN(train,test, k)
    #print(f1_score(y_true,y_pred_my, average='macro'))
    #print(accuracy_score(y_true,y_pred_my))

    #prediction with sklearn KNN, svm, xgboost etc.
    y_pred_knn = KNN(train,test,k)
    
    #Q5 Bonus point, apply SVM
    y_pred_svm = SVM(train,test)
    
    #Q6 MODEL ensemble.
    y_pred_rdf = rdf(train,test)
    y_pred_ensemble = ensemble(y_pred_knn,y_pred_svm, y_pred_rdf)
    
    
    
    #evaluate the performance using f1 score and accuracy score, for Q4
    #actually, for multi-class classif, f1_micro = accuracy
    #since KNN doesn't support prediction of probability, we can use multi-class cross entropy
    #yet auc is not applicable to multi-class
    #f1 +=f1_score(y_true,y_pred_knn, average='micro')
    acc_knn +=accuracy_score(y_true,y_pred_knn)
    acc_svm += accuracy_score(y_true,y_pred_svm)
    acc_rdf += accuracy_score(y_true,y_pred_rdf)
    acc_ensemble += accuracy_score(y_true,y_pred_ensemble)
    

#print("F1: "+str(f1/10))
print("Accuracy KNN: "+str(acc_knn/10))
print("Accuracy SVM: "+str(acc_svm/10))
print("Accuracy RDF: "+str(acc_rdf/10))
print("Accuracy ensemble: "+str(acc_ensemble/10))

Accuracy KNN: 0.787179487179
Accuracy SVM: 0.812820512821
Accuracy RDF: 0.70641025641
Accuracy ensemble: 0.712179487179
