In [17]:
import time
import joblib
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from feature_extract_inference import FeatureExtract

class KmeansKnnModel:
    """Train, save and load a KMeans model and numbered KNN classifiers.

    Models are persisted with joblib. The checkpoint paths are derived from the
    *current* ``model_version`` / ``num_of_clusters`` every time they are read,
    so changing those attributes after construction is reflected in the file
    names. Assigning ``kmeans_model_path`` or ``knn_model_path`` explicitly
    overrides the derived value.

    Note: ``joblib.load`` unpickles the file; only load checkpoints you trust.

    Args:
        model_version: Tag embedded in every checkpoint file name.
        num_of_clusters: ``n_clusters`` passed to ``KMeans``.
        num_of_k_neibors: ``n_neighbors`` passed to ``KNeighborsClassifier``.
        train_data_path: Path of the training data file (stored, not read here).
        dev_data_path: Path of the dev data file (stored, not read here).
    """

    def __init__(self,
                 model_version = "v2_best",
                 num_of_clusters = 30,
                 num_of_k_neibors = 20,
                 train_data_path = "data/v1_1/train_set_v1_1_more_than_55.txt",
                 dev_data_path = "data/v1_1/dev_set_v1_1_more_than_55.txt"
                ):
        self.model_version = model_version
        self.num_of_clusters = num_of_clusters
        self.num_of_k_neibors = num_of_k_neibors
        # Explicit path overrides; None means "derive from the current settings".
        self._kmeans_model_path = None
        self._knn_model_path = None
        self.train_data_path = train_data_path
        self.dev_data_path = dev_data_path

    @property
    def kmeans_model_path(self):
        """Checkpoint path of the KMeans model for the current settings."""
        if self._kmeans_model_path is not None:
            return self._kmeans_model_path
        return "checkpoints/kmeans_model_{0}_clusters_{1}.m".format(
            self.model_version,
            self.num_of_clusters)

    @kmeans_model_path.setter
    def kmeans_model_path(self, path):
        self._kmeans_model_path = path

    @property
    def knn_model_path(self):
        """Path prefix shared by all KNN checkpoints for the current version."""
        if self._knn_model_path is not None:
            return self._knn_model_path
        return "checkpoints/knn_model_{0}".format(self.model_version)

    @knn_model_path.setter
    def knn_model_path(self, path):
        self._knn_model_path = path

    def _knnClfPath(self, clf_number):
        """Return the checkpoint path of the KNN classifier numbered ``clf_number``."""
        return "{0}_cluster_{1}.m".format(self.knn_model_path, clf_number)

    def trainKmeansClf(self, train_data):
        """Fit KMeans on ``train_data``, save it to ``kmeans_model_path`` and return it."""
        kmeans_clf = KMeans(n_clusters=self.num_of_clusters).fit(train_data)
        joblib.dump(kmeans_clf, self.kmeans_model_path)
        return kmeans_clf

    def getKmeansClfModel(self):
        """Load and return the KMeans model stored at ``kmeans_model_path``."""
        return joblib.load(self.kmeans_model_path)

    def trainKnnClf(self, clf_number, X, y):
        """Fit a KNN classifier on ``(X, y)``, save it under ``clf_number`` and return it."""
        knn_clf = KNeighborsClassifier(n_neighbors=self.num_of_k_neibors)
        knn_clf.fit(X, y)
        joblib.dump(knn_clf, self._knnClfPath(clf_number))
        return knn_clf

    def getKnnClfModel(self, clf_number):
        """Load and return the KNN classifier saved under ``clf_number``."""
        return joblib.load(self._knnClfPath(clf_number))

In [5]:
def random_mini_batches(X, Y, mini_batch_size = 64):
    """Split X and Y into consecutive, aligned mini-batches.

    Despite the name, nothing is shuffled: batches are taken in the original
    order. Every batch holds ``mini_batch_size`` items except possibly the
    last one, which holds the remainder.

    Args:
        X: Sliceable sequence of inputs; its length decides the batch count.
        Y: Sliceable sequence of targets, sliced with the same bounds as X.
        mini_batch_size: Number of items per full batch.

    Returns:
        A list of ``(X_slice, Y_slice)`` tuples.
    """
    total = len(X)
    full_count = int(total / mini_batch_size)

    # Full-size batches first, each starting at a multiple of the batch size.
    batches = [
        (X[start:start + mini_batch_size], Y[start:start + mini_batch_size])
        for start in (idx * mini_batch_size for idx in range(full_count))
    ]

    # Leftover items that did not fill a complete batch.
    if total % mini_batch_size != 0:
        tail_start = full_count * mini_batch_size
        batches.append((X[tail_start:], Y[tail_start:]))

    return batches

In [6]:
def readData(data_path, encoding=None):
    """Read a tab-separated ``label<TAB>sentence`` file.

    Each non-blank line is stripped and split on tabs; field 0 is the label and
    field 1 is the sentence (any further fields are ignored). Blank lines,
    such as a trailing empty line, are skipped instead of crashing.

    Args:
        data_path: Path of the text file to read.
        encoding: Passed to ``open``; ``None`` keeps the platform default.

    Returns:
        ``(sentence_list, label_list)`` where ``sentence_list`` is a Python list
        of strings and ``label_list`` is a numpy array of the label strings.

    Raises:
        IndexError: If a non-blank line contains no tab separator.
    """
    label_list = []
    sentence_list = []
    with open(data_path, encoding=encoding) as data_file:
        # Iterate lazily instead of materialising every line with readlines().
        for line in data_file:
            stripped = line.strip()
            if not stripped:
                continue
            fields = stripped.split("\t")
            label_list.append(fields[0])
            sentence_list.append(fields[1])

    # Labels become an array so callers can index them with boolean masks.
    label_list = np.array(label_list)
    return sentence_list, label_list

In [7]:
# Build the model wrapper with its default settings, then read the training
# split from the path it carries (readData returns the sentences and a label array).
kmeans_knn_learner = KmeansKnnModel()
sentence_list, label_list = readData(kmeans_knn_learner.train_data_path)

In [8]:
# Module-level feature extractor; getFeatureList() reads this global by name.
# NOTE(review): FeatureExtract comes from feature_extract_inference, which is not
# shown in this notebook — what construction loads is not visible here.
test_model = FeatureExtract()

In [9]:
def getFeatureList(sentence_list, extractor=None):
    """Extract one flattened feature vector per sentence.

    Each sentence is passed on its own, as a one-element list, to
    ``extractor.get_features``; the result is flattened with ``np.ravel``.

    Args:
        sentence_list: Iterable of sentences.
        extractor: Object exposing ``get_features(list_of_sentences)``. When
            omitted, the notebook-level ``test_model`` is used, as before.

    Returns:
        A numpy array built from the per-sentence flattened features, in input
        order (2-D when every sentence yields the same number of features).
    """
    if extractor is None:
        # Fall back to the notebook's global extractor, looked up at call time.
        extractor = test_model
    return np.array([
        np.ravel(extractor.get_features([sentence]))
        for sentence in sentence_list
    ])

In [10]:
%%time
# Training set feature list
feature_list = getFeatureList(sentence_list)
dev_feature_list = getFeatureList(dev_sentence)

CPU times: user 18min 36s, sys: 683 ms, total: 18min 37s
Wall time: 18min 37s


In [23]:
%%time
# KMeans training
kmeans_knn_learner.num_of_clusters = 
kmeans_knn_learner.num_of_k_neibors = 
kmeans_clf = kmeans_knn_learner.trainKmeansClf(feature_list)

CPU times: user 13min 24s, sys: 13.1 s, total: 13min 37s
Wall time: 2min 22s


In [24]:
# KNN training
# Fit and save one KNN classifier per KMeans cluster, using only the training
# rows assigned to that cluster, and report how long each fit-and-save took.
for cluster_id in np.unique(kmeans_clf.labels_):
    print("---------------------------")
    print("KNN Classifier {0}:".format(cluster_id))
    started_at = time.time()
    # Boolean mask selecting the training rows that belong to this cluster.
    member_mask = kmeans_clf.labels_ == cluster_id
    knn_clf = kmeans_knn_learner.trainKnnClf(
        cluster_id,
        feature_list[member_mask],
        label_list[member_mask],
    )
    elapsed_ms = (time.time() - started_at) * 1000
    print("Time spent: {0:.2f}ms".format(elapsed_ms))

---------------------------
KNN Classifier 0:
Time spent: 4.84ms
---------------------------
KNN Classifier 1:
Time spent: 21.55ms
---------------------------
KNN Classifier 2:
Time spent: 7.47ms
---------------------------
KNN Classifier 3:
Time spent: 16.59ms
---------------------------
KNN Classifier 4:
Time spent: 13.43ms
---------------------------
KNN Classifier 5:
Time spent: 4.04ms
---------------------------
KNN Classifier 6:
Time spent: 14.40ms
---------------------------
KNN Classifier 7:
Time spent: 7.38ms
---------------------------
KNN Classifier 8:
Time spent: 4.41ms
---------------------------
KNN Classifier 9:
Time spent: 24.57ms
---------------------------
KNN Classifier 10:
Time spent: 11.26ms
---------------------------
KNN Classifier 11:
Time spent: 8.89ms
---------------------------
KNN Classifier 12:
Time spent: 8.76ms
---------------------------
KNN Classifier 13:
Time spent: 6.51ms
---------------------------
KNN Classifier 14:
Time spent: 13.36ms
-------------

In [25]:
# predict
# Reload the KMeans model from its checkpoint file instead of reusing the
# in-memory object returned by training.
kmeans_clf = kmeans_knn_learner.getKmeansClfModel()

# Read the dev split (sentences and label array) from the path stored on the learner.
dev_sentence, dev_label = readData(kmeans_knn_learner.dev_data_path)

In [26]:
%%time
# Assign each dev feature vector to a KMeans cluster; the evaluation cell uses
# this assignment to pick which per-cluster KNN classifier scores each sample.
kmeans_predicted_list = kmeans_clf.predict(dev_feature_list)

CPU times: user 55.8 ms, sys: 4.15 ms, total: 60 ms
Wall time: 14.6 ms


In [27]:
%%time
unique_clusters = np.unique(kmeans_clf.labels_)
correct = 0
for label in unique_clusters:
    index_list = (kmeans_predicted_list==label)
    temp_feature_list = dev_feature_list[index_list]
    temp_label_list = dev_label[index_list]
    knn_clf = kmeans_knn_learner.getKnnClfModel(label)
    knn_predicted_list = knn_clf.predict(temp_feature_list)
    temp_correct = sum(temp_label_list==knn_predicted_list)
    temp_len = len(temp_label_list)
    correct += temp_correct
    temp_precision = temp_correct / temp_len
    print("------------------------------------------------------")
    print("KNN classifier {0} precision: {1}/{2}={3:.2f}".format(label, temp_correct, temp_len, temp_precision))
precision = correct / len(dev_label)
print("Dev set precision: {0}/{1}={2:.2f}".format(correct ,dev_label, precision))

------------------------------------------------------
KNN classifier 0 precision: 10/81=0.12
------------------------------------------------------
KNN classifier 1 precision: 19/263=0.07
------------------------------------------------------
KNN classifier 2 precision: 14/154=0.09
------------------------------------------------------
KNN classifier 3 precision: 15/320=0.05
------------------------------------------------------
KNN classifier 4 precision: 23/276=0.08
------------------------------------------------------
KNN classifier 5 precision: 5/67=0.07
------------------------------------------------------
KNN classifier 6 precision: 10/287=0.03
------------------------------------------------------
KNN classifier 7 precision: 15/172=0.09
------------------------------------------------------
KNN classifier 8 precision: 4/94=0.04
------------------------------------------------------
KNN classifier 9 precision: 18/417=0.04
------------------------------------------------------
