In [26]:
import os
import time

import joblib
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier

from feature_extract_inference import FeatureExtract

class KmeansKnnModel:
    """Train, save and load a KMeans model plus numbered KNN classifiers.

    The class only builds checkpoint paths and wraps scikit-learn fit /
    joblib dump / joblib load calls; deciding which samples go to which
    KNN classifier is left to the caller.

    Attributes are plain copies of the constructor arguments, plus:
        kmeans_model_path: checkpoint path of the KMeans model.
        knn_model_path: path prefix shared by all KNN checkpoints.
    """

    def __init__(self,
                 model_version = "v1_1",
                 num_of_clusters = 10,
                 num_of_k_neibors = 30,
                 train_data_path = "data/v1_1/train_set_v1_1_more_than_55.txt",
                 dev_data_path = "data/v1_1/dev_set_v1_1_more_than_55.txt",
                 random_state = None):
        """Store hyper-parameters and derive the checkpoint paths.

        Args:
            model_version: Tag embedded in every checkpoint file name.
            num_of_clusters: Number of KMeans clusters.
            num_of_k_neibors: `n_neighbors` used for every KNN classifier
                (the historical spelling is kept for compatibility).
            train_data_path: Path of the training data file.
            dev_data_path: Path of the dev data file.
            random_state: Optional seed forwarded to KMeans; the default
                None keeps scikit-learn's unseeded behaviour.
        """
        self.model_version = model_version
        self.num_of_clusters = num_of_clusters
        self.num_of_k_neibors = num_of_k_neibors
        self.random_state = random_state
        self.kmeans_model_path = \
                "checkpoints/kmeans_model_{0}_clusters_{1}.m".format(
                    self.model_version,
                    self.num_of_clusters)
        self.knn_model_path = \
                "checkpoints/knn_model_{0}".format(self.model_version)
        self.train_data_path = train_data_path
        self.dev_data_path = dev_data_path

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain `path` if it is missing."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def _knn_clf_path(self, clf_number):
        """Return the checkpoint path of KNN classifier `clf_number`."""
        return "{0}_cluster_{1}.m".format(self.knn_model_path, clf_number)

    def trainKmeansClf(self, train_data):
        """Fit KMeans on `train_data`, save it, and return the fitted model."""
        kmeans_clf = KMeans(n_clusters=self.num_of_clusters,
                            random_state=self.random_state).fit(train_data)
        # Without this, dumping fails when "checkpoints/" does not exist yet.
        self._ensure_parent_dir(self.kmeans_model_path)
        joblib.dump(kmeans_clf, self.kmeans_model_path)
        return kmeans_clf

    def getKmeansClfModel(self):
        """Load and return the KMeans model saved at `kmeans_model_path`."""
        # NOTE(review): joblib.load unpickles the file; only load checkpoints
        # that come from a trusted source.
        return joblib.load(self.kmeans_model_path)

    def trainKnnClf(self, clf_number, X, y):
        """Fit a KNN classifier on (X, y), save it under `clf_number`, return it."""
        knn_clf = KNeighborsClassifier(n_neighbors=self.num_of_k_neibors)
        knn_clf.fit(X, y)
        clf_path = self._knn_clf_path(clf_number)
        self._ensure_parent_dir(clf_path)
        joblib.dump(knn_clf, clf_path)
        return knn_clf

    def getKnnClfModel(self, clf_number):
        """Load and return the KNN classifier saved under `clf_number`."""
        # NOTE(review): joblib.load unpickles the file; only load checkpoints
        # that come from a trusted source.
        return joblib.load(self._knn_clf_path(clf_number))

In [27]:
def random_mini_batches(X, Y, mini_batch_size = 64):
    """Split X and Y into consecutive, aligned mini-batches.

    Despite the name, nothing is shuffled: batches follow the input order.

    Args:
        X: Sliceable sequence supporting len(); its length drives the split.
        Y: Sliceable sequence, cut with the same bounds as X.
        mini_batch_size: Maximum number of items per batch; must be positive.

    Returns:
        A list of (X_slice, Y_slice) tuples. The last batch is shorter when
        len(X) is not a multiple of mini_batch_size; an empty X gives [].

    Raises:
        ValueError: If mini_batch_size is not positive.
    """
    if mini_batch_size <= 0:
        raise ValueError(
            "mini_batch_size must be positive, got {0}".format(mini_batch_size))

    m = len(X)
    # A stepped range yields every batch start, including the trailing
    # partial batch, without any float division.
    return [
        (X[start:start + mini_batch_size], Y[start:start + mini_batch_size])
        for start in range(0, m, mini_batch_size)
    ]

In [28]:
def readData(data_path, encoding = "utf-8"):
    """Read a tab-separated "label<TAB>sentence" file.

    Each line is stripped and split on tabs; the first field is the label and
    the second the sentence (any further fields are ignored). Blank lines are
    skipped.

    Args:
        data_path: Path of the text file to read.
        encoding: Text encoding used to open the file. Defaults to UTF-8 so
            the result does not depend on the platform locale; pass None to
            use the platform default.
            NOTE(review): assumes the data files are UTF-8 - confirm.

    Returns:
        (sentence_list, label_list): a list of sentences and a numpy array of
        labels, in file order.

    Raises:
        ValueError: If a non-blank line has fewer than two tab-separated fields.
    """
    label_list = []
    sentence_list = []
    with open(data_path, encoding=encoding) as data_file:
        # Iterate lazily instead of materialising the file with readlines().
        for line_number, raw_line in enumerate(data_file, start=1):
            stripped = raw_line.strip()
            if not stripped:
                continue
            fields = stripped.split("\t")
            if len(fields) < 2:
                raise ValueError(
                    "{0}:{1}: expected 'label<TAB>sentence', got {2!r}".format(
                        data_path, line_number, stripped))
            label_list.append(fields[0])
            sentence_list.append(fields[1])

    return sentence_list, np.array(label_list)

In [29]:
# Build the model wrapper with its default hyper-parameters and file paths,
# then load the training sentences and labels from its configured train file.
kmeans_knn_learner = KmeansKnnModel()
sentence_list, label_list = readData(kmeans_knn_learner.train_data_path)

In [30]:
# Feature extractor instance; getFeatureList reads this global by name.
test_model = FeatureExtract()

In [31]:
def getFeatureList(sentence_list):
    """Build an array holding one flattened feature vector per sentence.

    Every sentence is sent to the module-level `test_model.get_features` as a
    single-element list; the result is flattened with `np.ravel`, and the
    per-sentence vectors are combined with `np.array`.

    Args:
        sentence_list: Iterable of sentences.

    Returns:
        numpy array built from the flattened per-sentence features.
    """
    flattened_features = [
        np.ravel(test_model.get_features([sentence]))
        for sentence in sentence_list
    ]
    return np.array(flattened_features)

In [32]:
%%time
# Training set feature list: one flattened feature vector per training
# sentence, in the same order as sentence_list / label_list.
feature_list = getFeatureList(sentence_list)

CPU times: user 3min 13s, sys: 168 ms, total: 3min 13s
Wall time: 3min 13s


In [33]:
%%time
# KMeans training: fit on the training features. trainKmeansClf also dumps
# the fitted model to kmeans_knn_learner.kmeans_model_path.
kmeans_clf = kmeans_knn_learner.trainKmeansClf(feature_list)

CPU times: user 18.3 s, sys: 260 ms, total: 18.6 s
Wall time: 5.58 s


In [34]:
# KNN training: fit and checkpoint one KNN classifier per KMeans cluster,
# using only the training samples whose KMeans label equals that cluster.
for i in np.unique(kmeans_clf.labels_):
    print("---------------------------")
    print("KNN Classifier {0}:".format(i))
    # perf_counter is a monotonic, high-resolution clock; time.time() can jump
    # when the system clock is adjusted, which distorts millisecond timings.
    time_start = time.perf_counter()
    cluster_index = (kmeans_clf.labels_ == i)
    knn_clf = kmeans_knn_learner.trainKnnClf(i, feature_list[cluster_index], label_list[cluster_index])
    time_end = time.perf_counter()
    print("Time spent: {0:.2f}ms".format((time_end-time_start)*1000))

---------------------------
KNN Classifier 0:
Time spent: 8.97ms
---------------------------
KNN Classifier 1:
Time spent: 10.21ms
---------------------------
KNN Classifier 2:
Time spent: 8.74ms
---------------------------
KNN Classifier 3:
Time spent: 10.58ms
---------------------------
KNN Classifier 4:
Time spent: 9.25ms
---------------------------
KNN Classifier 5:
Time spent: 10.71ms
---------------------------
KNN Classifier 6:
Time spent: 6.55ms
---------------------------
KNN Classifier 7:
Time spent: 11.11ms
---------------------------
KNN Classifier 8:
Time spent: 8.96ms
---------------------------
KNN Classifier 9:
Time spent: 7.83ms


In [35]:
# predict
# Reload the KMeans model from its checkpoint file rather than reusing the
# in-memory object from training.
kmeans_clf = kmeans_knn_learner.getKmeansClfModel()

# Load the dev sentences and labels from the configured dev file.
dev_sentence, dev_label = readData(kmeans_knn_learner.dev_data_path)

In [36]:
%%time
# dev set feature list: one flattened feature vector per dev sentence,
# in the same order as dev_sentence / dev_label.
dev_feature_list = getFeatureList(dev_sentence)

CPU times: user 23 s, sys: 32 ms, total: 23 s
Wall time: 22.6 s


In [37]:
%%time
# Assign every dev feature vector to a KMeans cluster; the cluster id selects
# which KNN classifier scores that sample in the evaluation cell.
kmeans_predicted_list = kmeans_clf.predict(dev_feature_list)

CPU times: user 19.5 ms, sys: 4.19 ms, total: 23.7 ms
Wall time: 5.65 ms


In [39]:
%%time
# Evaluate on the dev set: each sample is scored by the KNN classifier of the
# KMeans cluster it was assigned to. The value printed as "precision" is
# correct predictions divided by the number of samples.
unique_clusters = np.unique(kmeans_clf.labels_)
correct = 0
for label in unique_clusters:
    index_list = (kmeans_predicted_list==label)
    temp_label_list = dev_label[index_list]
    temp_len = len(temp_label_list)
    if temp_len == 0:
        # No dev sample was routed to this cluster; the ratio below would
        # divide by zero, so report and move on.
        print("------------------------------------------------------")
        print("KNN classifier {0}: no dev samples assigned, skipped".format(label))
        continue
    temp_feature_list = dev_feature_list[index_list]
    knn_clf = kmeans_knn_learner.getKnnClfModel(label)
    knn_predicted_list = knn_clf.predict(temp_feature_list)
    # Vectorised count of matches instead of iterating with the sum builtin.
    temp_correct = int((temp_label_list==knn_predicted_list).sum())
    correct += temp_correct
    temp_precision = temp_correct / temp_len
    print("------------------------------------------------------")
    print("KNN classifier {0} precision: {1}/{2}={3:.2f}".format(label, temp_correct, temp_len, temp_precision))
# Guard the overall ratio against an empty dev set.
precision = correct / len(dev_label) if len(dev_label) else 0.0
print("Dev set precision: {0:.2f}".format(precision))

------------------------------------------------------
KNN classifier 0 precision: 28/175=0.16
------------------------------------------------------
KNN classifier 1 precision: 21/140=0.15
------------------------------------------------------
KNN classifier 2 precision: 18/207=0.09
------------------------------------------------------
KNN classifier 3 precision: 37/216=0.17
------------------------------------------------------
KNN classifier 4 precision: 25/224=0.11
------------------------------------------------------
KNN classifier 5 precision: 50/216=0.23
------------------------------------------------------
KNN classifier 6 precision: 24/148=0.16
------------------------------------------------------
KNN classifier 7 precision: 75/237=0.32
------------------------------------------------------
KNN classifier 8 precision: 25/178=0.14
------------------------------------------------------
KNN classifier 9 precision: 22/178=0.12
Dev set precision: 0.17
CPU times: user 957 ms, sy