In [1]:
import time
import joblib
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from feature_extract_inference import FeatureExtract

class KmeansKnnModel:
    """KMeans + per-cluster KNN model wrapper with joblib checkpointing.

    A KMeans model is fitted on the training features, and one
    KNeighborsClassifier is trained per cluster. Every fitted model is
    persisted with ``joblib.dump`` under ``checkpoints/``.

    Args:
        model_version: Tag embedded in every checkpoint file name.
        num_of_clusters: ``n_clusters`` passed to KMeans.
        num_of_k_neibors: ``n_neighbors`` passed to KNeighborsClassifier.
        train_data_path: Path of the training data file.
        dev_data_path: Path of the dev data file.

    The checkpoint paths are derived from the *current* values of
    ``model_version`` / ``num_of_clusters`` each time they are read, so
    changing ``num_of_clusters`` after construction no longer makes every
    KMeans model overwrite the file named after the initial cluster count.
    Assigning to ``kmeans_model_path`` / ``knn_model_path`` pins an explicit
    path instead.
    """

    def __init__(self,
                 model_version = "v2_best",
                 num_of_clusters = 30,
                 num_of_k_neibors = 20,
                 train_data_path = "data/v1_1/train_set_v1_1_more_than_55.txt",
                 dev_data_path = "data/v1_1/dev_set_v1_1_more_than_55.txt"
                ):
        self.model_version = model_version
        self.num_of_clusters = num_of_clusters
        self.num_of_k_neibors = num_of_k_neibors
        # None means "derive the path from the current hyperparameters".
        self._kmeans_model_path_override = None
        self._knn_model_path_override = None
        self.train_data_path = train_data_path
        self.dev_data_path = dev_data_path

    @property
    def kmeans_model_path(self):
        """Checkpoint path of the KMeans model for the current settings."""
        override = getattr(self, "_kmeans_model_path_override", None)
        if override is not None:
            return override
        return "checkpoints/kmeans_model_{0}_clusters_{1}.m".format(
            self.model_version, self.num_of_clusters)

    @kmeans_model_path.setter
    def kmeans_model_path(self, value):
        self._kmeans_model_path_override = value

    @property
    def knn_model_path(self):
        """Path prefix shared by all per-cluster KNN checkpoints."""
        override = getattr(self, "_knn_model_path_override", None)
        if override is not None:
            return override
        return "checkpoints/knn_model_{0}".format(self.model_version)

    @knn_model_path.setter
    def knn_model_path(self, value):
        self._knn_model_path_override = value

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain ``path`` if it is missing."""
        import os
        parent = os.path.dirname(path)
        if parent:  # a bare file name has no parent directory to create
            os.makedirs(parent, exist_ok=True)

    def _knn_clf_path(self, clf_number):
        """Return the checkpoint path of the KNN classifier for one cluster."""
        return "{0}_cluster_{1}.m".format(self.knn_model_path, clf_number)

    def trainKmeansClf(self, train_data):
        """Fit KMeans on ``train_data``, save it, and return the fitted model."""
        kmeans_clf = KMeans(n_clusters=self.num_of_clusters).fit(train_data)
        path = self.kmeans_model_path
        self._ensure_parent_dir(path)
        joblib.dump(kmeans_clf, path)
        return kmeans_clf

    def getKmeansClfModel(self):
        """Load the KMeans model saved for the current settings."""
        # NOTE(review): joblib.load unpickles the file; only load checkpoints
        # produced by this project.
        return joblib.load(self.kmeans_model_path)

    def trainKnnClf(self, clf_number, X, y):
        """Fit a KNN classifier for cluster ``clf_number``, save and return it."""
        knn_clf = KNeighborsClassifier(n_neighbors=self.num_of_k_neibors)
        knn_clf.fit(X, y)
        clf_path = self._knn_clf_path(clf_number)
        self._ensure_parent_dir(clf_path)
        joblib.dump(knn_clf, clf_path)
        return knn_clf

    def getKnnClfModel(self, clf_number):
        """Load the KNN classifier saved for cluster ``clf_number``."""
        return joblib.load(self._knn_clf_path(clf_number))

In [2]:
def random_mini_batches(X, Y, mini_batch_size = 64):
    """Split ``X`` and ``Y`` into consecutive, aligned mini-batches.

    Despite the name, no shuffling is performed: batches follow the input
    order, so concatenating them reproduces the original sequences.

    Args:
        X: Sliceable sequence of samples.
        Y: Sliceable sequence of labels, same length as ``X``.
        mini_batch_size: Positive number of samples per batch. The last
            batch is shorter when ``len(X)`` is not a multiple of it.

    Returns:
        A list of ``(X_slice, Y_slice)`` tuples; empty when ``X`` is empty.

    Raises:
        ValueError: If ``mini_batch_size`` is not positive, or ``X`` and
            ``Y`` differ in length.
    """
    if mini_batch_size <= 0:
        # A negative size used to yield a single bogus tail batch and zero
        # raised ZeroDivisionError; reject both explicitly.
        raise ValueError(
            "mini_batch_size must be positive, got {0}".format(mini_batch_size))
    m = len(X)
    if len(Y) != m:
        raise ValueError(
            "X and Y must have the same length, got {0} and {1}".format(m, len(Y)))

    # Stepping by the batch size covers the complete batches and the shorter
    # remainder in one pass, without float division.
    return [
        (X[start:start + mini_batch_size], Y[start:start + mini_batch_size])
        for start in range(0, m, mini_batch_size)
    ]

In [18]:
def readData(data_path, mini_batch_size=1):
    """Read a tab-separated ``label<TAB>sentence`` file.

    Each non-blank line is stripped and split on tabs; the first field is the
    label and the second is the sentence (any further fields are ignored).
    Blank lines, such as a trailing empty line, are skipped.

    Args:
        data_path: Path of the data file.
        mini_batch_size: ``1`` returns the whole data set unbatched; any
            other value returns mini-batches of that size.

    Returns:
        ``(sentence_list, label_array)`` when ``mini_batch_size == 1``, where
        ``label_array`` is a numpy array; otherwise the list of
        ``(sentences, labels)`` batches built by ``random_mini_batches``.

    Raises:
        IndexError: If a non-blank line has no tab-separated second field.
    """
    label_list = []
    sentence_list = []
    # NOTE(review): assumes the data files are UTF-8; an explicit encoding
    # avoids depending on the platform default - confirm against the data.
    with open(data_path, encoding="utf-8") as data_file:
        # Iterate lazily instead of materializing every line with readlines().
        for raw_line in data_file:
            fields = raw_line.strip().split("\t")
            if fields == [""]:
                # Blank line: previously crashed with IndexError on fields[1].
                continue
            label, sentence = fields[0], fields[1]
            label_list.append(label)
            sentence_list.append(sentence)

    if mini_batch_size == 1:
        return sentence_list, np.array(label_list)
    return random_mini_batches(sentence_list, label_list, mini_batch_size)

In [4]:
# Build the model wrapper with its default hyperparameters and data paths,
# then load the training set unbatched: a list of sentences plus a numpy
# array of labels (readData's default mini_batch_size=1 path).
kmeans_knn_learner = KmeansKnnModel()
sentence_list, label_list = readData(kmeans_knn_learner.train_data_path)

In [5]:
# Feature extractor instance; getFeatureList reads this module-level name and
# calls its get_features() method.
test_model = FeatureExtract()

In [34]:
def getFeatureList(sentence_list, use_batch=False):
    """Extract features with the module-level ``test_model``.

    Args:
        sentence_list: When ``use_batch`` is False, an iterable of sentences.
            When ``use_batch`` is True, a sequence of batches whose first
            element is the batch's sentences (e.g. the ``(sentences, labels)``
            tuples returned by ``readData`` in batched mode).
        use_batch: Select batched extraction (one ``get_features`` call per
            batch) instead of one call per sentence.

    Returns:
        A numpy array of features. In batched mode the per-batch results are
        concatenated along the first axis; in per-sentence mode each result
        is flattened with ``np.ravel`` and the rows are stacked. Empty input
        yields an empty array in both modes.
    """
    if use_batch:
        # Collect per-batch results and concatenate once; concatenating inside
        # the loop re-copies the accumulated array on every iteration.
        batch_features = [
            test_model.get_features(list(batch[0])) for batch in sentence_list
        ]
        if not batch_features:
            # np.concatenate rejects an empty list; mirror the per-sentence
            # mode, which returns an empty array for empty input.
            return np.array([])
        return np.concatenate(batch_features)

    sentence_features = [
        np.ravel(test_model.get_features([sentence]))
        for sentence in sentence_list
    ]
    return np.array(sentence_features)

In [37]:
%%time
# Re-read the training set as mini-batches of 256 (sentences, labels) pairs
# and extract features one batch at a time (use_batch=True).
mini_batches  = readData(kmeans_knn_learner.train_data_path, 256)
feature_list = getFeatureList(mini_batches, True)

CPU times: user 7min 48s, sys: 38.1 s, total: 8min 26s
Wall time: 2min 42s


In [24]:
# Inspect the first mini-batch: a (sentences, labels) tuple, displayed as a
# two-element list. This renders every sentence in the batch.
list(mini_batches[0])

[['Get paid immediately! Pay member to member! Over $1 million paid out! Take FREE tour for details!',
  "Found on Google: Daniel Radcliffe is ready to move on from his 'Harry Potter' days. - New York ..",
  'criminal lawyer,dui lawyer,criminal defense attorney',
  'Saw "Blind Side" - John Lee Hancock really services this type of story well as he did with "The Rookie." Nicely done. Simple, effective.',
  'he sure is!! Punks!',
  '#FollowFriday @handle @handle @handle @handle @handle @handle',
  "UTEP Prof. offers perspective of #border's #drug war in discussion of book release: see article",
  'You are my freaking HERO. I have wanted to read that script for years.',
  "haha i'm down",
  "Guess what I'm having for lunch.....TACOS!!!!!....and I'm serious about it!!!!",
  "send us an email at with details of what you're looking for and check the site www.custompieces.com",
  'Westwood at career best in golf rankings',
  '#MM - Bill 4 President Mixtape',
  'All Star Entrepreneurs Offer Adv

In [9]:
%%time
# Training set feature list
# (per-sentence mode: one get_features call per sentence, use_batch=False)
feature_list = getFeatureList(sentence_list)
# Dev set: load sentences and labels unbatched, then extract features the same way.
dev_sentence, dev_label = readData(kmeans_knn_learner.dev_data_path)
dev_feature_list = getFeatureList(dev_sentence)

CPU times: user 21min 9s, sys: 800 ms, total: 21min 10s
Wall time: 21min 10s


In [10]:
%%time
# KMeans training
clusters_list = [1,2,3,4,5,6,7,8,9,10]
neibors_list = [1,2,3,4,5]
for num_of_clusters in clusters_list:
    kmeans_knn_learner.num_of_clusters = num_of_clusters
    kmeans_clf = kmeans_knn_learner.trainKmeansClf(feature_list)
    for num_of_k_neibors in neibors_list:
        kmeans_knn_learner.num_of_k_neibors = num_of_k_neibors
        # KNN training
        for i in np.unique(kmeans_clf.labels_):
            cluster_index = (kmeans_clf.labels_ == i)
            knn_clf = kmeans_knn_learner.trainKnnClf(i, feature_list[cluster_index], label_list[cluster_index])
        
        # Predicting
        kmeans_predicted_list = kmeans_clf.predict(dev_feature_list)
        correct = 0
        for label in np.unique(kmeans_clf.labels_):
            index_list = (kmeans_predicted_list==label)
            temp_feature_list = dev_feature_list[index_list]
            temp_label_list = dev_label[index_list]
            knn_clf = kmeans_knn_learner.getKnnClfModel(label)
            knn_predicted_list = knn_clf.predict(temp_feature_list)
            temp_correct = sum(temp_label_list==knn_predicted_list)
            temp_len = len(temp_label_list)
            correct += temp_correct
            temp_precision = temp_correct / temp_len
            #print("------------------------------------------------------")
            #print("KNN classifier {0} precision: {1}/{2}={3:.3f}".format(label, temp_correct, temp_len, temp_precision))
        precision = correct / len(dev_label)
        print("-----------------------------------------------------------")
        print("num_of_clusters: {0}".format(num_of_clusters))
        print("num_of_k_neibors: {0}".format(num_of_k_neibors))
        print("Dev set precision: {0}/{1}={2:.3f}".format(correct ,len(dev_label), precision))

-----------------------------------------------------------
num_of_clusters: 1
num_of_k_neibors: 1
Dev set precision: 1325/11135=0.119
-----------------------------------------------------------
num_of_clusters: 1
num_of_k_neibors: 2
Dev set precision: 1121/11135=0.101
-----------------------------------------------------------
num_of_clusters: 1
num_of_k_neibors: 3
Dev set precision: 1056/11135=0.095
-----------------------------------------------------------
num_of_clusters: 1
num_of_k_neibors: 4
Dev set precision: 1041/11135=0.093
-----------------------------------------------------------
num_of_clusters: 1
num_of_k_neibors: 5
Dev set precision: 1027/11135=0.092
-----------------------------------------------------------
num_of_clusters: 2
num_of_k_neibors: 1
Dev set precision: 1332/11135=0.120
-----------------------------------------------------------
num_of_clusters: 2
num_of_k_neibors: 2
Dev set precision: 1116/11135=0.100
-------------------------------------------------------