In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from extract_v6 import FeatureExtract

In [2]:
def random_mini_batches(X, Y, mini_batch_size = 64):
    """Split X and Y into aligned, consecutive mini-batches.

    Despite the name, nothing is shuffled: batches are cut in the original
    order, so concatenating the batches reproduces X and Y exactly.

    Args:
        X: sliceable sequence of samples (list, ndarray, ...).
        Y: sliceable sequence of labels with the same length as X.
        mini_batch_size: positive number of samples per batch. The final
            batch is smaller when len(X) is not a multiple of it.

    Returns:
        A list of (mini_batch_X, mini_batch_Y) tuples; empty when X is empty.

    Raises:
        ValueError: if mini_batch_size is not positive, or if X and Y differ
            in length (which would yield misaligned batches).
    """
    if mini_batch_size <= 0:
        raise ValueError(
            "mini_batch_size must be positive, got {0}".format(mini_batch_size))

    m = len(X)
    if len(Y) != m:
        raise ValueError(
            "X and Y must have the same length, got {0} and {1}".format(m, len(Y)))

    # Stepping the start index covers the full batches and the shorter
    # trailing batch alike; slicing past the end simply truncates.
    return [
        (X[start:start + mini_batch_size], Y[start:start + mini_batch_size])
        for start in range(0, m, mini_batch_size)
    ]


def readData(data_path, mini_batch_size=1):
    """Load a header-less, tab-separated file and split it into labels and sentences.

    Column 0 is cast to int and used as the labels; column 1 is used as the
    sentences.

    Args:
        data_path: path of the tab-separated file to read.
        mini_batch_size: 1 returns the two full arrays; any other value
            returns the result of random_mini_batches over them.

    Returns:
        (sentence_list, label_list) when mini_batch_size is 1, otherwise a
        list of (sentences, labels) mini-batches.
    """
    table = np.array(pd.read_csv(data_path, sep='\t', header=None))

    sentences = table[:, 1]
    labels = table[:, 0].astype('int')

    if mini_batch_size != 1:
        return random_mini_batches(sentences, labels, mini_batch_size)
    return sentences, labels
    
def getFeatureList(sentence_list, use_batch=False, model=None):
    """Run the feature extractor over sentences and stack the results.

    Args:
        sentence_list: when use_batch is False, an iterable of sentences;
            when use_batch is True, an iterable of mini-batches whose first
            element holds that batch's sentences.
        use_batch: extract one batch per get_features call instead of one
            sentence per call.
        model: object exposing get_features(list_of_sentences). Defaults to
            the notebook-level ``test_model``.

    Returns:
        numpy array of features, in input order. In batch mode the per-batch
        outputs are concatenated along the first axis; otherwise each
        sentence's output is flattened into one row. Empty input yields an
        empty array.
    """
    if model is None:
        # Fall back to the extractor held in the notebook's global namespace.
        model = test_model

    if use_batch:
        # Every batch is handed over as a plain list, and all outputs are
        # joined once at the end rather than re-copied on each iteration.
        batch_features = [
            model.get_features(list(batch[0])) for batch in sentence_list
        ]
        if not batch_features:
            return np.array([])
        return np.concatenate(batch_features)

    return np.array([
        np.ravel(model.get_features([sentence])) for sentence in sentence_list
    ])

In [3]:
# Checkpoint under evaluation; the version string is also used to name the
# saved feature files further down.
model_version = "margin5.0_v6_epoch2"
model_path = "checkpoints/{0}.pth".format(model_version)
train_data_path = "lemonBearData/train_set_v1.txt"
dev_data_path = "lemonBearData/dev_set_v1.txt"

# With the default mini_batch_size=1, readData returns (sentences, labels).
sentence_list, label_list = readData(train_data_path)
dev_sentence, dev_label = readData(dev_data_path)

In [4]:
%%time
# Training and Dev set feature list
test_model = FeatureExtract(checkpoints_path=model_path)
mini_batches  = readData(train_data_path, 64)
feature_list = getFeatureList(mini_batches, True)

Extract Model V6 FeatureOnly
CPU times: user 42min 35s, sys: 5min 30s, total: 48min 6s
Wall time: 10min 28s


In [5]:
%%time
dev_mini_batches  = readData(dev_data_path, 64)
dev_feature_list = getFeatureList(dev_mini_batches, True)

CPU times: user 4min 36s, sys: 21.2 s, total: 4min 58s
Wall time: 1min 4s


In [8]:
# Output locations for the extracted features, tagged with the model version.
train_feature_path = "lemonBearData/train_{0}".format(model_version)
dev_feature_path = "lemonBearData/dev_{0}".format(model_version)

# np.save appends ".npy" to a file name that does not already end with it.
np.save(train_feature_path, feature_list)
np.save(dev_feature_path, dev_feature_list)

In [9]:
%%time
k = 13
knn_clf = KNeighborsClassifier(n_jobs=-1, n_neighbors=k, weights="distance")
knn_clf.fit(feature_list, label_list)
predicted = knn_clf.predict(dev_feature_list)
accuracy = sum(predicted == dev_label)/len(predicted)
print(accuracy)

0.09065994906433396
CPU times: user 43min 31s, sys: 4.38 s, total: 43min 36s
Wall time: 8min 50s


In [10]:
%%time
# For every dev sample, fetch the indices (no distances) of its k nearest
# samples from the data knn_clf was fitted on. Note that `k` is rebound here;
# the cell that reloads the saved file builds its path from this value.
k=20
k_neighbors_list = knn_clf.kneighbors(X=dev_feature_list, n_neighbors=k, return_distance=False)
k_neigbbors_path = "lemonBearData/k_neighbors_list_k_eq_{0}".format(k)
# np.save appends ".npy" to a file name that does not already end with it.
np.save(k_neigbbors_path, k_neighbors_list)

CPU times: user 44min 19s, sys: 4.09 s, total: 44min 23s
Wall time: 8min 49s


In [12]:
# Reload the neighbor indices saved for the current value of `k`.
k_neigbbors_npy_path = "lemonBearData/k_neighbors_list_k_eq_{0}.npy".format(k)
k_neighbors_list = np.load(k_neigbbors_npy_path)
k_neighbors_index_list = k_neighbors_list

# One row per dev sample: column 0 holds its true label, column 1 the list of
# training labels of its retrieved neighbors.
# The (n, 2) object array is filled explicitly because np.array() on rows that
# mix a scalar with a list is a ragged construction, which recent NumPy
# releases reject with a ValueError.
label_k_neighbors_np = np.empty((len(k_neighbors_index_list), 2), dtype=object)
for i, index_list in enumerate(k_neighbors_index_list):
    candidate = list(label_list[index_list.astype('int')])
    label_k_neighbors_np[i, 0] = dev_label[i]
    label_k_neighbors_np[i, 1] = candidate

In [13]:
# Fraction of dev samples whose true label appears among the labels of their
# retrieved neighbors.
true_label_list = label_k_neighbors_np[:,0]
candidate_label_list = label_k_neighbors_np[:,1]
counter = sum(
    1
    for label, candidates in zip(true_label_list, candidate_label_list)
    if label in candidates
)
candidate_accuracy = counter / len(true_label_list)
print("recall:{0}".format(candidate_accuracy))

recall:0.15665485549773003


In [14]:
# Average number of distinct labels among each dev sample's candidate list.
counter = sum(len(np.unique(candidates)) for candidates in candidate_label_list)
avg_user_counter = counter / len(candidate_label_list)
print(avg_user_counter)

18.53161333185694
