In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Notebook configuration: neighbour count and input file locations.
# Nothing is loaded here; the cells below read these files.

# Number of nearest neighbours retrieved for every dev sentence.
k = 1000

# Tab-separated dataset files (read later as: column 0 = label, column 1 = sentence).
train_set_path = "../data/v1_5/train_set_v1_5.txt"
dev_set_path = "../data/v1_5/dev_set_v1_5.txt"

# Embedding feature arrays saved in NumPy .npy format.
train_feature_path = "train_feature.npy"
dev_feature_path = "dev_feature.npy"

In [None]:
%%time
# Read the labelled sentences and the saved feature arrays for both splits.

# Training split: feature matrix plus the label / sentence columns of the TSV file.
train_feature = np.load(train_feature_path)
train_set = pd.read_csv(train_set_path, header=None, sep='\t').values
train_label = train_set[:, 0].astype('int')
train_sentence = train_set[:, 1]

# Dev split, stored under the generic "split_*" names used by the following cells.
split_set = pd.read_csv(dev_set_path, header=None, sep='\t').values
split_label = split_set[:, 0].astype('int')
split_sentence = split_set[:, 1]
print("[SUCCESS] Loaded data")

# Dev features are loaded after the message above, as in the original cell.
split_feature = np.load(dev_feature_path)

In [None]:
%%time
# Build a distance-weighted k-nearest-neighbour classifier on the training
# features, using every available core (n_jobs=-1).
print("[INFO] Fit knn model")
knn_clf = KNeighborsClassifier(n_neighbors=k, weights="distance", n_jobs=-1)
knn_clf.fit(X=train_feature, y=train_label)
print("[SUCCESS] Successfully fitted knn model")

In [None]:
%%time
# Query the fitted classifier for the neighbours of every dev feature vector.
# Only the neighbour indices are requested (return_distance=False).
print("[INFO] Finding kneighbors")
k_neighbors_list = knn_clf.kneighbors(split_feature, return_distance=False)
print("[SUCCESS] Successfully Finded kneighbors")

In [10]:
def getSplitPairDataFrame(true_sentence_list,
                          candidate_sentence_list,
                          candidate_label_true_false,
                          true_label_list,
                          candidate_label_list):
    """Assemble five parallel sequences into one pair table.

    Each argument becomes one column of the returned DataFrame:

    * ``true_sentence``      <- true_sentence_list
    * ``candidate_sentence`` <- candidate_sentence_list
    * ``pair_result``        <- candidate_label_true_false
    * ``true_label``         <- true_label_list
    * ``candidate_label``    <- candidate_label_list

    Returns:
        pandas.DataFrame with the columns listed above.
    """
    columns = {
        'true_sentence': true_sentence_list,
        'candidate_sentence': candidate_sentence_list,
        'pair_result': candidate_label_true_false,
        'true_label': true_label_list,
        'candidate_label': candidate_label_list,
    }
    return pd.DataFrame(data=columns)

In [None]:
%%time
# Write one row per (dev sentence, retrieved neighbour) pair to output_pair_file_path.
output_pair_file_path = "split_pair_dev_k_{0}.txt".format(k)

# Flatten the neighbour index matrix and look up each neighbour's sentence and label.
candidate_index_list = k_neighbors_list.ravel()
candidate_sentence_list = train_sentence[candidate_index_list]
candidate_label_list = train_label[candidate_index_list]

# Repeat every dev entry k times so it lines up with its k flattened neighbours.
true_sentence_list = split_sentence.repeat(k)
true_label_list = split_label.repeat(k)

# 1 where the neighbour's label equals the dev sentence's label, otherwise 0.
candidate_label_true_false = (candidate_label_list == true_label_list).astype('int')

split_data = getSplitPairDataFrame(
    true_sentence_list=true_sentence_list,
    candidate_sentence_list=candidate_sentence_list,
    candidate_label_true_false=candidate_label_true_false,
    true_label_list=true_label_list,
    candidate_label_list=candidate_label_list,
)
split_data.to_csv(output_pair_file_path, index=False, sep='\t')

In [5]:
# Reload a previously saved pair table (tab-separated, with a header row).
# k is set again so this part of the notebook can run without the KNN cells above.
k = 1000
split_pair_template = "../data/split_600w_2epoch/split_pair_dev_k_600w_2epoch_{0}.txt"
split_pair_path = split_pair_template.format(k)
split_data = pd.read_csv(split_pair_path, sep='\t')

In [6]:
%%time
# Fold every column of the flat pair table back into a matrix with one row per
# dev sentence and one column per retrieved neighbour (k per sentence).
total_len = len(split_data)
unique_len = total_len // k
original_shape = (unique_len, k)

true_sentence = np.array(split_data["true_sentence"]).reshape(original_shape)
candidate_sentence = np.array(split_data["candidate_sentence"]).reshape(original_shape)
pair_result = np.array(split_data["pair_result"]).reshape(original_shape)
true_label = np.array(split_data["true_label"]).reshape(original_shape)
candidate_label = np.array(split_data["candidate_label"]).reshape(original_shape)

CPU times: user 1.58 s, sys: 312 ms, total: 1.89 s
Wall time: 1.47 s


In [12]:
# Recall of the KNN retrieval at k_top (k_top <= k): the fraction of dev sentences
# that have at least one matching-label neighbour among their first k_top columns.
k_top = 100
print("Recall of knn k_top = {0}".format(k_top))
pair_result[:, :k_top].max(axis=1).mean()

Recall of knn k_top = 100


0.49361702127659574

In [13]:
# Save the first k_top neighbours of every dev sentence as a flat pair file.
# (Per the original note, this file is the input for the probability computation.)
k_top_pair_file_path = "../data/split_600w_2epoch/k_top_pair_file_{0}_of_{1}.csv".format(k_top, k)

k_top_split_data = getSplitPairDataFrame(
    true_sentence_list=true_sentence[:, :k_top].ravel(),
    candidate_sentence_list=candidate_sentence[:, :k_top].ravel(),
    candidate_label_true_false=pair_result[:, :k_top].ravel(),
    true_label_list=true_label[:, :k_top].ravel(),
    candidate_label_list=candidate_label[:, :k_top].ravel(),
)
k_top_split_data.to_csv(k_top_pair_file_path, index=False, sep='\t')

In [None]:
# Load the probabilities for the k_top candidates taken from the k KNN neighbours.
# Per the original note, the k_top probabilities are computed by the first model.
probability_file_path = "../data/split_600w_2epoch/dev_results_k_{0}_of_k_{1}.csv".format(k_top, k)

# The file has no header row; the second column holds the probability values.
probability_file = pd.read_csv(probability_file_path, sep='\t',header=None).iloc[:,1]
# One row per dev sentence. The column count is inferred from the file instead of
# being forced to k, so a results file with k_top scores per sentence reshapes
# cleanly; a file with k scores per sentence still yields the (unique_len, k) shape.
probability_mat = np.array(probability_file).reshape(unique_len, -1)

In [None]:
def getSortedByProbability(original_shape, candidate_mat, probability_mat):
    """Reorder every row of ``candidate_mat`` by descending probability.

    Args:
        original_shape: ``(n_rows, n_cols)`` of the matrices to return. Only the
            first ``n_rows`` rows of the inputs are used.
        candidate_mat: 2-D array of candidates (labels, 0/1 pair results,
            sentences, ...) with at least ``n_cols`` columns. Its dtype is kept,
            so non-integer candidates are neither truncated nor rejected.
        probability_mat: 2-D array whose first ``n_rows`` rows have exactly
            ``n_cols`` columns; entry ``[i, j]`` scores ``candidate_mat[i, j]``.

    Returns:
        ``(sorted_candidate_mat, sorted_probability_mat)``, both of shape
        ``original_shape``; the probabilities are returned as floats.

    Raises:
        ValueError: if the inputs do not fit ``original_shape``.
    """
    shape = tuple(original_shape)
    n_rows = shape[0]
    probs = np.asarray(probability_mat)[:n_rows]
    cands = np.asarray(candidate_mat)[:n_rows]
    if probs.shape != shape:
        raise ValueError(
            "probability_mat has shape {0}, expected {1}".format(probs.shape, shape))
    if cands.ndim != 2 or cands.shape[0] != n_rows or cands.shape[1] < shape[1]:
        raise ValueError(
            "candidate_mat has shape {0}, which does not cover {1}".format(cands.shape, shape))

    # Same ordering rule as a per-row `argsort()[::-1]`, applied to all rows at once.
    order = np.argsort(probs, axis=1)[:, ::-1]
    # Column vector of row numbers; broadcasts against `order` to pick row-wise.
    rows = np.arange(n_rows)[:, None]
    sorted_candidate_mat = cands[rows, order]
    sorted_probability_mat = probs[rows, order].astype(float)
    return (sorted_candidate_mat, sorted_probability_mat)

In [None]:
%%time
# Sort the candidate labels of every dev sentence by probability, descending.
# The sorting helper defined in this notebook is getSortedByProbability; the name
# previously called here (getSortedCandidateLabel) is not defined anywhere.
# The shape is taken from probability_mat itself so the outputs always match it.
sorted_candidate_label, sorted_probability_mat = \
        getSortedByProbability(probability_mat.shape,
                               candidate_label,
                               probability_mat)

In [None]:
def getFilteredLabel(weights, label_list):
    """Return the label whose weights add up to the largest total.

    Args:
        weights: array of weights, indexable with a boolean mask.
        label_list: labels aligned element-wise with ``weights``.

    Returns:
        The distinct label from ``label_list`` with the highest summed weight
        (the last entry of the ascending argsort of the per-label totals).
    """
    distinct_labels = np.unique(label_list)
    # One total per distinct label, in the sorted order produced by np.unique.
    totals = np.zeros(distinct_labels.shape)
    for position in range(len(distinct_labels)):
        members = (distinct_labels[position] == label_list)
        totals[position] = sum(weights[members])
    best_position = totals.argsort()[-1]
    return distinct_labels[best_position]

def getWeightedLabel(weights, labels, threshold, filter=True):
    """Choose a label by weighted vote, optionally ignoring low weights.

    Args:
        weights: array of weights aligned with ``labels``.
        labels: array of candidate labels.
        threshold: weights must be strictly greater than this to take part in
            the vote when ``filter`` is truthy.
        filter: when falsy, every (weight, label) pair votes and ``threshold``
            is ignored.

    Returns:
        With filtering on: ``labels[0]`` if ``weights[0] <= threshold``,
        otherwise the getFilteredLabel winner among entries above the threshold.
        With filtering off: the getFilteredLabel winner among all entries.
    """
    if not filter:
        return getFilteredLabel(weights, labels)

    # Guard: the first weight does not clear the threshold, so fall back to the first label.
    if weights[0] <= threshold:
        return labels[0]

    above_threshold = (weights > threshold)
    return getFilteredLabel(weights[above_threshold], labels[above_threshold])

def computeAccuracy(predicted, true_label):
    """Print the accuracy of ``predicted`` against ``true_label``.

    The score comes from sklearn's ``accuracy_score`` and is printed as
    ``accuracy: <value>``; nothing is returned.
    """
    print("accuracy: {0}".format(accuracy_score(true_label, predicted)))
    
def getOutput(mat, index, top):
    """Flatten the first ``top`` columns of the rows of ``mat`` picked by ``index``.

    Args:
        mat: 2-D array.
        index: row selector applied to the first axis of ``mat``.
        top: number of leading columns to keep.

    Returns:
        1-D array with the selected entries in row-major order.
    """
    picked = mat[index, :top]
    return picked.reshape(-1)

In [None]:
%%time
# Predict one label per dev sentence by weighted vote over its "top"
# highest-probability candidates, then print the accuracy of those predictions.
top = 20
# Passed through for completeness; it has no effect here because filtering is off (False).
probability_threshold = 0.5

voted_labels = [
    getWeightedLabel(sorted_probability_mat[row, :top],
                     sorted_candidate_label[row, :top],
                     probability_threshold,
                     False)
    for row in range(unique_len)
]
# Column vector of integer predictions, one per dev sentence.
new_candidate_list = np.array(voted_labels, dtype='int').reshape(unique_len, 1)
computeAccuracy(new_candidate_list, true_label[:,0])

In [None]:
%%time
# Store the positive pairs (recall pairs): dev sentences whose true label appears
# among their "top" candidates once candidates are ranked by descending probability.
# The file goes under ../data like every other path in this notebook.
# NOTE(review): the last format argument is `top`, so the "k_{2}" part of the file
# name repeats `top` rather than `k`. Left as is because the results file read
# afterwards is named the same way -- confirm before renaming.
output_positive_pair_file_path = "../data/split_600w_2epoch/positive_pair_dev_600w_2epoch_top_{0}_from_k_top_{1}_k_{2}_.txt".format(top, k_top, top)

# Rank the pair results and candidate sentences with the same rule used for the
# candidate labels: per-row argsort of the probabilities, reversed to descending.
sort_order = np.argsort(probability_mat, axis=1)[:, ::-1]
row_index = np.arange(sort_order.shape[0])[:, None]  # broadcasts against sort_order
sorted_pair_result = pair_result[row_index, sort_order]
sorted_candidate_sentence = candidate_sentence[row_index, sort_order]

# Boolean mask over dev sentences: True when any of the top-ranked pairs is a match.
positive_index = (np.max(sorted_pair_result[:,:top], axis=1) == 1)
output_true_sentence = getOutput(true_sentence, positive_index, top)
output_candidate_sentence = getOutput(sorted_candidate_sentence, positive_index, top)
output_pair_result = getOutput(sorted_pair_result, positive_index, top)
output_true_label = getOutput(true_label, positive_index, top)
output_candidate_label = getOutput(sorted_candidate_label, positive_index, top)
positive_pair_df = getSplitPairDataFrame(output_true_sentence,
                                         output_candidate_sentence,
                                         output_pair_result,
                                         output_true_label,
                                         output_candidate_label)
positive_pair_df.to_csv(output_positive_pair_file_path, sep='\t', index=False)

In [None]:
# Fold the positive pair table back into one row per positive dev sentence.
positive_total_len = len(positive_pair_df)
# The table holds `top` rows for each positive sentence only, so the row count must
# come from the table itself, not from the full dev set (total_len / unique_len).
positive_unique_len = positive_total_len // top
positive_original_shape = (positive_unique_len, top)
positive_true_label = np.array(positive_pair_df.true_label).reshape(positive_original_shape)
positive_pair_result = np.array(positive_pair_df.pair_result).reshape(positive_original_shape)

In [None]:
# Probabilities for the positive pairs, read from a results file produced outside
# this notebook (the original note attributes them to "the first model").
positive_probability_template = "../data/split_600w_2epoch/positive_pair_dev_results_600w_2epoch_top_{0}_from_k_top_{1}_k_{2}_.txt"
positive_probability_file_path = positive_probability_template.format(top, k_top, top)

# The file has no header row; the second column holds the values used below.
positive_probability_table = pd.read_csv(positive_probability_file_path, header=None, sep='\t')
positive_probability_file = positive_probability_table.iloc[:, 1]
positive_probability_mat = np.array(positive_probability_file).reshape(positive_original_shape)

In [None]:
%%time
# Sort the 0/1 pair results of every positive sentence by probability, descending.
# The sorting helper defined in this notebook is getSortedByProbability; the name
# previously called here (getSortedCandidateLabel) is not defined anywhere.
sorted_positive_pair_result, sorted_positive_probability_mat = \
        getSortedByProbability(positive_original_shape,
                               positive_pair_result,
                               positive_probability_mat)

In [None]:
# Share of positive sentences with a matching pair among their first
# accuracy_top probability-ranked candidates.
accuracy_top = 1
print("Accuracy: ")
sorted_positive_pair_result[:, :accuracy_top].max(axis=1).mean()