# Supervised Retrieval

In this notebook we use the supervised classification model for a supervised crosslingual information retrieval task.

In [24]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV

## I. Import Data

In this section we import the feature dataframe for the retrieval task.

In [2]:
feature_dataframe=pd.read_json("../data/processed/feature_dataframe.json")
feature_retrieval=pd.read_json("../data/processed/feature_retrieval_reduced.json")

In [42]:
feature_dataframe=feature_dataframe.sample(frac=1)
feature_dataframe

Unnamed: 0,source_id,target_id,number_punctuations_total_difference,number_punctuations_total_difference_relative,number_punctuations_total_difference_normalized,number_words_difference,number_words_difference_relative,number_!_difference,number_!_difference_relative,number_!_difference_normalized,...,pca_embeddding_average_diff_2,pca_embeddding_tf_idf_diff_2,pca_embeddding_average_diff_3,pca_embeddding_average_diff_4,pca_embeddding_average_diff_5,pca_embeddding_average_diff_6,pca_embeddding_average_diff_7,pca_embeddding_average_diff_8,pca_embeddding_average_diff_9,Translation
41255,2125,12982,4,1.000000,-0.060606,7,0.388889,0,0,0.0,...,-0.002556,-0.003820,0.026419,-0.070108,-0.149395,-0.018912,-0.021365,0.080145,0.004029,0
197200,17720,17676,-1,-0.500000,0.114035,8,0.800000,0,0,0.0,...,-0.023285,0.006690,0.105190,-0.097305,-0.077884,-0.043635,-0.029408,0.073068,0.022291,0
136979,11697,3888,1,0.000000,-0.076923,3,0.333333,0,0,0.0,...,-0.076853,-0.019355,-0.012643,-0.072363,-0.118291,-0.101588,0.037155,0.041971,-0.072246,0
144434,12443,4544,-1,-0.500000,0.071429,1,0.083333,0,0,0.0,...,-0.068608,-0.019808,0.079789,-0.067451,-0.050441,-0.041185,-0.055180,0.056033,-0.049097,0
198556,17855,12905,2,0.500000,-0.089286,-6,-0.214286,0,0,0.0,...,-0.041669,-0.009074,0.080245,-0.024273,-0.121719,-0.066690,0.070969,0.026449,-0.062150,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
869,869,869,0,0.000000,-0.011111,-1,-0.111111,0,0,0.0,...,-0.048272,-0.014800,-0.025744,-0.028717,-0.127420,-0.226481,0.072937,0.083777,-0.144790,1
167865,14786,9390,6,0.000000,-0.206897,6,0.352941,0,0,0.0,...,-0.042361,-0.006038,0.073134,-0.029651,-0.077821,-0.008269,-0.008337,0.076887,-0.018930,0
167701,14770,426,4,0.000000,-0.190476,1,0.062500,0,0,0.0,...,-0.047286,-0.010079,0.042496,-0.011377,-0.078073,-0.019256,-0.043349,0.032975,0.014765,0
6242,6242,6242,1,0.500000,-0.032353,2,0.133333,0,0,0.0,...,-0.035501,-0.007600,0.077813,-0.086015,-0.043991,-0.001368,0.003086,0.074925,0.023217,1


In [43]:
feature_retrieval

Unnamed: 0,source_id,target_id,number_punctuations_total_difference,number_punctuations_total_difference_relative,number_punctuations_total_difference_normalized,number_words_difference,number_words_difference_relative,number_!_difference,number_!_difference_relative,number_!_difference_normalized,...,pca_embeddding_average_diff_2,pca_embeddding_tf_idf_diff_2,pca_embeddding_average_diff_3,pca_embeddding_average_diff_4,pca_embeddding_average_diff_5,pca_embeddding_average_diff_6,pca_embeddding_average_diff_7,pca_embeddding_average_diff_8,pca_embeddding_average_diff_9,Translation
0,20000,20000,1,0.333333,-0.053913,-3,-0.136364,0,0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,20000,20001,1,0.333333,-0.130000,-10,-0.454545,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,20000,20002,-3,-1.000000,0.120000,-19,-0.863636,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,20000,20003,-3,-1.000000,0.120000,-18,-0.818182,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,20000,20004,-2,-0.666667,-0.005000,-15,-0.681818,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,20099,24995,4,0.000000,-0.137931,19,3.166667,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
499996,20099,24996,3,0.000000,-0.176471,8,1.333333,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
499997,20099,24997,2,0.000000,-0.125000,8,1.333333,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
499998,20099,24998,3,0.000000,-0.093750,23,3.833333,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0


## II. Supervised Retrieval

### First iteration after dropping correlated features from analyes

In [44]:
import pickle 
file = open("../data/processed/correlated_features.pkl",'rb')
correlated_features = pickle.load(file)
file.close()
correlated_features

{'characters_avg_difference_relative',
 'cosine_similarity_tf_idf',
 'euclidean_distance_tf_idf',
 'jaccard_translation_target',
 'number_#_difference_normalized',
 'number_#_difference_relative',
 'number_$_difference_normalized',
 'number_$_difference_relative',
 'number_%_difference_normalized',
 'number_&_difference_normalized',
 "number_'_difference_normalized",
 "number_'_difference_relative",
 'number_)_difference',
 'number_)_difference_normalized',
 'number_)_difference_relative',
 'number_+_difference_normalized',
 'number_+_difference_relative',
 'number_,_difference',
 'number_,_difference_relative',
 'number_-_difference_normalized',
 'number_._difference_relative',
 'number_/_difference_normalized',
 'number_NOUN_difference',
 'number_[_difference_normalized',
 'number_[_difference_relative',
 'number_]_difference',
 'number_]_difference_normalized',
 'number_]_difference_relative',
 'number_characters_difference',
 'number_characters_difference_relative',
 'number_stopwo

In [45]:
feature_dataframe=feature_dataframe.drop(columns=correlated_features)
feature_dataframe

KeyError: '[\'jaccard_translation_target\' \'number_._difference_relative\'\n \'characters_avg_difference_relative\' \'number_%_difference_normalized\'\n \'number_[_difference_relative\' \'pca_embeddding_tf_idf_diff_3\'\n \'number_,_difference\' \'pca_embeddding_tf_idf_diff_4\'\n "number_\'_difference_relative" \'number_words_difference_normalized\'\n \'number_)_difference_normalized\' \'number_]_difference_relative\'\n \'number_/_difference_normalized\' \'pca_embeddding_tf_idf_diff_8\'\n "number_\'_difference_normalized" \'number_+_difference_relative\'\n \'number_[_difference_normalized\' \'number_stopwords_difference\'\n \'pca_embeddding_tf_idf_diff_7\' \'pca_embeddding_tf_idf_diff_9\'\n \'number_,_difference_relative\' \'number_characters_difference_relative\'\n \'number_#_difference_normalized\' \'number_-_difference_normalized\'\n \'number_characters_difference\'\n \'number_unique_words_difference_normalized\'\n \'number_unique_words_difference_relative\' \'cosine_similarity_tf_idf\'\n \'number_)_difference\' \'number_#_difference_relative\'\n \'number_unique_words_difference\' \'pca_embeddding_tf_idf_diff_5\'\n \'pca_embeddding_tf_idf_diff_6\' \'number_&_difference_normalized\'\n \'number_$_difference_relative\' \'number_+_difference_normalized\'\n \'number_]_difference\' \'number_$_difference_normalized\'\n \'euclidean_distance_tf_idf\' \'number_)_difference_relative\'\n \'pca_embeddding_tf_idf_diff_1\' \'number_]_difference_normalized\'\n \'number_NOUN_difference\'] not found in axis'

In [46]:
feature_dataframe=feature_dataframe.drop(columns=['word_mover_distance'])

KeyError: "['word_mover_distance'] not found in axis"

In [47]:
feature_dataframe.columns.difference(feature_retrieval.columns)

Index([], dtype='object')

#### drop the target label and the indexes for training and testing

In [128]:
target_train=feature_dataframe['Translation'].astype(float)
data_train=feature_dataframe.drop(columns=['Translation','source_id','target_id'])
target_test=feature_retrieval['Translation'].astype(float)
data_test=feature_retrieval.drop(columns=['Translation','source_id','target_id'])

In [86]:
# #scale data into [0,1]
# scaler = preprocessing.MinMaxScaler()
# data_train[data_train.columns] = scaler.fit_transform(data_train[data_train.columns])
# data_test[data_test.columns] = scaler.fit_transform(data_test[data_test.columns])

# Naive Bayes

In [120]:
nb = GaussianNB().fit(data_train, target_train)
prediction = nb.predict(data_test)
acc = accuracy_score(target_test,prediction) 
f1= f1_score(target_test,prediction) 
pr= precision_score(target_test,prediction) 
re= recall_score(target_test,prediction)
ll=log_loss(target_test,prediction)
print("The Accuracy on test set: {:.4f}".format(acc))
print("The F1-Score on test set: {:.4f}".format(f1))
print("The Precision-Score on test set: {:.4f}".format(pr))
print("The Recall-Score on test set: {:.4f}".format(re))
print("The Los_loss on test set: {:.4f}".format(ll))

  _warn_prf(average, modifier, msg_start, len(result))


The Accuracy on test set: 0.9998
The F1-Score on test set: 0.0000
The Precision-Score on test set: 0.0000
The Recall-Score on test set: 0.0000
The Los_loss on test set: 0.0069


In [121]:
target_test

0         1.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
499995    0.0
499996    0.0
499997    0.0
499998    0.0
499999    0.0
Name: Translation, Length: 500000, dtype: float64

In [122]:
np.unique(prediction)

array([0.])

In [123]:
mlp = MLPClassifier().fit(data_train, target_train)
prediction = mlp.predict(data_test)
acc = accuracy_score(target_test,prediction) 
f1= f1_score(target_test,prediction) 
pr= precision_score(target_test,prediction) 
re= recall_score(target_test,prediction) 
ll=log_loss(target_test,prediction)
print("The Accuracy on test set: {:.4f}".format(acc))
print("The F1-Score on test set: {:.4f}".format(f1))
print("The Precision-Score on test set: {:.4f}".format(pr))
print("The Recall-Score on test set: {:.4f}".format(re))
print("The Los_loss on test set: {:.4f}".format(ll))

The Accuracy on test set: 0.9812
The F1-Score on test set: 0.0163
The Precision-Score on test set: 0.0083
The Recall-Score on test set: 0.7800
The Los_loss on test set: 0.6487


In [124]:
sum(prediction)

9446.0

In [125]:
lr = LogisticRegression(max_iter=10000).fit(data_train, target_train)
prediction = lr.predict(data_test)
acc = accuracy_score(target_test,prediction) 
f1= f1_score(target_test,prediction) 
pr= precision_score(target_test,prediction) 
re= recall_score(target_test,prediction) 
ll=log_loss(target_test,prediction)
print("The Accuracy on test set: {:.4f}".format(acc))
print("The F1-Score on test set: {:.4f}".format(f1))
print("The Precision-Score on test set: {:.4f}".format(pr))
print("The Recall-Score on test set: {:.4f}".format(re))
print("The Los_loss on test set: {:.4f}".format(ll))

The Accuracy on test set: 0.6879
The F1-Score on test set: 0.0005
The Precision-Score on test set: 0.0002
The Recall-Score on test set: 0.3900
The Los_loss on test set: 10.7807


In [126]:
from sklearn.metrics import confusion_matrix
confusion_matrix(target_test,prediction)

array([[343898, 156002],
       [    61,     39]], dtype=int64)

In [127]:
prediction

array([1., 0., 0., ..., 0., 0., 1.])

In [83]:
lr = LogisticRegression(max_iter=10000).fit(data_train, target_train)
prediction = lr.predict_proba(data_train)