# Supervised Retrieval

In this notebook we use the supervised classification model for a supervised crosslingual information retrieval task.

In [91]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from src.models.predict_model import MAP_score

## I. Import Data

In this section we import the feature dataframe for the retrieval task.

In [2]:
feature_dataframe=pd.read_json("../data/processed/feature_dataframe.json")
feature_retrieval=pd.read_json("../data/processed/feature_retrieval_reduced.json")

In [3]:
feature_dataframe=feature_dataframe.sample(frac=1)
feature_dataframe

Unnamed: 0,source_id,target_id,number_punctuations_total_difference,number_punctuations_total_difference_relative,number_punctuations_total_difference_normalized,number_words_difference,number_words_difference_relative,number_words_difference_normalized,number_unique_words_difference,number_unique_words_difference_relative,...,pca_embeddding_tf_idf_diff_5,pca_embeddding_average_diff_6,pca_embeddding_tf_idf_diff_6,pca_embeddding_average_diff_7,pca_embeddding_tf_idf_diff_7,pca_embeddding_average_diff_8,pca_embeddding_tf_idf_diff_8,pca_embeddding_average_diff_9,pca_embeddding_tf_idf_diff_9,Translation
12519,12519,12519,0,0.000000,0.041667,2,0.400000,-0.041667,2,0.400000,...,-0.043918,-0.096991,-0.044375,-0.025539,-0.006743,0.066192,0.025650,0.013417,0.014272,1
207924,18792,18765,3,0.000000,-0.214286,2,0.222222,0.214286,2,0.222222,...,-0.023575,-0.017785,-0.007065,-0.045629,-0.011245,0.033313,0.012925,0.018595,0.007838,0
82631,6263,1427,6,1.500000,-0.118272,2,0.064516,0.118272,3,0.103448,...,-0.007633,-0.089750,-0.015188,-0.002623,0.001570,0.018371,0.003459,-0.096774,-0.014227,0
207415,18741,16127,9,0.000000,-0.214286,26,3.714286,0.214286,22,3.142857,...,-0.024444,-0.015724,-0.009262,0.031318,-0.000521,0.052823,0.012674,-0.031026,-0.007229,0
100896,8089,10064,1,1.000000,-0.041176,2,0.125000,0.041176,2,0.125000,...,-0.039010,-0.033639,-0.007920,-0.008292,-0.000357,0.050477,0.013727,-0.096136,-0.021704,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26438,643,17061,-4,-0.666667,0.154762,1,0.055556,-0.154762,2,0.117647,...,-0.008973,-0.056602,-0.013608,-0.002199,-0.001880,0.032957,0.003200,-0.018902,-0.003246,0
75613,5561,4136,2,0.000000,-0.125000,4,0.400000,0.125000,4,0.400000,...,-0.028681,-0.019888,-0.006657,0.026404,0.004963,0.079189,0.022794,-0.028954,-0.007140,0
204878,18487,1685,1,0.333333,0.014706,4,0.444444,-0.014706,4,0.444444,...,-0.032040,0.047045,0.012145,-0.043183,-0.018527,-0.012007,-0.003111,-0.030446,-0.013045,0
31070,1107,14931,4,2.000000,-0.082237,9,0.529412,0.082237,10,0.666667,...,-0.015241,-0.093937,-0.017974,0.053761,0.010436,0.056365,0.012181,-0.081086,-0.014172,0


In [4]:
feature_retrieval

Unnamed: 0,source_id,target_id,number_punctuations_total_difference,number_punctuations_total_difference_relative,number_punctuations_total_difference_normalized,number_words_difference,number_words_difference_relative,number_!_difference,number_!_difference_relative,number_!_difference_normalized,...,pca_embeddding_average_diff_2,pca_embeddding_tf_idf_diff_2,pca_embeddding_average_diff_3,pca_embeddding_average_diff_4,pca_embeddding_average_diff_5,pca_embeddding_average_diff_6,pca_embeddding_average_diff_7,pca_embeddding_average_diff_8,pca_embeddding_average_diff_9,Translation
0,20000,20000,1,0.333333,-0.053913,-3,-0.136364,0,0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,20000,20001,1,0.333333,-0.130000,-10,-0.454545,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,20000,20002,-3,-1.000000,0.120000,-19,-0.863636,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,20000,20003,-3,-1.000000,0.120000,-18,-0.818182,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,20000,20004,-2,-0.666667,-0.005000,-15,-0.681818,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,20099,24995,4,0.000000,-0.137931,19,3.166667,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
499996,20099,24996,3,0.000000,-0.176471,8,1.333333,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
499997,20099,24997,2,0.000000,-0.125000,8,1.333333,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
499998,20099,24998,3,0.000000,-0.093750,23,3.833333,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0


## II. Supervised Retrieval

### First iteration after dropping correlated features from analyes

In [5]:
import pickle 
file = open("../data/processed/correlated_features.pkl",'rb')
correlated_features = pickle.load(file)
file.close()
correlated_features

{'characters_avg_difference_relative',
 'cosine_similarity_tf_idf',
 'euclidean_distance_tf_idf',
 'jaccard_translation_target',
 'number_#_difference_normalized',
 'number_#_difference_relative',
 'number_$_difference_normalized',
 'number_$_difference_relative',
 'number_%_difference_normalized',
 'number_&_difference_normalized',
 "number_'_difference_normalized",
 "number_'_difference_relative",
 'number_)_difference',
 'number_)_difference_normalized',
 'number_)_difference_relative',
 'number_+_difference_normalized',
 'number_+_difference_relative',
 'number_,_difference',
 'number_,_difference_relative',
 'number_-_difference_normalized',
 'number_._difference_relative',
 'number_/_difference_normalized',
 'number_NOUN_difference',
 'number_[_difference_normalized',
 'number_[_difference_relative',
 'number_]_difference',
 'number_]_difference_normalized',
 'number_]_difference_relative',
 'number_characters_difference',
 'number_characters_difference_relative',
 'number_stopwo

In [6]:
feature_dataframe=feature_dataframe.drop(columns=correlated_features)
feature_dataframe

Unnamed: 0,source_id,target_id,number_punctuations_total_difference,number_punctuations_total_difference_relative,number_punctuations_total_difference_normalized,number_words_difference,number_words_difference_relative,number_!_difference,number_!_difference_relative,number_!_difference_normalized,...,pca_embeddding_average_diff_2,pca_embeddding_tf_idf_diff_2,pca_embeddding_average_diff_3,pca_embeddding_average_diff_4,pca_embeddding_average_diff_5,pca_embeddding_average_diff_6,pca_embeddding_average_diff_7,pca_embeddding_average_diff_8,pca_embeddding_average_diff_9,Translation
12519,12519,12519,0,0.000000,0.041667,2,0.400000,0,0,0.000000,...,0.017799,0.003041,0.080346,-0.126860,-0.064814,-0.096991,-0.025539,0.066192,0.013417,1
207924,18792,18765,3,0.000000,-0.214286,2,0.222222,0,0,0.000000,...,-0.098649,-0.025088,0.096148,-0.007392,-0.073655,-0.017785,-0.045629,0.033313,0.018595,0
82631,6263,1427,6,1.500000,-0.118272,2,0.064516,0,0,0.000000,...,-0.048123,-0.008813,0.096650,-0.091540,-0.069025,-0.089750,-0.002623,0.018371,-0.096774,0
207415,18741,16127,9,0.000000,-0.214286,26,3.714286,0,0,0.000000,...,0.050668,0.022798,0.069393,-0.036044,-0.057030,-0.015724,0.031318,0.052823,-0.031026,0
100896,8089,10064,1,1.000000,-0.041176,2,0.125000,0,0,0.000000,...,0.029271,0.007504,0.035265,-0.048938,-0.166031,-0.033639,-0.008292,0.050477,-0.096136,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26438,643,17061,-4,-0.666667,0.154762,1,0.055556,0,0,0.000000,...,-0.043041,-0.013412,0.045469,-0.028166,-0.045891,-0.056602,-0.002199,0.032957,-0.018902,0
75613,5561,4136,2,0.000000,-0.125000,4,0.400000,0,0,0.000000,...,-0.031168,-0.007176,0.043947,-0.020836,-0.092995,-0.019888,0.026404,0.079189,-0.028954,0
204878,18487,1685,1,0.333333,0.014706,4,0.444444,-1,-1,0.083333,...,-0.069658,-0.014027,0.012100,0.070576,-0.093658,0.047045,-0.043183,-0.012007,-0.030446,0
31070,1107,14931,4,2.000000,-0.082237,9,0.529412,0,0,0.000000,...,0.020291,0.003643,-0.004706,-0.055687,-0.072022,-0.093937,0.053761,0.056365,-0.081086,0


In [7]:
feature_dataframe=feature_dataframe.drop(columns=['word_mover_distance'])

In [8]:
feature_dataframe.columns.difference(feature_retrieval.columns)

Index([], dtype='object')

#### drop the target label and the indexes for training and testing

In [9]:
target_train=feature_dataframe['Translation'].astype(float)
data_train=feature_dataframe.drop(columns=['Translation','source_id','target_id'])
target_test=feature_retrieval['Translation'].astype(float)
data_test=feature_retrieval.drop(columns=['Translation','source_id','target_id'])

In [10]:
#scale data into [0,1]
scaler = preprocessing.MinMaxScaler()
data_train[data_train.columns] = scaler.fit_transform(data_train[data_train.columns])
data_test[data_test.columns] = scaler.fit_transform(data_test[data_test.columns])

# Naive Bayes

In [93]:
nb = GaussianNB().fit(data_train, target_train)
prediction = nb.predict_proba(data_test)
print("The MAP score on test set: {:.4f}".format(MAP_score(feature_retrieval['source_id'],target_test,prediction)))
# acc = accuracy_score(target_test,prediction) 
# f1= f1_score(target_test,prediction) 
# pr= precision_score(target_test,prediction) 
# re= recall_score(target_test,prediction)
# ll=log_loss(target_test,prediction)
# print("The Accuracy on test set: {:.4f}".format(acc))
# print("The F1-Score on test set: {:.4f}".format(f1))
# print("The Precision-Score on test set: {:.4f}".format(pr))
# print("The Recall-Score on test set: {:.4f}".format(re))
# print("The Los_loss on test set: {:.4f}".format(ll))

The MAP score on test set: 1.0000


In [95]:
prediction

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

## prediction ist schwachsinn, da nur 0 mit confidence 1 predicted wird

# MLP Classifier

In [96]:
mlp = MLPClassifier().fit(data_train, target_train)
prediction = mlp.predict_proba(data_test)
print("The MAP score on test set: {:.4f}".format(MAP_score(feature_retrieval['source_id'],target_test,prediction)))
# acc = accuracy_score(target_test,prediction) 
# f1= f1_score(target_test,prediction) 
# pr= precision_score(target_test,prediction) 
# re= recall_score(target_test,prediction) 
# ll=log_loss(target_test,prediction)
# print("The Accuracy on test set: {:.4f}".format(acc))
# print("The F1-Score on test set: {:.4f}".format(f1))
# print("The Precision-Score on test set: {:.4f}".format(pr))
# print("The Recall-Score on test set: {:.4f}".format(re))
# print("The Los_loss on test set: {:.4f}".format(ll))

The MAP score on test set: 0.3447


# Logistic Regression

In [97]:
lr = LogisticRegression(max_iter=100000).fit(data_train, target_train)
prediction = lr.predict_proba(data_test)
print("The MAP score on test set: {:.4f}".format(MAP_score(feature_retrieval['source_id'],target_test,prediction)))
# acc = accuracy_score(target_test,prediction) 
# f1= f1_score(target_test,prediction) 
# pr= precision_score(target_test,prediction) 
# re= recall_score(target_test,prediction) 
# ll=log_loss(target_test,prediction)
# print("The Accuracy on test set: {:.4f}".format(acc))
# print("The F1-Score on test set: {:.4f}".format(f1))
# print("The Precision-Score on test set: {:.4f}".format(pr))
# print("The Recall-Score on test set: {:.4f}".format(re))
# print("The Los_loss on test set: {:.4f}".format(ll))

The MAP score on test set: 0.3513


In [16]:
from sklearn.metrics import confusion_matrix
confusion_matrix(target_test,prediction)

array([[435957,  63943],
       [     7,     93]], dtype=int64)