# Supervised Retrieval

In this notebook we use supervised classification models (an MLP classifier and logistic regression) for a supervised cross-lingual information retrieval task.

In [8]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from src.models.predict_model import MAP_score, threshold_counts,feature_selection

## I. Import Data

In this section we import the two feature dataframes: one for training the model and one for the retrieval task.

In [2]:
# Load the feature frame used for training and the one used for retrieval.
# Both are given the "source_id"/"target_id" column names that the later
# cells refer to.
id_column_names = {"id_source": "source_id", "id_target": "target_id"}
feature_dataframe = (
    pd.read_feather("../data/processed/feature_model.feather")
    .rename(columns=id_column_names)
)
feature_retrieval = (
    pd.read_feather("../data/processed/feature_retrieval.feather")
    .rename(columns=id_column_names)
)

#### Delete all columns with only one value

In [3]:
# Compute one keep/drop flag per column from the training frame only, then
# apply that same mask to both frames.
# NOTE(review): threshold_counts is imported from src.models.predict_model and
# is not visible here; presumably it returns True for columns with more than
# `threshold` distinct values -- confirm.
column_mask = feature_dataframe.apply(threshold_counts, threshold=1)
feature_dataframe = feature_dataframe.loc[:, column_mask]
# Assumes feature_retrieval carries the same column labels as the training
# frame, since the mask is indexed by the training columns -- TODO confirm.
feature_retrieval = feature_retrieval.loc[:, column_mask]

## II. Supervised Retrieval

#### Separate the target label and drop the ID columns for training and testing

In [4]:
# Columns that are not model inputs: the label and the two ID columns.
non_feature_columns = ['Translation', 'source_id', 'target_id']

# Training split: feature matrix plus the label cast to float.
data_train = feature_dataframe.drop(columns=non_feature_columns)
target_train = feature_dataframe['Translation'].astype(float)

# Retrieval (test) split, prepared the same way.
data_test = feature_retrieval.drop(columns=non_feature_columns)
target_test = feature_retrieval['Translation'].astype(float)

#### Z-Normalization

In [5]:
# Z-normalise every feature column (zero mean, unit variance); this is
# standardisation, not min-max scaling into [0, 1]. The scaler is fitted on
# the training features only and then reused unchanged for the test features.
scaler = preprocessing.StandardScaler()

# Build fresh float DataFrames instead of assigning through
# `.loc[:, columns] = ...`. The in-place form tries to keep each column's
# existing dtype, so writing scaled floats into any integer-typed column
# (dtypes are not visible here) triggers pandas' incompatible-dtype
# deprecation warning. Column labels and row index are carried over unchanged.
data_train = pd.DataFrame(
    scaler.fit_transform(data_train),
    columns=data_train.columns,
    index=data_train.index,
)
data_test = pd.DataFrame(
    scaler.transform(data_test),
    columns=data_test.columns,
    index=data_test.index,
)

#### Candidate features for forward selection based on the MAP score, starting from the most important feature

In [53]:
# Forward selection starts from this single feature; every name in `features`
# is a candidate to be added on top of it.
keep_columns = ['jaccard_translation_proc_5k']

# Similarity candidates. The '_proc_5k' group has no jaccard_translation entry
# here because that feature is the starting feature above.
similarity_measures = [
    'cosine_similarity_average',
    'cosine_similarity_tf_idf',
    'euclidean_distance_average',
    'euclidean_distance_tf_idf',
]
similarity_features = (
    ['jaccard_numbers_source']
    + [measure + '_proc_5k' for measure in similarity_measures]
    + [measure + variant
       for variant in ('_proc_b_1k', '_vecmap')
       for measure in similarity_measures + ['jaccard_translation']]
)

# Difference candidates: every base name appears three times, with the
# suffixes '', '_relative' and '_normalized', in that order.
difference_bases = (
    ['number_punctuations_total', 'number_words', 'number_unique_words']
    + ['number_' + mark for mark in "!#$%&'()+,-./:;?[]"]
    + ['number_characters', 'characters_avg']
    + ['number_ADJ', 'number_NOUN', 'number_VERB']
    + ['score_polarity', 'score_subjectivity']
)
difference_features = [
    base + '_difference' + suffix
    for base in difference_bases
    for suffix in ('', '_relative', '_normalized')
]

features = similarity_features + difference_features

# MLP Classifier

In [42]:
# Disabled baseline, kept for reference: an MLP fitted on data_train and
# scored with MAP on data_test.
# mlp = MLPClassifier(hidden_layer_sizes=5, verbose=True, early_stopping=True).fit(data_train, target_train)
# prediction = mlp.predict_proba(data_test)
# print("The MAP score on test set: {:.4f}".format(MAP_score(feature_retrieval['source_id'],target_test,prediction)))

# Run feature selection with an MLP (fixed random_state), starting from
# keep_columns and trying the names in `features`.
# NOTE(review): feature_selection is imported from src.models.predict_model and
# is not visible here. Judging by the recorded output it reports a MAP score on
# the test set per candidate and appears to extend keep_columns in place --
# confirm against its source.
# NOTE(review): the recorded output of this cell begins with
# number_punctuations_total_difference rather than with the first entry of the
# `features` list defined in this notebook, so it seems to come from a
# different keep_columns/features state. Re-run from a fresh kernel.
scaler = preprocessing.StandardScaler()
model = MLPClassifier(
    hidden_layer_sizes=5,
    random_state=42,
    early_stopping=True,
)
feature_selection(
    model,
    scaler,
    feature_dataframe,
    feature_retrieval,
    keep_columns,
    features,
)

The initial MAP score on test set: 0.8244
With number_punctuations_total_difference added, the MAP score on test set: 0.8244
With number_punctuations_total_difference_relative added, the MAP score on test set: 0.8244
With number_punctuations_total_difference_normalized added, the MAP score on test set: 0.8244
With number_words_difference added, the MAP score on test set: 0.8244
With number_words_difference_relative added, the MAP score on test set: 0.7804
With number_words_difference_normalized added, the MAP score on test set: 0.7965
With number_unique_words_difference added, the MAP score on test set: 0.8177
With number_unique_words_difference_relative added, the MAP score on test set: 0.7962
With number_unique_words_difference_normalized added, the MAP score on test set: 0.7929
With number_!_difference added, the MAP score on test set: 0.7920
With number_!_difference_relative added, the MAP score on test set: 0.7919
With number_!_difference_normalized added, the MAP score on test se

In [40]:
# Final evaluation on the currently selected feature set.
# NOTE(review): this cell uses scaler, model, target_train, target_test and
# keep_columns as left in the kernel by earlier cells.
print(keep_columns)

# Restrict both splits to the selected columns.
data_train = feature_dataframe.filter(items=keep_columns)
data_test = feature_retrieval.filter(items=keep_columns)

# Standardise: fit the scaler on the training split, reuse it for the test split.
data_train[data_train.columns] = scaler.fit_transform(data_train)
data_test[data_test.columns] = scaler.transform(data_test)

# Refit the model on the selected features and report the final MAP score.
modelfit = model.fit(data_train, target_train)
prediction = modelfit.predict_proba(data_test)
MapScore = MAP_score(
    feature_retrieval['source_id'],
    target_test,
    prediction,
)
print("The final MAP score on test set: {:.4f}".format(MapScore))

['jaccard_numbers_source', 'cosine_similarity_average_proc_5k', 'cosine_similarity_tf_idf_proc_5k', 'euclidean_distance_average_proc_5k', 'euclidean_distance_tf_idf_proc_5k', 'jaccard_translation_proc_5k', 'cosine_similarity_average_proc_b_1k', 'cosine_similarity_tf_idf_proc_b_1k', 'euclidean_distance_average_proc_b_1k', 'euclidean_distance_tf_idf_proc_b_1k', 'jaccard_translation_proc_b_1k', 'cosine_similarity_average_vecmap', 'cosine_similarity_tf_idf_vecmap', 'euclidean_distance_average_vecmap', 'euclidean_distance_tf_idf_vecmap', 'jaccard_translation_vecmap', 'number_punctuations_total_difference', 'number_punctuations_total_difference_normalized', 'number_punctuations_total_difference_relative', 'number_words_difference', 'number_+_difference', 'number_-_difference']
The final MAP score on test set: 0.8244


In [41]:
# Number of features in the selected set. Count the live `keep_columns` list
# rather than a pasted copy of its printed contents, so the figure cannot
# drift from the feature set that was actually evaluated.
len(keep_columns)

22

# Logistic Regression

In [45]:
# Reset the starting feature and the candidate list before the
# logistic-regression run.
# NOTE(review): the reset is presumably needed because the selection helper
# appears to extend keep_columns in place -- confirm.
keep_columns = ['jaccard_translation_proc_5k']

vector_measures = (
    'cosine_similarity_average',
    'cosine_similarity_tf_idf',
    'euclidean_distance_average',
    'euclidean_distance_tf_idf',
)
punctuation_marks = "!#$%&'()+,-./:;?[]"
difference_stems = (
    ['number_punctuations_total', 'number_words', 'number_unique_words']
    + ['number_{}'.format(mark) for mark in punctuation_marks]
    + ['number_characters', 'characters_avg',
       'number_ADJ', 'number_NOUN', 'number_VERB',
       'score_polarity', 'score_subjectivity']
)

features = ['jaccard_numbers_source']
# '_proc_5k' measures; its jaccard_translation feature is the starting
# feature above, so it is not a candidate.
features.extend('{}_proc_5k'.format(measure) for measure in vector_measures)
# The other two suffix groups, each followed by its jaccard_translation feature.
features.extend(
    '{}_{}'.format(measure, group)
    for group in ('proc_b_1k', 'vecmap')
    for measure in vector_measures + ('jaccard_translation',)
)
# Each difference stem with the suffixes '', '_relative', '_normalized'.
features.extend(
    '{}_difference{}'.format(stem, suffix)
    for stem in difference_stems
    for suffix in ('', '_relative', '_normalized')
)

In [48]:
# Disabled baseline, kept for reference: a logistic regression fitted on
# data_train and scored with MAP on data_test.
# lr = LogisticRegression(max_iter=100000, verbose=10, penalty="l2", C=0.0001).fit(data_train.to_numpy(), target_train.to_numpy())
# prediction = lr.predict_proba(data_test.to_numpy())
# print("The MAP score on test set: {:.4f}".format(MAP_score(feature_retrieval['source_id'],target_test,prediction)))

# Run feature selection with an L2-penalised logistic regression, starting
# from keep_columns and trying the names in `features`.
# NOTE(review): feature_selection is imported from src.models.predict_model and
# is not visible here. Judging by the recorded output it reports a MAP score on
# the test set per candidate and appears to extend keep_columns in place --
# confirm against its source.
scaler = preprocessing.StandardScaler()
model = LogisticRegression(
    max_iter=100000,
    penalty="l2",
    C=0.0001,
)
feature_selection(
    model,
    scaler,
    feature_dataframe,
    feature_retrieval,
    keep_columns,
    features,
)

The initial MAP score on test set: 0.8369
With jaccard_numbers_source added, the MAP score on test set: 0.8369
With cosine_similarity_average_proc_5k added, the MAP score on test set: 0.8369
With cosine_similarity_tf_idf_proc_5k added, the MAP score on test set: 0.8369
With euclidean_distance_average_proc_5k added, the MAP score on test set: 0.8369
With euclidean_distance_tf_idf_proc_5k added, the MAP score on test set: 0.8369
Updated MAP score on test set with new feature euclidean_distance_tf_idf_proc_5k: 0.8369
With cosine_similarity_average_proc_b_1k added, the MAP score on test set: 0.8369
With cosine_similarity_tf_idf_proc_b_1k added, the MAP score on test set: 0.8369
With euclidean_distance_average_proc_b_1k added, the MAP score on test set: 0.8370
Updated MAP score on test set with new feature euclidean_distance_average_proc_b_1k: 0.8370
With euclidean_distance_tf_idf_proc_b_1k added, the MAP score on test set: 0.8370
With jaccard_translation_proc_b_1k added, the MAP score on t

With score_polarity_difference_relative added, the MAP score on test set: 0.8449
With score_polarity_difference_normalized added, the MAP score on test set: 0.8063
With score_subjectivity_difference added, the MAP score on test set: 0.8449
With score_subjectivity_difference_relative added, the MAP score on test set: 0.8449
Updated MAP score on test set with new feature score_subjectivity_difference_relative: 0.8449
With score_subjectivity_difference_normalized added, the MAP score on test set: 0.8449


In [49]:
# Final evaluation on the currently selected feature set.
# NOTE(review): this cell uses scaler, model, target_train, target_test and
# keep_columns as left in the kernel by earlier cells.
print(keep_columns)
print(len(keep_columns))

# Restrict both splits to the selected columns.
data_train = feature_dataframe.filter(items=keep_columns)
data_test = feature_retrieval.filter(items=keep_columns)

# Standardise: fit the scaler on the training split, reuse it for the test split.
data_train[data_train.columns] = scaler.fit_transform(data_train)
data_test[data_test.columns] = scaler.transform(data_test)

# Refit the model on the selected features and report the final MAP score.
modelfit = model.fit(data_train, target_train)
prediction = modelfit.predict_proba(data_test)
MapScore = MAP_score(
    feature_retrieval['source_id'],
    target_test,
    prediction,
)
print("The final MAP score on test set: {:.4f}".format(MapScore))

['jaccard_translation_proc_5k', 'jaccard_numbers_source', 'cosine_similarity_average_proc_5k', 'cosine_similarity_tf_idf_proc_5k', 'cosine_similarity_average_proc_b_1k', 'cosine_similarity_tf_idf_proc_b_1k', 'euclidean_distance_tf_idf_proc_b_1k', 'euclidean_distance_average_vecmap', 'euclidean_distance_tf_idf_vecmap', 'number_words_difference', 'number_!_difference_normalized', 'number_&_difference', 'number_-_difference', 'number_-_difference_relative', 'number_-_difference_normalized', 'number_._difference_relative', 'number_:_difference', 'number_:_difference_relative', 'number_:_difference_normalized', 'number_;_difference', 'number_;_difference_relative', 'number_?_difference', 'number_?_difference_relative', 'number_?_difference_normalized', 'characters_avg_difference_relative', 'score_subjectivity_difference', 'score_subjectivity_difference_normalized', 'euclidean_distance_tf_idf_proc_5k', 'euclidean_distance_average_proc_b_1k', 'cosine_similarity_average_vecmap', 'cosine_simila

In [51]:
# Feature set recorded for the logistic-regression model.
# NOTE(review): this looks like a pasted copy of the keep_columns list printed
# by the logistic-regression evaluation cell -- confirm it is still current.
final_set_LR = [
    'jaccard_translation_proc_5k',
    'jaccard_numbers_source',
    'cosine_similarity_average_proc_5k',
    'cosine_similarity_tf_idf_proc_5k',
    'cosine_similarity_average_proc_b_1k',
    'cosine_similarity_tf_idf_proc_b_1k',
    'euclidean_distance_tf_idf_proc_b_1k',
    'euclidean_distance_average_vecmap',
    'euclidean_distance_tf_idf_vecmap',
    'number_words_difference',
    'number_!_difference_normalized',
    'number_&_difference',
    'number_-_difference',
    'number_-_difference_relative',
    'number_-_difference_normalized',
    'number_._difference_relative',
    'number_:_difference',
    'number_:_difference_relative',
    'number_:_difference_normalized',
    'number_;_difference',
    'number_;_difference_relative',
    'number_?_difference',
    'number_?_difference_relative',
    'number_?_difference_normalized',
    'characters_avg_difference_relative',
    'score_subjectivity_difference',
    'score_subjectivity_difference_normalized',
    'euclidean_distance_tf_idf_proc_5k',
    'euclidean_distance_average_proc_b_1k',
    'cosine_similarity_average_vecmap',
    'cosine_similarity_tf_idf_vecmap',
    'jaccard_translation_vecmap',
    'number_%_difference',
    'number_%_difference_relative',
    "number_'_difference",
    "number_'_difference_relative",
    'number_+_difference',
    'number_[_difference',
    'number_[_difference_relative',
    'number_ADJ_difference_normalized',
    'score_subjectivity_difference_relative',
]

In [61]:
# NOTE(review): scratch expression; its result is not assigned or used anywhere
# in the visible notebook -- candidate for removal.
np.arange(10, 15)

array([10, 11, 12, 13, 14])