# Supervised Retrieval

In this notebook we train a supervised classification model and use it for a cross-lingual information retrieval task.

In [1]:
import sys
import os
sys.path.append(os.path.dirname((os.path.abspath(''))))

import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from src.models.predict_model import MAP_score, threshold_counts, feature_selection, pipeline_model_optimization

## I. Import Data

In this section we import the feature dataframe used to fit the model and the feature dataframes for the retrieval tasks (EN-DE, EN-PL, EN-IT and the document corpus).

In [2]:
def _read_feature_frame(feather_path):
    """Read a feature table from a feather file and rename its id columns.

    The columns ``id_source`` / ``id_target`` are renamed to
    ``source_id`` / ``target_id``; all other columns are left untouched.
    """
    id_renames = {"id_source": "source_id", "id_target": "target_id"}
    return pd.read_feather(feather_path).rename(columns=id_renames)


# Feature table from the "feature_model" file (EN-DE).
feature_dataframe = _read_feature_frame("../data/processed/feature_model_en_de.feather")

# Feature tables from the "feature_retrieval" files: EN-DE, EN-PL, EN-IT and the document corpus.
feature_retrieval = _read_feature_frame("../data/processed/feature_retrieval_en_de.feather")
feature_retrieval_pl = _read_feature_frame("../data/processed/feature_retrieval_en_pl.feather")
feature_retrieval_it = _read_feature_frame("../data/processed/feature_retrieval_en_it.feather")
feature_retrieval_doc = _read_feature_frame("../data/processed/feature_retrieval_doc.feather")

#### Delete all columns with only one value

In [3]:
# Per-column mask obtained by applying `threshold_counts` (threshold=1) to every
# column of the model feature table.
column_mask = feature_dataframe.apply(threshold_counts, threshold=1)

# Keep the same set of columns in the model table and in the EN-DE retrieval table.
feature_dataframe, feature_retrieval = (
    frame.loc[:, column_mask] for frame in (feature_dataframe, feature_retrieval)
)


## II. Supervised Retrieval

#### Start with one feature

In [11]:
# Feature the selection starts from.
start_features = ["jaccard_translation_proc_5k"]
# Label and identifier columns that must never be offered as candidate features.
not_add = ["Translation", "source_id", "target_id"]

# Every remaining column is a candidate that may be added later.
excluded_columns = start_features + not_add
is_candidate = ~feature_dataframe.columns.isin(excluded_columns)
added_features = feature_dataframe.columns[is_candidate]

# Naive Bayes

In [12]:
# No hyper-parameters are searched for Gaussian Naive Bayes: the grid is empty.
nb_parameter_grid = {}
nb = GaussianNB()
scaler = preprocessing.StandardScaler()

# Run the optimisation pipeline on the model table and the EN-DE retrieval table,
# starting from `start_features` and offering `added_features` as candidates.
(
    nb_best_features,
    nb_best_parameter_combination,
    nb_best_map_score,
    nb_all_parameter_combination,
) = pipeline_model_optimization(
    nb,
    nb_parameter_grid,
    scaler,
    feature_dataframe,
    feature_retrieval,
    start_features,
    added_features,
    threshold_map_feature_selection=0.001,
)

-----------------First do Forward Selection-----------------

Current Iteration through feature list: 1
The initial MAP score on test set: 0.7535
Updated MAP score on test set with new feature jaccard_translation_vecmap: 0.7810
Updated MAP score on test set with new feature cosine_similarity_tf_idf_vecmap: 0.7916
Updated MAP score on test set with new feature cosine_similarity_average_vecmap: 0.7992
Updated MAP score on test set with new feature number_VERB_difference_relative: 0.8035
Updated MAP score on test set with new feature number_ADJ_difference_normalized: 0.8046
Updated MAP score on test set with new feature number_characters_difference_relative: 0.8248
Updated MAP score on test set with new feature number_-_difference_normalized: 0.8274
Updated MAP score on test set with new feature number_-_difference: 0.8324

Current Iteration through feature list: 2
The initial MAP score on test set: 0.8324
Updated MAP score on test set with new feature cosine_similarity_tf_idf_proc_b_1k: 

In [10]:
# Hard-coded feature subset for the Naive Bayes model. This assignment overrides
# whatever `nb_best_features` held before this cell was run.
nb_best_features = [
    # start feature
    "jaccard_translation_proc_5k",
    # features reported as added in iteration 1 of the recorded selection log
    "jaccard_translation_vecmap",
    "cosine_similarity_tf_idf_vecmap",
    "cosine_similarity_average_vecmap",
    "number_VERB_difference_relative",
    "number_ADJ_difference_normalized",
    "number_characters_difference_relative",
    "number_-_difference_normalized",
    "number_-_difference",
    # named in iteration 2 of the recorded log, but its score is cut off there
    "cosine_similarity_tf_idf_proc_b_1k",
    # NOTE(review): the following entries do not appear in the recorded log -
    # presumably taken from a complete run; confirm.
    "number_NOUN_difference_relative",
    "number_?_difference_normalized",
    "number_,_difference_normalized",
]

In [11]:
# Label and identifier columns; they are removed before selecting model inputs so
# that a label/id accidentally listed in `nb_best_features` fails loudly (KeyError).
non_feature_columns = ['Translation', 'source_id', 'target_id']

# --- Fit the scaler and the classifier on the EN-DE model table ----------------
target_train = feature_dataframe['Translation'].astype(float)
features_train = feature_dataframe.drop(columns=non_feature_columns).loc[:, nb_best_features]
scaler = preprocessing.StandardScaler()
# Build a new frame from the scaled values instead of assigning them back with
# `.loc[:, cols] = ...`: that assignment's in-place/dtype behaviour differs
# between pandas versions, while a fresh frame always holds the scaler output.
data_train = pd.DataFrame(
    scaler.fit_transform(features_train),
    index=features_train.index,
    columns=features_train.columns,
)

print("Model was trained on EN-DE Parallel Sentences.\n")
nb = GaussianNB().fit(data_train, target_train)

# --- Score every retrieval table with the same fitted scaler and model ---------
# (label used in the printed line, feature table)
retrieval_sets = [
    ("EN-DE", feature_retrieval),
    ("EN-IT", feature_retrieval_it),
    ("EN-PL", feature_retrieval_pl),
    # NOTE(review): the recorded output shows a near-zero MAP score for this
    # table; verify that its features are comparable to the EN-DE training data.
    ("Document Corpus", feature_retrieval_doc),
]

for corpus_label, retrieval_frame in retrieval_sets:
    target_test = retrieval_frame['Translation'].astype(float)
    features_test = retrieval_frame.drop(columns=non_feature_columns).loc[:, nb_best_features]
    # Only `transform` here: the scaling statistics come from the training data.
    data_test = pd.DataFrame(
        scaler.transform(features_test),
        index=features_test.index,
        columns=features_test.columns,
    )
    prediction = nb.predict_proba(data_test).tolist()
    corpus_map_score = MAP_score(retrieval_frame['source_id'], target_test, prediction)
    print("{} Map Score: {}".format(corpus_label, corpus_map_score))

Model was trained on EN-DE Parallel Sentences.

EN-DE Map Score: 0.8422674550834229
EN-IT Map Score: 0.8070125148698435
EN-PL Map Score: 0.8608254573694002
Document Corpus Map Score: 0.00032160628203609245
