# Supervised Retrieval

In this notebook we use a supervised classification model for a cross-lingual information retrieval task.

In [16]:
import sys
import os
sys.path.append(os.path.dirname((os.path.abspath(''))))

import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from src.models.predict_model import MAP_score, threshold_counts, feature_selection, pipeline_model_optimization

## I. Import Data

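In this section we load the feature dataframes: the EN-DE frames used for training and feature selection, and the EN-DE, EN-PL, EN-IT and document-corpus frames used to evaluate retrieval.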

In [17]:
# Every frame stores its ID columns as `id_source` / `id_target`; they are
# renamed on load so the rest of the notebook can rely on `source_id` / `target_id`.
id_column_names = {"id_source": "source_id", "id_target": "target_id"}

# EN-DE frames: `feature_dataframe` is the training frame, `feature_retrieval`
# is the retrieval frame handed to the model-optimization pipeline.
feature_dataframe = pd.read_feather(
    "../data/processed/feature_model_en_de.feather"
).rename(columns=id_column_names)
feature_retrieval = pd.read_feather(
    "../data/processed/feature_retrieval_en_de.feather"
).rename(columns=id_column_names)

# Load Test Data
feature_retrieval_de = pd.read_feather(
    "../data/processed/feature_retrieval_en_de_testset.feather"
).rename(columns=id_column_names)

feature_retrieval_pl = pd.read_feather(
    "../data/processed/feature_retrieval_en_pl.feather"
).rename(columns=id_column_names)

feature_retrieval_it = pd.read_feather(
    "../data/processed/feature_retrieval_en_it.feather"
).rename(columns=id_column_names)

feature_retrieval_doc = pd.read_feather(
    "../data/processed/feature_retrieval_doc.feather"
).rename(columns=id_column_names)

#### Delete all columns with only one value

In [7]:
# Build a per-column boolean mask from the training frame with the project
# helper `threshold_counts` (presumably True for columns holding more than
# `threshold` distinct values -- confirm in src.models.predict_model).
column_mask = feature_dataframe.apply(threshold_counts, threshold=1)

# Apply the same mask to both EN-DE frames so they keep identical columns.
feature_dataframe, feature_retrieval = (
    frame.loc[:, column_mask] for frame in (feature_dataframe, feature_retrieval)
)


## II. Supervised Retrieval

#### Start with one feature

In [25]:
# Feature selection starts from this single feature.
start_features = ["jaccard_translation_proc_5k"]

# Columns that must never be offered as candidate features: the label, the two
# ID columns, and the "/" count features. We saw that Naive Bayes only gives
# importance to the "/" features, so they are removed.
not_add = [
    "Translation",
    "source_id",
    "target_id",
    "number_/_difference_normalized",
    "number_/_difference",
    "number_/_difference_relative",
]

# Candidate features = every remaining column of the training frame.
excluded_columns = start_features + not_add
is_excluded = feature_dataframe.columns.isin(excluded_columns)
added_features = feature_dataframe.columns[~is_excluded]

# Naive Bayes

In [26]:
# No hyper-parameters are searched for Gaussian Naive Bayes: the grid is empty.
nb_parameter_grid = {}
scaler = preprocessing.StandardScaler()
nb = GaussianNB()

# Run the project's optimization pipeline on the EN-DE frames. Its logged
# output shows a forward feature selection scored by MAP; the exact meaning of
# `threshold_map_feature_selection` is defined in src.models.predict_model.
(
    nb_best_features,
    nb_best_parameter_combination,
    nb_best_map_score,
    nb_all_parameter_combination,
) = pipeline_model_optimization(
    nb,
    nb_parameter_grid,
    scaler,
    feature_dataframe,
    feature_retrieval,
    start_features,
    added_features,
    threshold_map_feature_selection=0.001,
)

-----------------First do Forward Selection-----------------

Current Iteration through feature list: 1
The initial MAP score on test set: 0.7171
Updated MAP score on test set with new feature cosine_similarity_tf_idf_vecmap: 0.7548
Updated MAP score on test set with new feature cosine_similarity_average_vecmap: 0.7666
Updated MAP score on test set with new feature number__difference_normalized: 0.7699
Updated MAP score on test set with new feature number_NOUN_difference_normalized: 0.7719
Updated MAP score on test set with new feature number_characters_difference_relative: 0.7847
Updated MAP score on test set with new feature number_:_difference_normalized: 0.7873
Updated MAP score on test set with new feature number_._difference_normalized: 0.7933
Updated MAP score on test set with new feature number_-_difference_normalized: 0.8009
Updated MAP score on test set with new feature number_)_difference_normalized: 0.8045

Current Iteration through feature list: 2
The initial MAP score on 

In [27]:
# Display the feature subset returned by the optimization pipeline.
nb_best_features

['jaccard_translation_proc_5k',
 'cosine_similarity_tf_idf_vecmap',
 'cosine_similarity_average_vecmap',
 'number__difference_normalized',
 'number_NOUN_difference_normalized',
 'number_characters_difference_relative',
 'number_:_difference_normalized',
 'number_._difference_normalized',
 'number_-_difference_normalized',
 'number_)_difference_normalized',
 'euclidean_distance_average_proc_5k',
 'number_VERB_difference_normalized']

In [None]:
# Hard-coded feature subset for the Naive Bayes model.
# NOTE(review): running this cell overwrites the `nb_best_features` returned by
# the optimization pipeline. This list also contains the three "/" count
# features that `not_add` excludes from selection, so it does not match the
# displayed selection result -- confirm which list the evaluation should use.
nb_best_features = [
    "jaccard_translation_proc_5k",
    "cosine_similarity_tf_idf_vecmap",
    "cosine_similarity_average_vecmap",
    "number_NOUN_difference_normalized",
    "number_characters_difference_relative",
    "number_:_difference_normalized",
    "number_/_difference_normalized",
    "number_/_difference_relative",
    "number_._difference_normalized",
    "number_-_difference_normalized",
    "number_)_difference_normalized",
    "number_characters_difference_normalized",
    "number_/_difference",
    "number_VERB_difference_relative",
    "number_%_difference_normalized",
]

# Evaluate on Test Set

In [28]:
def _scaled_features(frame, features, scaler, fit=False):
    """Return `frame[features]` standardized by `scaler` as a new DataFrame.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing at least the columns listed in `features`.
    features : list of str
        Feature columns to keep, in the order the model expects.
    scaler : sklearn transformer
        Scaler to apply. It is fitted on `frame` when `fit` is True and
        otherwise only used to transform.
    fit : bool, default False
        Fit the scaler on this frame (training data) before transforming.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the index and column labels of `frame[features]`.
    """
    selected = frame.loc[:, features]
    scaled = scaler.fit_transform(selected) if fit else scaler.transform(selected)
    # Build a fresh frame instead of writing back with `.loc[:, cols] = array`:
    # the in-place form depends on pandas coercing the existing column dtypes.
    return pd.DataFrame(scaled, index=selected.index, columns=selected.columns)


# Train on the EN-DE frame. Selecting `nb_best_features` already leaves out the
# label and ID columns, so no separate drop is needed.
target_train = feature_dataframe['Translation'].astype(float)
scaler = preprocessing.StandardScaler()
data_train = _scaled_features(feature_dataframe, nb_best_features, scaler, fit=True)

print("Model was trained on EN-DE Parallel Sentences.\n")
nb = GaussianNB().fit(data_train, target_train)

# Evaluate the same model and the same fitted scaler on every retrieval set.
# NOTE(review): the recorded output shows a MAP of about 0.0003 on the document
# corpus, far below the sentence-level sets -- worth verifying that frame.
retrieval_sets = [
    ("EN-DE", feature_retrieval_de),
    ("EN-IT", feature_retrieval_it),
    ("EN-PL", feature_retrieval_pl),
    ("Document Corpus", feature_retrieval_doc),
]

for label, retrieval_frame in retrieval_sets:
    target_test = retrieval_frame['Translation'].astype(float)
    data_test = _scaled_features(retrieval_frame, nb_best_features, scaler)
    prediction = nb.predict_proba(data_test).tolist()
    map_score = MAP_score(retrieval_frame['source_id'], target_test, prediction)
    print("{} Map Score: {}".format(label, map_score))

Model was trained on EN-DE Parallel Sentences.

EN-DE Map Score: 0.8127527956701142
EN-IT Map Score: 0.794724564323062
EN-PL Map Score: 0.8242123704903378
Document Corpus Map Score: 0.0003451649390702295


# Save Model

In [15]:
import pickle

# Persist the fitted Naive Bayes model.
filename = "../models/supervised_models/finalized_model_naive_bayes.sav"

# A context manager guarantees the file is flushed and closed even if
# pickling fails; a bare `open(...)` would leave the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(nb, model_file)