# Supervised Retrieval

In this notebook we use supervised classification models (Gaussian Naive Bayes and XGBoost) for a cross-lingual information retrieval task.

In [1]:
import sys
import os
sys.path.append(os.path.dirname((os.path.abspath(''))))

import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from src.models.predict_model import MAP_score, threshold_counts, feature_selection, pipeline_model_optimization

## I. Import Data

In this section we load the two feature dataframes used below: the model features (`feature_model.feather`) and the retrieval features (`feature_retrieval.feather`).

In [2]:
# Rename the id columns "id_source"/"id_target" (where present) to the
# "source_id"/"target_id" names that the rest of the notebook refers to.
id_column_names = {"id_source": "source_id", "id_target": "target_id"}

feature_dataframe = pd.read_feather(
    "../data/processed/feature_model.feather"
).rename(columns=id_column_names)
feature_retrieval = pd.read_feather(
    "../data/processed/feature_retrieval.feather"
).rename(columns=id_column_names)

#### Delete all columns with only one value

In [3]:
# Apply `threshold_counts` (threshold=1) to every column of the model frame to
# get a per-column mask (presumably boolean -- confirm against the helper),
# then select the same columns from both frames with that single mask.
column_mask = feature_dataframe.apply(threshold_counts, threshold=1)
feature_dataframe, feature_retrieval = (
    frame.loc[:, column_mask]
    for frame in (feature_dataframe, feature_retrieval)
)


## II. Supervised Retrieval

#### Start with one feature

In [4]:
# Feature the selection starts from.
start_features = ["jaccard_translation_proc_5k"]
# Columns that are never offered as additional features.
not_add = ["Translation", "source_id", "target_id"]

# Every other column of the model frame is a candidate to be added.
all_columns = feature_dataframe.columns
is_candidate = ~all_columns.isin([*start_features, *not_add])
added_features = all_columns[is_candidate]

### Naive Bayes

In [None]:
# Gaussian Naive Bayes run: the parameter grid is empty, so only the default
# estimator configuration is handed to the optimisation pipeline.
nb = GaussianNB()
scaler = preprocessing.StandardScaler()
nb_parameter_grid = dict()

(
    nb_best_features,
    nb_best_parameter_combination,
    nb_best_map_score,
    nb_all_parameter_combination,
) = pipeline_model_optimization(
    nb,
    nb_parameter_grid,
    scaler,
    feature_dataframe,
    feature_retrieval,
    start_features,
    added_features,
    threshold_map_feature_selection=0.001,
)

-----------------First do Forward Selection-----------------

Current Iteration through feature list: 1
The initial MAP score on test set: 0.7193
Updated MAP score on test set with new feature number_VERB_difference: 0.7253
Updated MAP score on test set with new feature number_NOUN_difference_relative: 0.7359
Updated MAP score on test set with new feature number_NOUN_difference: 0.7398
Updated MAP score on test set with new feature number_ADJ_difference_relative: 0.7429
Updated MAP score on test set with new feature characters_avg_difference_relative: 0.7456
Updated MAP score on test set with new feature number_characters_difference_relative: 0.7487
Updated MAP score on test set with new feature number_]_difference_relative: 0.7521
Updated MAP score on test set with new feature number_:_difference_relative: 0.7545
Updated MAP score on test set with new feature number_-_difference_normalized: 0.7638
Updated MAP score on test set with new feature number_%_difference_normalized: 0.7668

C

### XGBoost

In [None]:
from xgboost import XGBClassifier

# XGBoost run: only `learning_rate` has more than one value in the grid; every
# other entry is a single fixed value.
xgb = XGBClassifier(verbosity=0, use_label_encoder=False)
scaler = preprocessing.StandardScaler()

xgb_parameter_grid = {
    "verbosity": [0],
    "use_label_encoder": [False],
    "learning_rate": [0.20, 0.25],
    "max_depth": [15],
    "min_child_weight": [7],
    "gamma": [0.4],
    "colsample_bytree": [0.7],
}

(
    xgb_best_features,
    xgb_best_parameter_combination,
    xgb_best_map_score,
    xgb_all_parameter_combination,
) = pipeline_model_optimization(
    xgb,
    xgb_parameter_grid,
    scaler,
    feature_dataframe,
    feature_retrieval,
    start_features,
    added_features,
    threshold_map_feature_selection=0.001,
)

In [None]:
# Second XGBoost run. It reuses the `xgb` and `scaler` objects from the previous
# XGBoost cell and passes threshold_map_feature_selection=0.01 (the earlier
# runs in this notebook pass 0.001).
parameter_grid = {
    "verbosity": [0],
    "use_label_encoder": [False],
    "learning_rate": [0.20, 0.25],
    "max_depth": [15],
    "min_child_weight": [7],
    "gamma": [0.4],
    "colsample_bytree": [0.7],
}

(
    best_features,
    best_parameter_combination,
    best_map_score,
    all_parameter_combination,
) = pipeline_model_optimization(
    xgb,
    parameter_grid,
    scaler,
    feature_dataframe,
    feature_retrieval,
    start_features,
    added_features,
    threshold_map_feature_selection=0.01,
)