# Supervised Retrieval

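In this notebook we use supervised classification models (Naive Bayes, an MLP, logistic regression and XGBoost) for a supervised cross-lingual information retrieval task, and report the MAP score of each model on the retrieval set.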

In [1]:
import sys
import os
sys.path.append(os.path.dirname((os.path.abspath(''))))

import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from src.models.predict_model import MAP_score, threshold_counts

## I. Import Data

In this section we import two feature dataframes: the one used to train the models and the one used to evaluate the retrieval task.

In [2]:
# Load the training and the retrieval feature tables, then give the id columns
# the names used by the rest of the notebook (`source_id` / `target_id`).
id_column_names = {"id_source": "source_id", "id_target": "target_id"}

feature_dataframe = pd.read_feather("../data/processed/feature_model.feather")
feature_dataframe = feature_dataframe.rename(columns=id_column_names)

feature_retrieval = pd.read_feather("../data/processed/feature_retrieval.feather")
feature_retrieval = feature_retrieval.rename(columns=id_column_names)

#### Delete all columns with only one value

In [3]:
# `threshold_counts` is applied to every column of the training frame; the
# resulting per-column mask is then used to select columns in BOTH frames, so
# train and retrieval data keep exactly the same column set.
column_mask = feature_dataframe.apply(threshold_counts, threshold=1)

# (all rows, masked columns) -- the same indexer `.loc[:, column_mask]` builds.
kept_columns = (slice(None), column_mask)
feature_dataframe = feature_dataframe.loc[kept_columns]
feature_retrieval = feature_retrieval.loc[kept_columns]

## II. Supervised Retrieval

#### Drop the target label and the indexes for training and testing

In [4]:
# Columns that are not model inputs: the label plus the two id columns.
non_feature_columns = ['Translation', 'source_id', 'target_id']

# Training split: features and float-typed label.
data_train = feature_dataframe.drop(columns=non_feature_columns)
target_train = feature_dataframe['Translation'].astype(float)

# Retrieval (test) split, built the same way.
data_test = feature_retrieval.drop(columns=non_feature_columns)
target_test = feature_retrieval['Translation'].astype(float)

#### Z-Normalization

In [5]:
# Z-normalise every feature column with StandardScaler (zero mean, unit
# variance). Note that the result is NOT confined to [0, 1].
# The scaler is fitted on the training features only and then reused for the
# test features, so the test data does not influence the scaling statistics.
scaler = preprocessing.StandardScaler()

# Rebuild the frames instead of assigning through `.loc[:, columns] = ...`:
# the scaled values are floats, and writing them in place into non-float
# columns relies on a silent dtype upcast that pandas has deprecated.
# Index and column labels are carried over unchanged.
data_train = pd.DataFrame(
    scaler.fit_transform(data_train),
    index=data_train.index,
    columns=data_train.columns,
)
data_test = pd.DataFrame(
    scaler.transform(data_test),
    index=data_test.index,
    columns=data_test.columns,
)

# Naive Bayes

In [6]:
# Gaussian Naive Bayes baseline: fit on the training features, predict class
# probabilities for the test features, then pass the source ids, the true
# labels and the probabilities to the project's MAP_score helper.
nb = GaussianNB()
nb.fit(data_train, target_train)

prediction = nb.predict_proba(data_test)
nb_map = MAP_score(feature_retrieval['source_id'], target_test, prediction)
print("The MAP score on test set: {:.4f}".format(nb_map))

The MAP score on test set: 0.1398


# MLP Classifier

In [7]:
# Small multi-layer perceptron (one hidden layer of 5 units) with early stopping.
# MLP training is stochastic (weight initialisation, batch shuffling and the
# validation split used for early stopping), so `random_state` is fixed to make
# the reported MAP score reproducible across "Restart & Run All".
mlp = MLPClassifier(
    hidden_layer_sizes=5,
    verbose=True,
    early_stopping=True,
    random_state=42,
).fit(data_train, target_train)

# Class probabilities for the test features, scored with the project's MAP_score helper.
prediction = mlp.predict_proba(data_test)
print("The MAP score on test set: {:.4f}".format(MAP_score(feature_retrieval['source_id'],target_test,prediction)))

Iteration 1, loss = 0.13300406
Validation score: 0.982318
Iteration 2, loss = 0.04756596
Validation score: 0.984955
Iteration 3, loss = 0.04022458
Validation score: 0.985909
Iteration 4, loss = 0.03652014
Validation score: 0.986727
Iteration 5, loss = 0.03466978
Validation score: 0.986636
Iteration 6, loss = 0.03362318
Validation score: 0.987000
Iteration 7, loss = 0.03313723
Validation score: 0.987182
Iteration 8, loss = 0.03263894
Validation score: 0.987136
Iteration 9, loss = 0.03239824
Validation score: 0.987727
Iteration 10, loss = 0.03200489
Validation score: 0.987818
Iteration 11, loss = 0.03207962
Validation score: 0.987091
Iteration 12, loss = 0.03177240
Validation score: 0.987636
Iteration 13, loss = 0.03176647
Validation score: 0.987136
Iteration 14, loss = 0.03154807
Validation score: 0.987045
Iteration 15, loss = 0.03157636
Validation score: 0.986773
Iteration 16, loss = 0.03155097
Validation score: 0.987591
Iteration 17, loss = 0.03140397
Validation score: 0.987773
Iterat

# Logistic Regression

In [8]:
# L2-regularised logistic regression with strong regularisation (C=0.0001).
# The model is fitted and queried on plain NumPy arrays.
lr = LogisticRegression(C=0.0001, penalty="l2", max_iter=100000, verbose=10)
lr.fit(data_train.to_numpy(), target_train.to_numpy())

# Class probabilities for the test features, scored with the project's MAP_score helper.
prediction = lr.predict_proba(data_test.to_numpy())
lr_map = MAP_score(feature_retrieval['source_id'], target_test, prediction)
print("The MAP score on test set: {:.4f}".format(lr_map))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s finished


The MAP score on test set: 0.8059


# XGBoost

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with the library's default hyper-parameters, fitted on
# plain NumPy arrays.
model = XGBClassifier()
model.fit(data_train.to_numpy(), target_train.to_numpy())

# Predict on the same representation the model was fitted on (a NumPy array,
# not the DataFrame) and pass the probability array to MAP_score as an ndarray,
# exactly as the other model cells in this notebook do.
prediction = model.predict_proba(data_test.to_numpy())
print("The MAP score on test set: {:.4f}".format(MAP_score(feature_retrieval['source_id'],target_test,prediction)))



