In [2]:
import pandas as pd
import scipy as sp
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from scipy.stats import chi2
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import recall_score, f1_score, log_loss, roc_curve, roc_auc_score

In [3]:
""" 
Funzione che calcola la distanza di Mahalanobis tra ogni riga del dataframe X in input ed il dataframe data (distribuzione) 

X -> dataframe/osservazioni per le quali calcolare la distanza di Mahalanobis dalla distribuzione
data -> dataframe con la distribuzione di partenza
cov -> covariance matrix della distribuzione di partenza (if None, viene computata direttamente sulla distribuzione iniziale in input)

return -> la distanza di Mahalanobis delle osservazioni di X rispetto alla distribuzione data
"""
def computeMahalanobis(X = None, data = None, cov = None):
    """
    Regularized squared Mahalanobis distance of each observation in X from the
    distribution described by `data`.

    The returned value is the quadratic form (x - mu)^T * inv(cov) * (x - mu);
    no square root is taken.

    Parameters
    ----------
    X : pandas.DataFrame (or a single-row pandas.Series)
        Observations to score. Columns are aligned by name with `data`.
    data : pandas.DataFrame
        Reference distribution; its column means are used as the centre.
    cov : array-like of shape (n_features, n_features), optional
        Covariance matrix of the reference distribution. When None it is
        estimated from `data` and 0.01 is added on the diagonal so that a
        singular covariance matrix can still be inverted.

    Returns
    -------
    numpy.ndarray of shape (n_observations,)
        Distances rounded to 3 decimals.
    """

    # `cov` may be an array, whose truth value is ambiguous: compare against None explicitly
    if cov is None:
        cov = np.cov(data.values.T) + 0.01 * np.eye(len(data.columns))

    # Column-wise means, requested explicitly: np.mean(DataFrame) forwards axis=None,
    # which recent pandas versions reduce over both axes to a single scalar
    center = data.mean(axis = 0)

    # The subtraction stays in pandas so columns are aligned by name before dropping to NumPy
    centered = np.atleast_2d(np.asarray(X - center, dtype = float))
    inv_cov = np.linalg.inv(np.asarray(cov, dtype = float))

    # Row-wise quadratic form: same values as the diagonal of the full
    # (n_observations x n_observations) product, without materialising that matrix
    return np.round(np.sum((centered @ inv_cov) * centered, axis = 1), 3)

In [4]:
# Read the training-set variants and keep them in a list; the list order is
# relied upon by the cells that index into campaign_data_array
campaign_data_array = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/campaign-data-training.csv',
        '../data/campaign-data-undersampled.csv',
        '../data/campaign-data-oversampled.csv',
        '../data/campaign-data-resampled.csv',
    )
]

(campaign_data_basic,
 campaign_data_undersampled,
 campaign_data_oversampled,
 campaign_data_resampled) = campaign_data_array

# Testing set, restricted to the columns that also exist in the basic training set
testing_set = pd.read_csv('../data/campaign-data-testing.csv')
shared_columns_mask = testing_set.columns.isin(campaign_data_basic.columns)
testing_set = testing_set.loc[:, shared_columns_mask]

In [5]:
# Binary classifier based on the Mahalanobis distance
class MahalanobisBinaryClassifier():
    """
    Binary classifier that scores an observation by its distance (as returned by
    computeMahalanobis) from the negative-class and positive-class training
    observations: the closer class receives the larger probability.
    """

    # Constructor: split the training features by class label
    def __init__(self, feature_train, label_train):
        """
        feature_train -> dataframe with the training features
        label_train -> labels of the training observations (1 = positive class, 0 = negative class)
        """

        self.feature_train_pos = feature_train.loc[label_train == 1, :]
        self.feature_train_neg = feature_train.loc[label_train == 0, :]

    # Returns the probabilities of belonging to the negative and positive class [prob_neg, prob_pos]
    def predict_proba(self, feature_test):
        """
        feature_test -> dataframe with the observations to score

        return -> array of shape (n_observations, 2) with columns [prob_neg, prob_pos],
                  where prob_c = 1 - dist_c / (dist_neg + dist_pos)
        """

        dist_neg = np.asarray(computeMahalanobis(feature_test, self.feature_train_neg), dtype = float)
        dist_pos = np.asarray(computeMahalanobis(feature_test, self.feature_train_pos), dtype = float)
        dist_total = dist_neg + dist_pos

        # An observation at zero distance from both classes would give 0/0 = NaN:
        # treat it as equidistant and assign 0.5 to each class
        undecidable = dist_total == 0
        safe_total = np.where(undecidable, 1.0, dist_total)

        prob_neg = np.where(undecidable, 0.5, 1 - dist_neg / safe_total)
        prob_pos = np.where(undecidable, 0.5, 1 - dist_pos / safe_total)

        return np.column_stack((prob_neg, prob_pos))

    # Returns the class predicted by the model
    def predict(self, feature_test):
        """
        feature_test -> dataframe with the observations to classify

        return -> array with the index of the most probable class (0 = negative, 1 = positive);
                  ties resolve to 0
        """

        return np.argmax(self.predict_proba(feature_test), axis = 1)

In [6]:
# Function which performs both final training and testing
def trainingAndTesting(training_set, test_set = None):
    """
    Fit the Mahalanobis classifier on `training_set` and evaluate it on a testing set.

    Parameters
    ----------
    training_set : pandas.DataFrame
        Training observations, including the binary label column 'clicker'.
    test_set : pandas.DataFrame, optional
        Testing observations with the same columns. When None, the notebook-level
        `testing_set` is used, so existing one-argument calls behave as before.

    Returns
    -------
    dict with keys
        'recall'       -> recall of the predicted classes, rounded to 3 decimals
        'f1_score'     -> F1 score of the predicted classes, rounded to 3 decimals
        'deciles_data' -> dataframe with, for each of the ten equal-width probability
                          bins, the frequency and ratio of positive observations (binned
                          on their positive-class probability) and of negative
                          observations (binned on their negative-class probability)
        'log_loss'     -> log loss of the predicted probabilities, rounded to 3 decimals
        'ROC'          -> [false positive rates, true positive rates, thresholds, AUC]
    """

    if test_set is None:
        test_set = testing_set

    # Feature and label 'clicker' for both training and testing set
    label_train = training_set['clicker'].copy()
    label_test = test_set['clicker'].copy()

    feature_train = training_set.drop('clicker', axis = 1).copy()
    feature_test = test_set.drop('clicker', axis = 1).copy()

    feature_columns = feature_train.columns

    # Standardization (fitted on the training features only)
    scaler = StandardScaler()
    scaler.fit(feature_train)

    # Keep the original index: the classifier selects rows with a boolean mask built
    # from label_train, which pandas aligns by index label
    feature_train = pd.DataFrame(scaler.transform(feature_train), columns = feature_columns, index = feature_train.index)
    feature_test = pd.DataFrame(scaler.transform(feature_test), columns = feature_columns, index = feature_test.index)

    # Training of the Mahalanobis classifier
    mahal_classifier = MahalanobisBinaryClassifier(feature_train, label_train)

    # Testing: the class is the argmax of the probabilities (the same rule used by
    # MahalanobisBinaryClassifier.predict), so the distances are computed only once
    pred_probs = mahal_classifier.predict_proba(feature_test)
    pred_class = np.argmax(pred_probs, axis = 1)

    testing_data = pd.DataFrame({

        'actual': label_test.tolist(),
        'pred_class': pred_class,
        'pred_prob_neg': pred_probs[:, 0],
        'pred_prob_pos': pred_probs[:, 1]

    })

    # Dataframe with only positive and negative class observations
    positive_class = testing_data[testing_data['actual'] == 1].copy()
    negative_class = testing_data[testing_data['actual'] == 0].copy()

    # Ten equal-width bins over [0, 1] of the predicted probabilities, labelled 1..10
    bin_edges = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    bin_labels = np.arange(1, 11)
    positive_class['deciles'] = pd.cut(positive_class['pred_prob_pos'], bin_edges, include_lowest = True, labels = bin_labels)
    negative_class['deciles'] = pd.cut(negative_class['pred_prob_neg'], bin_edges, include_lowest = True, labels = bin_labels)

    # Frequency and ratio of the observations of each class in every bin
    deciles_data = pd.DataFrame({
        'pos': positive_class['deciles'].value_counts(sort = False),
        'neg': negative_class['deciles'].value_counts(sort = False)
    })

    deciles_data = deciles_data.reset_index()
    deciles_data['ratio_pos'] = (deciles_data['pos'] / len(positive_class)).round(3)
    deciles_data['ratio_neg'] = (deciles_data['neg'] / len(negative_class)).round(3)

    deciles_data.columns = ['decile_interval', 'freq_pos', 'freq_neg', 'ratio_pos', 'ratio_neg']

    # Performance metrics computation
    logloss = round(log_loss(y_true = testing_data['actual'],
                             y_pred = np.column_stack((testing_data['pred_prob_neg'], testing_data['pred_prob_pos']))), 3)

    # ROC/AUC need a continuous score: with hard 0/1 predictions the curve collapses to a
    # single operating point, so the positive-class probability is used instead
    fp_ratio, tp_ratio, thresholds = roc_curve(testing_data['actual'], testing_data['pred_prob_pos'])
    roc_auc = roc_auc_score(testing_data['actual'], testing_data['pred_prob_pos'])

    return {
        'recall': round(recall_score(testing_data['actual'], testing_data['pred_class']), 3),
        'f1_score': round(f1_score(testing_data['actual'], testing_data['pred_class']), 3),
        'deciles_data': deciles_data,
        'log_loss': logloss,
        'ROC': [ fp_ratio, tp_ratio, thresholds, roc_auc ]
    }

In [7]:
# Train and evaluate once per training-set variant, following the order of campaign_data_array
(basic_testing_result,
 nm1_testing_result,
 smote_testing_result,
 smoteenn_testing_result) = [
    trainingAndTesting(training_set = campaign_data_array[position])
    for position in range(4)
]

In [8]:
# Recall on the testing set, one row per training-set variant
recall_testing = pd.DataFrame([
    { 'data': variant, 'recall_testing': outcome['recall'] }
    for variant, outcome in [
        ('basic', basic_testing_result),
        ('nm1', nm1_testing_result),
        ('smote', smote_testing_result),
        ('smoteenn', smoteenn_testing_result),
    ]
])

recall_testing

Unnamed: 0,data,recall_testing
0,basic,0.397
1,nm1,0.941
2,smote,0.074
3,smoteenn,0.147


In [9]:
# F1 score on the testing set, one row per training-set variant
f1_score_testing = pd.DataFrame([
    { 'data': variant, 'f1_score_testing': outcome['f1_score'] }
    for variant, outcome in [
        ('basic', basic_testing_result),
        ('nm1', nm1_testing_result),
        ('smote', smote_testing_result),
        ('smoteenn', smoteenn_testing_result),
    ]
])

f1_score_testing

Unnamed: 0,data,f1_score_testing
0,basic,0.009
1,nm1,0.007
2,smote,0.007
3,smoteenn,0.011


In [10]:
# Log loss on the testing set, one row per training-set variant
log_loss_testing = pd.DataFrame([
    { 'data': variant, 'log_loss_testing': outcome['log_loss'] }
    for variant, outcome in [
        ('basic', basic_testing_result),
        ('nm1', nm1_testing_result),
        ('smote', smote_testing_result),
        ('smoteenn', smoteenn_testing_result),
    ]
])

log_loss_testing

Unnamed: 0,data,log_loss_testing
0,basic,0.593
1,nm1,2.299
2,smote,0.372
3,smoteenn,0.374


In [11]:
# Per-bin ratios of positive and negative observations, side by side for every
# training-set variant; the bin labels are taken from the basic result
deciles_testing = pd.DataFrame({
    'decile_interval': basic_testing_result['deciles_data']['decile_interval'],
    **{
        'ratio_' + target + '_' + variant: outcome['deciles_data']['ratio_' + target]
        for variant, outcome in [
            ('basic', basic_testing_result),
            ('nm1', nm1_testing_result),
            ('smote', smote_testing_result),
            ('smoteenn', smoteenn_testing_result),
        ]
        for target in ('pos', 'neg')
    }
})

deciles_testing

Unnamed: 0,decile_interval,ratio_pos_basic,ratio_neg_basic,ratio_pos_nm1,ratio_neg_nm1,ratio_pos_smote,ratio_neg_smote,ratio_pos_smoteenn,ratio_neg_smoteenn
0,1,0.029,0.003,0.044,0.459,0.059,0.002,0.059,0.002
1,2,0.029,0.014,0.0,0.167,0.132,0.004,0.132,0.004
2,3,0.015,0.013,0.0,0.102,0.206,0.016,0.176,0.016
3,4,0.044,0.021,0.0,0.083,0.294,0.014,0.294,0.014
4,5,0.485,0.222,0.015,0.05,0.235,0.028,0.191,0.051
5,6,0.324,0.438,0.044,0.031,0.0,0.102,0.074,0.098
6,7,0.059,0.119,0.029,0.009,0.029,0.288,0.029,0.255
7,8,0.015,0.041,0.029,0.01,0.044,0.261,0.044,0.268
8,9,0.0,0.031,0.088,0.012,0.0,0.167,0.0,0.169
9,10,0.0,0.098,0.75,0.076,0.0,0.118,0.0,0.124


In [12]:
# Share of observations falling in the five highest probability bins.
# Only the ratio columns are summed: 'decile_interval' holds bin labels, and recent
# pandas versions raise instead of silently skipping a column that cannot be summed
deciles_testing.filter(like = 'ratio_').tail(5).sum()

ratio_pos_basic       0.398
ratio_neg_basic       0.727
ratio_pos_nm1         0.940
ratio_neg_nm1         0.138
ratio_pos_smote       0.073
ratio_neg_smote       0.936
ratio_pos_smoteenn    0.147
ratio_neg_smoteenn    0.914
dtype: float64

In [13]:
# Save the decile ratio table as CSV, without the row index
deciles_testing.to_csv(path_or_buf = '../data/deciles-testing-mahalanobis.csv', index = False)