In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import recall_score, f1_score, log_loss, roc_curve, roc_auc_score

In [None]:
# Load the initial datasets (plain training set plus its undersampled, oversampled
# and resampled variants) and keep them together in a list
campaign_data_array = [
    pd.read_csv('../data/campaign-data-training.csv'),
    pd.read_csv('../data/campaign-data-undersampled.csv'),
    pd.read_csv('../data/campaign-data-oversampled.csv'),
    pd.read_csv('../data/campaign-data-resampled.csv'),
]

# Individual names for each variant, in the same order as the list above
(
    campaign_data_basic,
    campaign_data_undersampled,
    campaign_data_oversampled,
    campaign_data_resampled,
) = campaign_data_array

# Testing set, restricted to the columns that also exist in the plain training set
testing_set = pd.read_csv('../data/campaign-data-testing.csv')
testing_set = testing_set.loc[:, testing_set.columns.isin(campaign_data_basic.columns)]

In [None]:
# Function which performs the k-fold cross validation over a training set and return the related results
def kfoldCrossValidation(training_set, n_folds, n_estimators_grid = range(100, 501, 100)):
    """
    Run a stratified k-fold cross validation of a Random Forest for several forest sizes.

    Parameters
    ----------
    training_set : pandas.DataFrame
        Training data. The column 'clicker' is used as label, every other column as a feature.
    n_folds : int
        Number of stratified folds.
    n_estimators_grid : iterable of int, optional
        Forest sizes (n_estimators) to evaluate. Defaults to 100, 200, ..., 500.

    Returns
    -------
    dict
        Maps '<n_estimators>_avg_f1_score' to the mean validation F1-score over
        the folds, rounded to 3 decimals.
    """

    # Feature and label 'clicker'
    feature = training_set.drop('clicker', axis = 1).to_numpy()
    label = training_set['clicker'].to_numpy()

    # Stratified k-fold cross validation (shuffled with a fixed seed)
    stratified_kfold = StratifiedKFold(n_splits = n_folds, random_state = 19, shuffle = True)

    results = {}

    # RANDOM FOREST CLASSIFICATION - K-FOLD CROSS VALIDATION (one run per value of n_estimators_grid)
    for n_estimators in n_estimators_grid:

        # Progress indicator for the grid loop
        print(n_estimators)

        # Random Forest Classifier; fit() below retrains it from scratch on every fold
        # because warm_start is left at its default
        rf_classifier = RandomForestClassifier(
            criterion = 'entropy', n_estimators = n_estimators,
            class_weight = 'balanced_subsample', random_state = 19
        )

        # Unrounded F1-score of each fold
        f1_scores = []

        # Splitting in training and validation set
        for train, valid in stratified_kfold.split(feature, label):

            # Training
            rf_classifier.fit(feature[train], label[train])

            # Validation prediction
            valid_pred_class = rf_classifier.predict(feature[valid])

            # Keep full precision here: rounding every fold before averaging
            # would round the final figure twice
            f1_scores.append(f1_score(label[valid], valid_pred_class))

        # Round only once, on the averaged score
        results[(str(n_estimators) + '_avg_f1_score')] = round(np.mean(np.array(f1_scores)), 3)

    return results

In [None]:
# Function which performs both final training and testing
def trainingAndTesting(n_estimators, training_set, test_set = None):
    """
    Train a Random Forest on a training set and evaluate it on a testing set.

    Parameters
    ----------
    n_estimators : int
        Number of trees of the forest.
    training_set : pandas.DataFrame
        Training data. The column 'clicker' is the label, every other column a feature.
    test_set : pandas.DataFrame, optional
        Testing data with the same layout. When omitted, the notebook-level
        `testing_set` is used, as before.

    Returns
    -------
    dict
        'recall' and 'f1_score' (rounded to 3 decimals, computed on the predicted classes),
        'deciles_data' (per-bin frequencies and ratios of both classes),
        'log_loss' (rounded to 3 decimals) and
        'ROC' as [false positive rates, true positive rates, thresholds, AUC].
    """

    # Fall back to the notebook-level testing set when none is given explicitly
    if test_set is None:
        test_set = testing_set

    # Feature and label 'clicker' for both training and testing set
    label_train = training_set['clicker'].to_numpy()
    feature_train = training_set.drop('clicker', axis = 1).to_numpy()
    label_test = test_set['clicker'].to_numpy()
    feature_test = test_set.drop('clicker', axis = 1).to_numpy()

    # Training
    rf_classifier = RandomForestClassifier(
        criterion = 'entropy', n_estimators = n_estimators,
        class_weight = 'balanced_subsample', random_state = 19
    )
    rf_classifier.fit(feature_train, label_train)

    # Testing
    test_pred_prob = rf_classifier.predict_proba(feature_test)
    test_pred_class = rf_classifier.predict(feature_test)

    # NOTE(review): columns 0 and 1 of predict_proba are taken as the negative and
    # positive class respectively - this assumes 'clicker' is encoded as 0/1
    testing_data = pd.DataFrame({

        'actual': label_test.tolist(),
        'pred_class': test_pred_class,
        'pred_prob_neg': test_pred_prob[:, 0],
        'pred_prob_pos': test_pred_prob[:, 1]

    })

    # Dataframe with only positive and negative class observations
    positive_class = testing_data[testing_data['actual'] == 1].copy()
    negative_class = testing_data[testing_data['actual'] == 0].copy()

    # Ten equal-width bins of the predicted probability of the *true* class,
    # labelled 1 (lowest probabilities) to 10 (highest probabilities)
    deciles = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    positive_class['deciles'] = pd.cut(positive_class['pred_prob_pos'], deciles, include_lowest = True, labels = np.arange(1,11))
    negative_class['deciles'] = pd.cut(negative_class['pred_prob_neg'], deciles, include_lowest = True, labels = np.arange(1,11))

    # Deciles data with frequency and ratio of observation of each class in the deciles
    deciles_data = pd.DataFrame({
        'pos': positive_class['deciles'].value_counts(sort = False),
        'neg': negative_class['deciles'].value_counts(sort = False)
    })

    # Turn the bin labels into a regular column (no in-place mutation)
    deciles_data = deciles_data.reset_index()
    deciles_data['ratio_pos'] = deciles_data.apply(lambda row: round(row['pos'] / len(positive_class), 3), axis = 1)
    deciles_data['ratio_neg'] = deciles_data.apply(lambda row: round(row['neg'] / len(negative_class), 3), axis = 1)

    deciles_data.columns = ['decile', 'freq_pos', 'freq_neg', 'ratio_pos', 'ratio_neg']

    # Performance metrics computation
    logloss = round(log_loss(y_true = testing_data['actual'],
                             y_pred = np.column_stack((testing_data['pred_prob_neg'], testing_data['pred_prob_pos']))), 3)

    # ROC curve and AUC are ranking metrics: they need the positive-class probability.
    # Feeding the hard 0/1 predictions would collapse the curve to a single
    # operating point and understate the AUC.
    fp_ratio, tp_ratio, thresholds = roc_curve(testing_data['actual'], testing_data['pred_prob_pos'])
    roc_auc = roc_auc_score(testing_data['actual'], testing_data['pred_prob_pos'])

    return {
        'recall': round(recall_score(testing_data['actual'], testing_data['pred_class']), 3),
        'f1_score': round(f1_score(testing_data['actual'], testing_data['pred_class']), 3),
        'deciles_data': deciles_data,
        'log_loss': logloss,
        'ROC': [ fp_ratio, tp_ratio, thresholds, roc_auc ]
    }

In [None]:
# 3-fold cross validation of every training-set variant, in the order of campaign_data_array
campaign_data_results_kfold = [
    kfoldCrossValidation(training_variant, 3)
    for training_variant in campaign_data_array
]

In [None]:
# Results obtained with the k-fold cross validation, one dictionary per training-set variant.
# The trailing numbers are presumably the n_estimators value retained for that variant
# (they match the values used in the following cells).
(
    campaign_data_basic_results_kfold,         # 100
    campaign_data_undersampled_results_kfold,  # 300
    campaign_data_oversampled_results_kfold,   # 100
    campaign_data_resampled_results_kfold,     # 300
) = campaign_data_results_kfold[:4]

In [None]:
# One-row summary: average k-fold F1-score of the retained forest size for each variant
f1_score_kfold = pd.DataFrame(
    [
        {
            'basic': campaign_data_basic_results_kfold['100_avg_f1_score'],
            'nm1': campaign_data_undersampled_results_kfold['300_avg_f1_score'],
            'smote': campaign_data_oversampled_results_kfold['100_avg_f1_score'],
            'smoteenn': campaign_data_resampled_results_kfold['300_avg_f1_score'],
        }
    ],
    index = ['f1_score_kfold'],
)

In [None]:
# Display the k-fold F1-score summary (one column per training-set variant)
f1_score_kfold

In [4]:
# Final training and testing of every variant; the forest sizes are listed in the
# same order as campaign_data_array (basic, undersampled, oversampled, resampled)
(
    basic_testing_result,
    nm1_testing_result,
    smote_testing_result,
    smoteenn_testing_result,
) = [
    trainingAndTesting(n_estimators = forest_size, training_set = campaign_data_array[position])
    for position, forest_size in enumerate([100, 300, 100, 300])
]

In [5]:
# RECALL TESTING RESULTS
# One row per training-set variant, built column by column
recall_testing = pd.DataFrame({
    'data': ['basic', 'nm1', 'smote', 'smoteenn'],
    'recall_testing': [
        basic_testing_result['recall'],
        nm1_testing_result['recall'],
        smote_testing_result['recall'],
        smoteenn_testing_result['recall'],
    ],
})

recall_testing

Unnamed: 0,data,recall_testing
0,basic,0.015
1,nm1,0.956
2,smote,0.029
3,smoteenn,0.132


In [6]:
# F1-SCORE TESTING RESULTS
# One row per training-set variant, built column by column
f1_score_testing = pd.DataFrame({
    'data': ['basic', 'nm1', 'smote', 'smoteenn'],
    'f1_score_testing': [
        basic_testing_result['f1_score'],
        nm1_testing_result['f1_score'],
        smote_testing_result['f1_score'],
        smoteenn_testing_result['f1_score'],
    ],
})

f1_score_testing

Unnamed: 0,data,f1_score_testing
0,basic,0.002
1,nm1,0.009
2,smote,0.003
3,smoteenn,0.007


In [7]:
# LOG LOSS TESTING RESULTS
# One row per training-set variant, built column by column
log_loss_testing = pd.DataFrame({
    'data': ['basic', 'nm1', 'smote', 'smoteenn'],
    'log_loss_testing': [
        basic_testing_result['log_loss'],
        nm1_testing_result['log_loss'],
        smote_testing_result['log_loss'],
        smoteenn_testing_result['log_loss'],
    ],
})

log_loss_testing

Unnamed: 0,data,log_loss_testing
0,basic,0.122
1,nm1,3.406
2,smote,0.187
3,smoteenn,1.206


In [8]:
# DECILES TESTING RESULTS
# Start from the bin labels of the basic run and append, variant by variant,
# the share of positive and negative observations falling in each bin
deciles_testing = basic_testing_result['deciles_data'][['decile']].assign(
    ratio_pos_basic = basic_testing_result['deciles_data']['ratio_pos'],
    ratio_neg_basic = basic_testing_result['deciles_data']['ratio_neg'],
    ratio_pos_nm1 = nm1_testing_result['deciles_data']['ratio_pos'],
    ratio_neg_nm1 = nm1_testing_result['deciles_data']['ratio_neg'],
    ratio_pos_smote = smote_testing_result['deciles_data']['ratio_pos'],
    ratio_neg_smote = smote_testing_result['deciles_data']['ratio_neg'],
    ratio_pos_smoteenn = smoteenn_testing_result['deciles_data']['ratio_pos'],
    ratio_neg_smoteenn = smoteenn_testing_result['deciles_data']['ratio_neg'],
)

deciles_testing

Unnamed: 0,decile,ratio_pos_basic,ratio_neg_basic,ratio_pos_nm1,ratio_neg_nm1,ratio_pos_smote,ratio_neg_smote,ratio_pos_smoteenn,ratio_neg_smoteenn
0,1,0.956,0.0,0.015,0.408,0.75,0.017,0.603,0.051
1,2,0.015,0.0,0.0,0.114,0.118,0.015,0.118,0.02
2,3,0.015,0.002,0.029,0.089,0.088,0.002,0.103,0.013
3,4,0.0,0.024,0.0,0.06,0.0,0.005,0.015,0.019
4,5,0.0,0.012,0.0,0.063,0.015,0.014,0.029,0.016
5,6,0.015,0.004,0.088,0.076,0.015,0.012,0.029,0.023
6,7,0.0,0.006,0.0,0.047,0.0,0.014,0.044,0.028
7,8,0.0,0.002,0.059,0.038,0.0,0.021,0.0,0.04
8,9,0.0,0.006,0.118,0.046,0.0,0.058,0.0,0.072
9,10,0.0,0.943,0.691,0.059,0.015,0.842,0.059,0.72


In [9]:
# Share of each class whose true-class probability falls in the five highest bins (6-10).
# numeric_only = True leaves out the 'decile' column explicitly: it holds bin labels,
# not a quantity to add up, and newer pandas no longer drops such columns silently.
deciles_testing.tail(5).sum(numeric_only = True)

  deciles_testing.tail(5).sum()


ratio_pos_basic       0.015
ratio_neg_basic       0.961
ratio_pos_nm1         0.956
ratio_neg_nm1         0.266
ratio_pos_smote       0.030
ratio_neg_smote       0.947
ratio_pos_smoteenn    0.132
ratio_neg_smoteenn    0.883
dtype: float64

In [10]:
# Persist the decile comparison table (row index not written)
deciles_testing.to_csv(
    path_or_buf = '../data/deciles-testing-random-forest.csv',
    index = False,
)