In [1]:
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import NearMiss

In [2]:
# Load the training split of the campaign data
campaign_data_training = pd.read_csv('../data/campaign-data-training.csv')

# Separate DataFrames holding the negative-class and positive-class observations
neg_class = campaign_data_training.loc[campaign_data_training['clicker'].eq(0)]
pos_class = campaign_data_training.loc[campaign_data_training['clicker'].eq(1)]

# Features and 'clicker' label (GLOBAL); every label value is passed through str()
feature = campaign_data_training.drop(columns = 'clicker')
label = campaign_data_training['clicker'].map(str)

# Features and 'clicker' label (NEGATIVE CLASS only)
feature_neg_class = neg_class.drop(columns = 'clicker')
label_neg_class = neg_class['clicker']

In [3]:
# OVERSAMPLING + UNDERSAMPLING

# SMOTEENN: configured with an explicit SMOTE step and an explicit
# EditedNearestNeighbours step; seeds are fixed so reruns give the same output.
smoteenn = SMOTEENN(
    random_state = 19,
    smote = SMOTE(sampling_strategy = 0.5, k_neighbors = 10, random_state = 19),
    enn = EditedNearestNeighbours(n_neighbors = 8)
)

feature_resampled, label_resampled = smoteenn.fit_resample(feature, label)

# assign() returns a NEW DataFrame, so feature_resampled stays features-only.
# Plain `campaign_data_resampled = feature_resampled` would only alias the frame,
# and adding 'clicker' would then leak the label into feature_resampled as well.
campaign_data_resampled = feature_resampled.assign(clicker = label_resampled)

campaign_data_resampled.to_csv('../data/campaign-data-resampled.csv', index = False)

In [4]:
# UNDERSAMPLING

# Near Miss
def getNearMiss(feature, label, version, sampling_strategy, n_neighbors):
    """Under-sample a dataset with NearMiss and return features and label in one DataFrame.

    Parameters
    ----------
    feature : DataFrame of predictors; its ``columns`` are reused to label the output.
    label : target values, passed to ``fit_resample`` together with ``feature``.
    version, sampling_strategy, n_neighbors : forwarded unchanged to ``NearMiss``.

    Returns
    -------
    DataFrame holding the under-sampled feature columns plus the under-sampled
    label stored in a 'clicker' column.
    """
    sampler = NearMiss(
        version = version,
        sampling_strategy = sampling_strategy,
        n_neighbors = n_neighbors,
    )

    # Apply the under-sampling to the supplied features and labels
    kept_features, kept_labels = sampler.fit_resample(feature, label)

    # Build a new DataFrame from the under-sampled data, named after the input columns
    undersampled = pd.DataFrame(kept_features, columns = feature.columns)
    undersampled['clicker'] = kept_labels
    return undersampled

# NearMiss version 1 on the full feature/label set, then export the result to CSV
campaign_data_nm1 = getNearMiss(
    feature = feature,
    label = label,
    version = 1,
    sampling_strategy = 0.7,
    n_neighbors = 5,
)
campaign_data_nm1.to_csv('../data/campaign-data-undersampled.csv', index = False)


In [5]:
# OVERSAMPLING

# SMOTE with a fixed seed so reruns give the same output
smote = SMOTE(sampling_strategy = 0.5, k_neighbors = 10, random_state = 19)

feature_oversampled, label_oversampled = smote.fit_resample(feature, label)

# assign() returns a NEW DataFrame, so feature_oversampled stays features-only.
# Plain `campaign_data_smote = feature_oversampled` would only alias the frame,
# and adding 'clicker' would then leak the label into feature_oversampled as well.
campaign_data_smote = feature_oversampled.assign(clicker = label_oversampled)

campaign_data_smote.to_csv('../data/campaign-data-oversampled.csv', index = False)