In [1]:
import pandas as pd
import numpy as np
import pickle
from imblearn.over_sampling import SMOTENC

In [2]:
# Load data
# Reads the pickled training set; later cells index it with df['fraud_bool'],
# so it is expected to be a DataFrame carrying that label column.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle('data/train.pkl')

In [3]:
# Partition the rows by label value:
#   fraud_bool == 1 -> df_minority, fraud_bool == 0 -> df_majority
df_minority = df.loc[df['fraud_bool'].eq(1)]
df_majority = df.loc[df['fraud_bool'].eq(0)]

In [48]:
# Build the frame that the synthetic validation set is generated from:
# every minority row plus a random 30% sample of the majority rows.
# (sample(frac=0.3) KEEPS 30% of df_majority; it does not remove 30%.)
# random_state is fixed so that Restart & Run All reproduces the same sample.
# NOTE: this rebinds `df`. Re-running the minority/majority split cell after this
# one would split the reduced frame, so reload the data first if you need to re-run it.
majority_class_sampled = df_majority.sample(frac=0.3, random_state=42)
df = pd.concat([df_minority, majority_class_sampled])

In [None]:
# Random downsample of train set:
# keep every minority row and draw an equally sized sample of majority rows.
print(len(df_minority))
print(len(df_majority))

# Draw the training majority rows only from rows that were NOT selected for the
# validation frame (majority_class_sampled), so that no real majority row is
# shared between train and validation. random_state makes the draw reproducible.
majority_samples = (
    df_majority
    .drop(index=majority_class_sampled.index)
    .sample(len(df_minority), random_state=42)
)
train = pd.concat([df_minority, majority_samples])

# Fraud_bool classes (expected: equal counts for both labels)
print(train['fraud_bool'].value_counts())

In [8]:
# Separate the features (X) from the 'fraud_bool' label (y) for both frames,
# as required by the resampler's fit_resample(X, y) call.
X_Val, y_Val = df.drop(columns='fraud_bool'), df['fraud_bool']
X_train, y_train = train.drop(columns='fraud_bool'), train['fraud_bool']


In [9]:
# Collect the boolean-typed feature columns and translate each column label
# into its positional index; SMOTENC is given these positions as its categorical features.
bool_cols = X_Val.select_dtypes(include='bool').columns
bool_cols_ind = list(map(X_Val.columns.get_loc, bool_cols))


In [None]:
# Create the synthetic validation set.
# SMOTENC: SMOTE variant for data mixing nominal and continuous features;
# the bool column positions are passed as the nominal (categorical) ones.
smote = SMOTENC(random_state=42, categorical_features=bool_cols_ind)
X_val_resampled, y_val_resampled = smote.fit_resample(X_Val, y_Val)

# Label first, then the features, joined side by side
synthetic_val = pd.concat([y_val_resampled, X_val_resampled], axis='columns')

# Class balance before vs. after resampling
print("Before SMOTENC:", y_Val.value_counts())
print("After SMOTENC:", pd.Series(y_val_resampled).value_counts())

In [None]:
# SMOTE upsampling train set
# Synthetic minority over-sampling technique for nominal and continuous features (SMOTENC)
smote = SMOTENC(categorical_features=bool_cols_ind, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Rebuild `train` from the RESAMPLED output (label first, then features), the same
# way the synthetic validation frame is assembled. Concatenating the untouched
# y_train / X_train here would silently discard everything SMOTENC produced.
train = pd.concat([y_train_resampled, X_train_resampled], axis=1)

# NOTE(review): `train` was already balanced by the random downsampling cell, so
# SMOTENC presumably has little or nothing to add - compare the two counts below.
print("Before SMOTENC:", y_train.value_counts())
print("After SMOTENC:", pd.Series(y_train_resampled).value_counts())

In [13]:
# Remove all train set samples from val.
# DataFrame.isin(<DataFrame>) compares cell by cell at matching index AND column
# labels, so it cannot answer "does this row occur anywhere in train?"; combined
# with .any(axis=1) it dropped rows on a single coincidental cell match.
# Instead do a row-wise anti-join: left-merge on every column and keep only the
# validation rows that found no identical row in train ('left_only').
# train is de-duplicated first so the left merge cannot multiply validation rows.
# NOTE: every row of df_minority is in both frames, so all real minority rows are
# removed here and only SMOTENC-generated minority rows remain in validation.
synthetic_val = (
    synthetic_val
    .merge(
        train[list(synthetic_val.columns)].drop_duplicates(),
        how='left',
        on=list(synthetic_val.columns),
        indicator=True,
    )
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns='_merge')
)

In [14]:
# Convert bool to int
# The bool columns are detected on synthetic_val and the same set is cast in train.
bool_columns = synthetic_val.select_dtypes(include='bool').columns

# Rebind each frame via astype(mapping) instead of assigning into its columns:
# synthetic_val comes out of a boolean row filter, and column assignment on such a
# frame can raise pandas' SettingWithCopyWarning. Rebinding also keeps this cell
# free of in-place mutation.
synthetic_val = synthetic_val.astype(dict.fromkeys(bool_columns, int))
train = train.astype(dict.fromkeys(bool_columns, int))

In [15]:
# Persist both result frames as pickles under data/:
#   train         -> data/sampled_train.pkl
#   synthetic_val -> data/smote_val.pkl
train.to_pickle('data/sampled_train.pkl')
synthetic_val.to_pickle('data/smote_val.pkl')