In [1]:
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from fastkde import fastkde
from sklearn.ensemble import RandomForestClassifier


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

#### This code should not be run again. It was used to subset the original data.

In [2]:
# importing train data
train = pd.read_csv('data/train.csv')
train

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,527987,0,2,1,8,0,0,0,0,0,...,3,1,1,7,1,1,0,1,0,0
1,47519,0,3,1,3,0,0,1,0,0,...,3,2,3,5,0,0,1,1,1,1
2,938513,0,1,1,1,0,0,1,0,0,...,1,1,1,4,0,1,1,0,0,1
3,279774,0,0,1,2,0,0,1,0,0,...,6,0,5,3,0,1,1,0,0,0
4,232653,0,3,1,2,1,0,0,1,0,...,4,0,5,11,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59516,1207113,1,1,1,7,0,4,0,0,1,...,3,1,1,9,0,0,1,0,1,0
59517,1212728,1,0,1,5,0,0,0,1,0,...,6,2,1,2,0,1,1,0,0,0
59518,757821,1,3,1,7,0,1,1,0,0,...,3,1,3,8,0,0,0,1,0,0
59519,875118,1,0,3,5,1,0,0,1,0,...,7,2,4,6,0,1,1,0,0,0


In [3]:
##subsetting the train data to only be 10% of each class and saving it over the original data
#subset_train = train.groupby('target').sample(frac=0.1)
#subset_train['target'].value_counts()
#subset_train.to_csv('data/train.csv', index=False)
#subset_train

### Setting Up
Loading data, setting X and y, and defining a cross validation function that can be used for all methods.

In [2]:
train = pd.read_csv('data/train.csv', header = 0)

x = train.drop(columns=['target', 'id'])
y = train['target']

In [3]:
def crossvalidate(kf, x, y, model):
    cm_list = []
    accuracy = []
    recall = []
    for train_index, test_index in kf.split(x, y):
        xtrain, xtest = x[train_index], x[test_index]
        ytrain, ytest = y[train_index], y[test_index]
        model.fit(xtrain, ytrain)
        y_pred = model.predict(xtest)
        accuracy.append(accuracy_score(ytest, y_pred))
        recall.append(recall_score(ytest, y_pred, average = 'binary'))
        cm_list.append(confusion_matrix(ytest, y_pred))
    return cm_list, accuracy, recall

### Synthetic Minority Oversampling

In [4]:
oversample = SMOTE()
x_sampled, y_sampled = oversample.fit_resample(x, y)
counter = Counter(y_sampled)

In [5]:
model = KNeighborsClassifier()
kf = StratifiedKFold(n_splits = 5)
scaler = StandardScaler()
x_sampled = scaler.fit_transform(x_sampled)
results, accuracy, recall = crossvalidate(kf, x_sampled, y_sampled, model)

In [6]:
accuracy, recall, results

([0.883876029815614,
  0.8906324920448107,
  0.8907632622815047,
  0.8937273876465717,
  0.8898866608544028],
 [np.float64(0.9970357454228422),
  np.float64(1.0),
  np.float64(0.9999128236422282),
  np.float64(1.0),
  np.float64(1.0)],
 [array([[ 8841,  2630],
         [   34, 11436]]),
  array([[ 8962,  2509],
         [    0, 11470]]),
  array([[ 8965,  2505],
         [    1, 11470]]),
  array([[ 9032,  2438],
         [    0, 11471]]),
  array([[ 8944,  2526],
         [    0, 11470]])])

### ADASYN with FastKDE

In [8]:
# converting X and y to numpy arrays
X = np.array(x) 
y = np.array(y) 

# applying FastKDE to the feature data to smooth and estimate the density
num_points = 257  # setting number of points for KDE
var_names = [f'feature_{i}' for i in range(X.shape[1])]  # assigning names to each feature

# calculating the pdf for each feature in the data, the density estimation for each feature is stored in kde_result
kde_result = {}
for i in range(X.shape[1]):
    kde_result[f'feature_{i}'] = fastkde.pdf(X[:, i], var_names=[var_names[i]], num_points=num_points)

# using ADASYN to generate synthetic data to balance the data
adasyn = ADASYN(sampling_strategy='minority', n_neighbors=5)
X_resampled, y_resampled = adasyn.fit_resample(X, y)

# splitting the resampled data using stratified k-fold
skf = StratifiedKFold(n_splits=5, shuffle=True)

# creating the RandomForest model
model = RandomForestClassifier()  # performs better than KNN

# calling the crossvalidate function with the resampled data
cm_list, accuracy, recall = crossvalidate(skf, X_resampled, y_resampled, model)

# printing average scores across all folds
print("Average Accuracy: ", np.mean(accuracy))
print("Average Recall: ", np.mean(recall))
print("Confusion Matrix (average across folds):\n", np.mean(cm_list, axis=0))

# printing all scores for all folds
print("Accuracy: ", accuracy)
print("Recall: ", recall)
print("Confusion Matrix:\n", cm_list)

Average Accuracy:  0.9808322713890231
Average Recall:  0.9613524902390063
Confusion Matrix (average across folds):
 [[11470.4     0. ]
 [  436.2 10850.4]]
Accuracy:  [0.9825108757744869, 0.9816759678340731, 0.9796546117678077, 0.9800500944764249, 0.9802698070923233]
Recall:  [np.float64(0.9647350699982279), np.float64(0.9630515683147262), np.float64(0.9589793567821387), np.float64(0.9597767342960929), np.float64(0.9602197218038452)]
Confusion Matrix:
 [array([[11471,     0],
       [  398, 10888]]), array([[11471,     0],
       [  417, 10869]]), array([[11470,     0],
       [  463, 10824]]), array([[11470,     0],
       [  454, 10833]]), array([[11470,     0],
       [  449, 10838]])]


### Normalizing Flows

In [9]:
# reimporting data to ensure original dataset is used
train = pd.read_csv('data/train.csv', header = 0)

X = train.drop(columns=['target', 'id'])
y = train['target']

In [11]:
# Define the normalizing flow model (basic version)
class SimpleFlow(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(SimpleFlow, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, input_dim)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def generate_samples(self, n_samples):
        z = torch.randn(n_samples, self.fc1.in_features)
        return self.forward(z)

def generate_minority_samples(X_minority, n_samples):
    # Ensure n_samples is non-negative
    n_samples = max(0, n_samples)
    
    # If no samples need to be generated, return an empty array
    if n_samples == 0:
        return np.empty((0, X_minority.shape[1]))
    
    # Convert DataFrame to NumPy array
    X_minority_np = X_minority.values if isinstance(X_minority, pd.DataFrame) else X_minority
    
    input_dim = X_minority_np.shape[1]
    flow = SimpleFlow(input_dim, hidden_dim=64)
    optimizer = torch.optim.Adam(flow.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()

    X_tensor = torch.tensor(X_minority_np, dtype=torch.float32)
    
    # Training the normalizing flow model on the minority class data
    for epoch in range(200):  # Adjust epochs as needed for convergence
        optimizer.zero_grad()
        output = flow(X_tensor)
        loss = loss_fn(output, X_tensor)
        loss.backward()
        optimizer.step()

    # Generate synthetic samples
    synthetic_samples = flow.generate_samples(n_samples).detach().numpy()
    return synthetic_samples

# Assuming X and y are already defined
# Separate the minority class
class_counts = y.value_counts()
minority_class = class_counts.index[-1]
majority_class = class_counts.index[0]
X_minority = X[y == minority_class]
X_majority = X[y == majority_class]

# Number of synthetic samples to generate (match majority class size)
n_minority_to_generate = X_majority.shape[0] - X_minority.shape[0]
print(f"Generating {n_minority_to_generate} synthetic samples for the minority class")

synthetic_samples = generate_minority_samples(X_minority, n_minority_to_generate)

# Combine the synthetic samples with the original dataset
X_balanced = np.vstack([X.values, synthetic_samples])
y_balanced = np.hstack([y.values, np.full(synthetic_samples.shape[0], minority_class)])

print(f"Original dataset shape: {X.shape}")
print(f"Balanced dataset shape: {X_balanced.shape}")

# Stratified K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True)

# Initialize RandomForestClassifier
model = RandomForestClassifier()

# Perform cross-validation
conf_matrices, accuracies, recalls = crossvalidate(skf, X_balanced, y_balanced, model)

# Display results for each fold
for fold, cm in enumerate(conf_matrices, 1):
    print(f"\nFold {fold} Confusion Matrix:\n{cm}")
    print(f"Accuracy: {accuracies[fold - 1]:.4f}")
    print(f"Recall for Minority Class: {recalls[fold - 1]:.4f}")

# Display overall results
print("\nOverall Results:")
print(f"Stratified K-Fold Cross-Validated Accuracy: {np.mean(accuracies):.4f}")
print(f"Stratified K-Fold Cross-Validated Recall for Minority Class: {np.mean(recalls):.4f}")


Generating 55183 synthetic samples for the minority class
Original dataset shape: (59521, 57)
Balanced dataset shape: (114704, 57)

Fold 1 Confusion Matrix:
[[11471     0]
 [  427 11043]]
Accuracy: 0.9814
Recall for Minority Class: 0.9628

Fold 2 Confusion Matrix:
[[11470     1]
 [  452 11018]]
Accuracy: 0.9803
Recall for Minority Class: 0.9606

Fold 3 Confusion Matrix:
[[11470     0]
 [  441 11030]]
Accuracy: 0.9808
Recall for Minority Class: 0.9616

Fold 4 Confusion Matrix:
[[11470     0]
 [  433 11038]]
Accuracy: 0.9811
Recall for Minority Class: 0.9623

Fold 5 Confusion Matrix:
[[11470     0]
 [  416 11054]]
Accuracy: 0.9819
Recall for Minority Class: 0.9637

Overall Results:
Stratified K-Fold Cross-Validated Accuracy: 0.9811
Stratified K-Fold Cross-Validated Recall for Minority Class: 0.9622
