In [1]:
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from fastkde import fastkde
from sklearn.ensemble import RandomForestClassifier


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### This code should not be run again. It was used to subset the original data.

In [2]:
# importing train data
train = pd.read_csv('data/train.csv')
train

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,527987,0,2,1,8,0,0,0,0,0,...,3,1,1,7,1,1,0,1,0,0
1,47519,0,3,1,3,0,0,1,0,0,...,3,2,3,5,0,0,1,1,1,1
2,938513,0,1,1,1,0,0,1,0,0,...,1,1,1,4,0,1,1,0,0,1
3,279774,0,0,1,2,0,0,1,0,0,...,6,0,5,3,0,1,1,0,0,0
4,232653,0,3,1,2,1,0,0,1,0,...,4,0,5,11,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59516,1207113,1,1,1,7,0,4,0,0,1,...,3,1,1,9,0,0,1,0,1,0
59517,1212728,1,0,1,5,0,0,0,1,0,...,6,2,1,2,0,1,1,0,0,0
59518,757821,1,3,1,7,0,1,1,0,0,...,3,1,3,8,0,0,0,1,0,0
59519,875118,1,0,3,5,1,0,0,1,0,...,7,2,4,6,0,1,1,0,0,0


In [3]:
##subsetting the train data to only be 10% of each class and saving it over the original data
#subset_train = train.groupby('target').sample(frac=0.1)
#subset_train['target'].value_counts()
#subset_train.to_csv('data/train.csv', index=False)
#subset_train

### Setting Up
Loading data, setting X and y, and defining a cross validation function that can be used for all methods.

In [4]:
train = pd.read_csv('data/train.csv', header = 0)

x = train.drop(columns=['target', 'id'])
y = train['target']

In [5]:
def crossvalidate(kf, x, y, model):
    cm_list = []
    accuracy = []
    recall = []
    for train_index, test_index in kf.split(x, y):
        xtrain, xtest = x[train_index], x[test_index]
        ytrain, ytest = y[train_index], y[test_index]
        model.fit(xtrain, ytrain)
        y_pred = model.predict(xtest)
        accuracy.append(accuracy_score(ytest, y_pred))
        recall.append(recall_score(ytest, y_pred, average = 'binary'))
        cm_list.append(confusion_matrix(ytest, y_pred))
    return cm_list, accuracy, recall

### Synthetic Minority Oversampling

In [6]:
oversample = SMOTE()
x_sampled, y_sampled = oversample.fit_resample(x, y)
counter = Counter(y_sampled)

In [7]:
model = KNeighborsClassifier()
kf = StratifiedKFold(n_splits = 5)
scaler = StandardScaler()
x_sampled = scaler.fit_transform(x_sampled)
results, accuracy, recall = crossvalidate(kf, x_sampled, y_sampled, model)

In [8]:
accuracy, recall, results

([0.8852709123403514,
  0.8911555729915871,
  0.8881042674687242,
  0.8934658471731833,
  0.8900610287707061],
 [np.float64(0.997384481255449),
  np.float64(1.0),
  np.float64(0.9999128236422282),
  np.float64(1.0),
  np.float64(1.0)],
 [array([[ 8869,  2602],
         [   30, 11440]]),
  array([[ 8974,  2497],
         [    0, 11470]]),
  array([[ 8904,  2566],
         [    1, 11470]]),
  array([[ 9026,  2444],
         [    0, 11471]]),
  array([[ 8948,  2522],
         [    0, 11470]])])

### ADASYN with FastKDE

In [9]:
# converting X and y to numpy arrays
X = np.array(x) 
y = np.array(y) 

# applying FastKDE to the feature data to smooth and estimate the density
num_points = 257  # setting number of points for KDE
var_names = [f'feature_{i}' for i in range(X.shape[1])]  # assigning names to each feature

# calculating the pdf for each feature in the data, the density estimation for each feature is stored in kde_result
kde_result = {}
for i in range(X.shape[1]):
    kde_result[f'feature_{i}'] = fastkde.pdf(X[:, i], var_names=[var_names[i]], num_points=num_points)

# using ADASYN to generate synthetic data to balance the data
adasyn = ADASYN(sampling_strategy='minority', n_neighbors=5)
X_resampled, y_resampled = adasyn.fit_resample(X, y)

# splitting the resampled data using stratified k-fold
skf = StratifiedKFold(n_splits=5, shuffle=True)

# creating the RandomForest model
model = RandomForestClassifier()  # performs better than KNN

# calling the crossvalidate function with the resampled data
cm_list, accuracy, recall = crossvalidate(skf, X_resampled, y_resampled, model)

# printing average scores across all folds
print("Average Accuracy: ", np.mean(accuracy))
print("Average Recall: ", np.mean(recall))
print("Confusion Matrix (average across folds):\n", np.mean(cm_list, axis=0))


Average Accuracy:  0.9808586369029308
Average Recall:  0.9614056424597661
Confusion Matrix (average across folds):
 [[11470.4     0. ]
 [  435.6 10851. ]]


### Normalizing Flows

In [5]:
! pip install nflows

Collecting nflows
  Downloading nflows-0.14.tar.gz (45 kB)
Collecting tensorboard
  Downloading tensorboard-2.18.0-py3-none-any.whl (5.5 MB)
Collecting grpcio>=1.48.2
  Downloading grpcio-1.67.1-cp39-cp39-win_amd64.whl (4.4 MB)
Collecting absl-py>=0.4
  Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0
  Downloading tensorboard_data_server-0.7.2-py3-none-any.whl (2.4 kB)
Collecting protobuf!=4.24.0,>=3.19.6
  Downloading protobuf-5.28.3-cp39-cp39-win_amd64.whl (431 kB)
Building wheels for collected packages: nflows
  Building wheel for nflows (setup.py): started
  Building wheel for nflows (setup.py): finished with status 'done'
  Created wheel for nflows: filename=nflows-0.14-py3-none-any.whl size=53670 sha256=d393ae74ceb06b9755084afd78db497a74ead20727776c61fa3168ccd38cf608
  Stored in directory: c:\users\romer\appdata\local\pip\cache\wheels\3b\88\52\cbd4ed0597b48916de3de19b28d7297c72595f56085068c772
Successfully built nflows
Installi

In [None]:
import torch
import torch.distributions as dist
from nflows import transforms, flows

def generate_synthetic_minority_samples(minority_class_data, majority_class_data, imbalance_ratio=1.0, num_epochs=100, lr=1e-3):
    """
    Generates synthetic samples for a minority class using a simple normalizing flow.
    
    Parameters:
    - minority_class_data (Tensor): Tensor containing data for the minority class.
    - majority_class_data (Tensor): Tensor containing data for the majority class.
    - imbalance_ratio (float): Desired ratio of minority to majority class after resampling.
    - num_epochs (int): Number of training epochs for the normalizing flow.
    - lr (float): Learning rate for training the normalizing flow.

    Returns:
    - balanced_data (Tensor): Combined tensor of original and synthetic data to address class imbalance.
    """

    # Get dimensionality of the data (assuming minority_class_data is 2D tensor [samples, features])
    dim = minority_class_data.shape[1]
    
    # 1. Define a simple normalizing flow model (using MaskedAffineAutoregressiveTransform)
    transform = transforms.CompositeTransform([
        transforms.MaskedAffineAutoregressiveTransform(features=dim, hidden_features=dim * 2)
    ])
    base_distribution = dist.Normal(torch.zeros(dim), torch.ones(dim))
    flow_model = flows.Flow(transform, base_distribution)

    # 2. Train the flow model on the minority class data
    optimizer = torch.optim.Adam(flow_model.parameters(), lr=lr)
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        # Calculate log probability (negative log likelihood)
        loss = -flow_model.log_prob(minority_class_data).mean()  # No context, just the data
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item()}")

    # 3. Generate synthetic data samples
    num_samples = int(len(majority_class_data) * imbalance_ratio) - len(minority_class_data)
    synthetic_samples = flow_model.sample(num_samples)

    # 4. Combine the original majority class data with the synthetic minority class data
    balanced_data = torch.cat([majority_class_data, minority_class_data, synthetic_samples], dim=0)
    return balanced_data

# Example usage
if __name__ == "__main__":
    # Generate dummy minority and majority class data
    minority_class_data = torch.randn(50, 2)  # 50 samples, 2 features
    majority_class_data = torch.randn(150, 2)  # 150 samples, 2 features

    # Generate synthetic data to balance the dataset
    balanced_data = generate_synthetic_minority_samples(minority_class_data, majority_class_data, imbalance_ratio=2.0)

    print(f"Original majority class data: {majority_class_data.shape}")
    print(f"Original minority class data: {minority_class_data.shape}")
    print(f"Balanced data: {balanced_data.shape}")
x

In [26]:
balanced = generate_synthetic_minority_samples(torch.tensor(train[train['target'] == 1].values, dtype = torch.float32),
                                               torch.tensor(train[train['target'] == 0].values, dtype = torch.float32))

TypeError: log_prob() got an unexpected keyword argument 'context'