In [27]:
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('data/train.csv', header = 0)

x = train.drop(columns=['target', 'id'])
y = train['target']

In [33]:
def crossvalidate(kf, x, y, model):
    cm_list = []
    accuracy = []
    recall = []
    for train_index, test_index in kf.split(x, y):
        xtrain, xtest = x[train_index], x[test_index]
        ytrain, ytest = y[train_index], y[test_index]
        model.fit(xtrain, ytrain)
        y_pred = model.predict(xtest)
        accuracy.append(accuracy_score(ytest, y_pred))
        recall.append(recall_score(ytest, y_pred, average = 'binary'))
        cm_list.append(confusion_matrix(ytest, y_pred))
    return cm_list, accuracy, recall

### Synthetic Minority Oversampling

In [29]:
oversample = SMOTE()
x_sampled, y_sampled = oversample.fit_resample(x, y)
counter = Counter(y_sampled)

In [None]:
model = KNeighborsClassifier()
kf = StratifiedKFold(n_splits = 5)
scaler = StandardScaler()
x_sampled = scaler.fit_transform(x_sampled)
results, accuracy, recall = crossvalidate(kf, x_sampled, y_sampled, model)

In [21]:
accuracy, recall, results

([0.8382372172093632,
  0.8415936532845124,
  0.84120134257443,
  0.8393705592607122,
  0.8403661726242372],
 [1.0, 1.0, 1.0, 1.0, 1.0],
 [array([[ 7760,  3711],
         [    0, 11470]], dtype=int64),
  array([[ 7837,  3634],
         [    0, 11470]], dtype=int64),
  array([[ 7827,  3643],
         [    0, 11471]], dtype=int64),
  array([[ 7785,  3685],
         [    0, 11471]], dtype=int64),
  array([[ 7808,  3662],
         [    0, 11470]], dtype=int64)])

### ADASYN with FastKDE

### Normalizing Flows

In [5]:
! pip install nflows

Collecting nflows
  Downloading nflows-0.14.tar.gz (45 kB)
Collecting tensorboard
  Downloading tensorboard-2.18.0-py3-none-any.whl (5.5 MB)
Collecting grpcio>=1.48.2
  Downloading grpcio-1.67.1-cp39-cp39-win_amd64.whl (4.4 MB)
Collecting absl-py>=0.4
  Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0
  Downloading tensorboard_data_server-0.7.2-py3-none-any.whl (2.4 kB)
Collecting protobuf!=4.24.0,>=3.19.6
  Downloading protobuf-5.28.3-cp39-cp39-win_amd64.whl (431 kB)
Building wheels for collected packages: nflows
  Building wheel for nflows (setup.py): started
  Building wheel for nflows (setup.py): finished with status 'done'
  Created wheel for nflows: filename=nflows-0.14-py3-none-any.whl size=53670 sha256=d393ae74ceb06b9755084afd78db497a74ead20727776c61fa3168ccd38cf608
  Stored in directory: c:\users\romer\appdata\local\pip\cache\wheels\3b\88\52\cbd4ed0597b48916de3de19b28d7297c72595f56085068c772
Successfully built nflows
Installi

In [None]:
import torch
import torch.distributions as dist
from nflows import transforms, flows

def generate_synthetic_minority_samples(minority_class_data, majority_class_data, imbalance_ratio=1.0, num_epochs=100, lr=1e-3):
    """
    Generates synthetic samples for a minority class using a simple normalizing flow.
    
    Parameters:
    - minority_class_data (Tensor): Tensor containing data for the minority class.
    - majority_class_data (Tensor): Tensor containing data for the majority class.
    - imbalance_ratio (float): Desired ratio of minority to majority class after resampling.
    - num_epochs (int): Number of training epochs for the normalizing flow.
    - lr (float): Learning rate for training the normalizing flow.

    Returns:
    - balanced_data (Tensor): Combined tensor of original and synthetic data to address class imbalance.
    """

    # Get dimensionality of the data (assuming minority_class_data is 2D tensor [samples, features])
    dim = minority_class_data.shape[1]
    
    # 1. Define a simple normalizing flow model (using MaskedAffineAutoregressiveTransform)
    transform = transforms.CompositeTransform([
        transforms.MaskedAffineAutoregressiveTransform(features=dim, hidden_features=dim * 2)
    ])
    base_distribution = dist.Normal(torch.zeros(dim), torch.ones(dim))
    flow_model = flows.Flow(transform, base_distribution)

    # 2. Train the flow model on the minority class data
    optimizer = torch.optim.Adam(flow_model.parameters(), lr=lr)
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        # Calculate log probability (negative log likelihood)
        loss = -flow_model.log_prob(minority_class_data).mean()  # No context, just the data
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item()}")

    # 3. Generate synthetic data samples
    num_samples = int(len(majority_class_data) * imbalance_ratio) - len(minority_class_data)
    synthetic_samples = flow_model.sample(num_samples)

    # 4. Combine the original majority class data with the synthetic minority class data
    balanced_data = torch.cat([majority_class_data, minority_class_data, synthetic_samples], dim=0)
    return balanced_data

# Example usage
if __name__ == "__main__":
    # Generate dummy minority and majority class data
    minority_class_data = torch.randn(50, 2)  # 50 samples, 2 features
    majority_class_data = torch.randn(150, 2)  # 150 samples, 2 features

    # Generate synthetic data to balance the dataset
    balanced_data = generate_synthetic_minority_samples(minority_class_data, majority_class_data, imbalance_ratio=2.0)

    print(f"Original majority class data: {majority_class_data.shape}")
    print(f"Original minority class data: {minority_class_data.shape}")
    print(f"Balanced data: {balanced_data.shape}")
x

In [26]:
balanced = generate_synthetic_minority_samples(torch.tensor(train[train['target'] == 1].values, dtype = torch.float32),
                                               torch.tensor(train[train['target'] == 0].values, dtype = torch.float32))

TypeError: log_prob() got an unexpected keyword argument 'context'