In [10]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [11]:
# Synthetic binary-classification data with a deliberate class imbalance:
# 5000 samples, 10 features, class weights of 95% / 5%.
X, y = make_classification(
    n_samples=5000, n_features=10, n_classes=2,
    weights=[0.95, 0.05],
    random_state=42,  # fixed seed so the generated data is reproducible
)

# Hold out 30% of the rows as a test set; the resampling cells below operate
# on the training split only.
# NOTE(review): the split is not stratified. Consider passing stratify=y so the
# minority share is the same in both splits (this would change the class counts
# printed by the later cells).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

In [12]:
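# Install once if missing (the PyPI package is `imbalanced-learn`; it is imported as `imblearn`):
# %pip install imbalanced-learn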

# 1. Oversampling – Using SMOTE

In [14]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE. Only the training split is resampled, so the
# held-out test data is left untouched.
sm = SMOTE(random_state=42)  # fixed seed for a reproducible resample
X_res, y_res = sm.fit_resample(X=X_train, y=y_train)

# Per-class sample counts before and after resampling.
print("Before SMOTE:", np.bincount(y_train))
print("After SMOTE:", np.bincount(y_res))

Before SMOTE: [3324  176]
After SMOTE: [3324 3324]


# 2. Undersampling – Using RandomUnderSampler

In [15]:
from imblearn.under_sampling import RandomUnderSampler

# Undersample with RandomUnderSampler. Only the training split is resampled,
# so the held-out test data is left untouched.
# Note: this overwrites the X_res / y_res produced by the previous cell.
rus = RandomUnderSampler(random_state=42)  # fixed seed for a reproducible resample
X_res, y_res = rus.fit_resample(X=X_train, y=y_train)

# Per-class sample counts before and after resampling.
print("Before undersampling:", np.bincount(y_train))
print("After undersampling:", np.bincount(y_res))

Before undersampling: [3324  176]
After undersampling: [176 176]


# 3. Combination – SMOTE + Tomek Links

In [16]:
from imblearn.combine import SMOTETomek

# Combined resampling with SMOTETomek (SMOTE + Tomek links). Only the training
# split is resampled, so the held-out test data is left untouched.
# Note: this overwrites the X_res / y_res produced by the previous cell.
smt = SMOTETomek(random_state=42)  # fixed seed for a reproducible resample
X_res, y_res = smt.fit_resample(X=X_train, y=y_train)

# Per-class sample counts before and after resampling.
print("Before:", np.bincount(y_train))
print("After:", np.bincount(y_res))

Before: [3324  176]
After: [3319 3319]
