In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from sklearn.cluster import KMeans
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Load your dataset
df = pd.read_csv('Creditcard_data.csv')

In [3]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = {'Time', 'Class'}, axis = 1), df['Class'], test_size=0.3, random_state=42)

In [4]:
y_train.value_counts()

0    534
1      6
Name: Class, dtype: int64

In [5]:
y_test.value_counts()

0    229
1      3
Name: Class, dtype: int64

In [7]:
# Define a dictionary of models to use
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'extratreeclassifier': ExtraTreesClassifier()
}

In [8]:
# Define a dictionary of sampling techniques to use
samplers = {
    'Random Over-Sampling': RandomOverSampler(),
    'Random Under-Sampling': RandomUnderSampler(),
    'SMOTE': SMOTE(),
    'ADASYN': ADASYN(),
    'Tomek Links': SMOTETomek(),
    'Stratified Sampling': StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42),
    'Systematic Sampling': None,  # Replace None with the appropriate implementation
    # 'Cluster Sampling': KMeans(n_clusters=2, random_state=42)
}

In [9]:
# Create an empty list to store the results
results = []

In [10]:
# Loop through each model and each sampling technique
for model_name, model in models.items():
    for sampler_name, sampler in samplers.items():
        # Apply the sampling technique to the training data
        if sampler_name == 'Stratified Sampling':
            # Stratified Sampling
            for train_idx, val_idx in sampler.split(X_train, y_train):
                X_resampled, y_resampled = X_train.iloc[train_idx], y_train.iloc[train_idx]
                X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

        elif sampler_name == 'Systematic Sampling':
            # Systematic Sampling
            step = 2
            indices = list(range(0, len(X_train), step))
            X_resampled, y_resampled = X_train.iloc[indices], y_train.iloc[indices]
        # elif sampler_name == 'Cluster Sampling':
        #     # Cluster Sampling
        #     kmeans = sampler.fit(X_train)
        #     labels = kmeans.predict(X_train)
        #     cluster1_indices = [i for i, label in enumerate(labels) if label == 0]
        #     cluster2_indices = [i for i, label in enumerate(labels) if label == 1]
        #     if len(cluster1_indices) > len(cluster2_indices):
        #         indices_to_remove = cluster1_indices[len(cluster2_indices):]
        #         X_resampled = X_train.drop(indices_to_remove)
        #         y_resampled = y_train.drop(indices_to_remove)
        #     else:
        #         indices_to_remove = cluster2_indices[len(cluster1_indices):]
        #         X_resampled = X_train.drop(indices_to_remove)
        #         y_resampled = y_train.drop(indices_to_remove)
        else:
            # All other resampling techniques
            X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
        # Fit the model to the resampled data
        model.fit(X_resampled, y_resampled)
      
        # Make predictions on the test data
        y_pred = model.predict(X_test)
        
        # Compute the accuracy score of the model on the test data
        accuracy = accuracy_score(y_test, y_pred)
        # Append the results to the results list
        results.append({'Model': model_name, 'Sampler': sampler_name, 'Accuracy': accuracy})

In [11]:
# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

In [12]:
results_df

Unnamed: 0,Model,Sampler,Accuracy
0,Logistic Regression,Random Over-Sampling,0.939655
1,Logistic Regression,Random Under-Sampling,0.758621
2,Logistic Regression,SMOTE,0.931034
3,Logistic Regression,ADASYN,0.931034
4,Logistic Regression,Tomek Links,0.926724
5,Logistic Regression,Stratified Sampling,0.982759
6,Logistic Regression,Systematic Sampling,0.982759
7,Random Forest,Random Over-Sampling,0.987069
8,Random Forest,Random Under-Sampling,0.577586
9,Random Forest,SMOTE,0.987069
