In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Load your dataset
df = pd.read_csv('https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv')

In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = {'Time', 'Class'}, axis = 1), df['Class'], test_size=0.3, random_state=50)

In [5]:
y_train.value_counts()

0    534
1      6
Name: Class, dtype: int64

In [6]:
y_test.value_counts()

0    229
1      3
Name: Class, dtype: int64

In [7]:
# Define a dictionary of models to use
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'XGBoost': XGBClassifier()
}

In [8]:
# Define a dictionary of sampling techniques to use
samplers = {
    'Random Over-Sampling': RandomOverSampler(),
    'Random Under-Sampling': RandomUnderSampler(),
    'SMOTE': SMOTE(),
    'Stratified Sampling': StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42),
    'Systematic Sampling': None,  # Replace None with the appropriate implementation
    # 'Cluster Sampling': KMeans(n_clusters=2, random_state=42)
}

In [9]:
# Create an empty list to store the results
results = []

In [20]:
# Loop through each model and each sampling technique
for model_name, model in models.items():
    for sampler_name, sampler in samplers.items():
        # Apply the sampling technique to the training data
        if sampler_name == 'Stratified Sampling':
            # Stratified Sampling
            for train_idx, val_idx in sampler.split(X_train, y_train):
                X_resampled, y_resampled = X_train.iloc[train_idx], y_train.iloc[train_idx]
                X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

        elif sampler_name == 'Systematic Sampling':
            # Systematic Sampling
            step = 4
            indices = list(range(0, len(X_train), step))
            X_resampled, y_resampled = X_train.iloc[indices], y_train.iloc[indices]
       
        else:
            # All other resampling techniques
            X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
        # Fit the model to the resampled data
        model.fit(X_resampled, y_resampled)
      
        # Make predictions on the test data
        y_pred = model.predict(X_test)
        
        # Compute the accuracy score of the model on the test data
        accuracy = accuracy_score(y_test, y_pred)
        # Append the results to the results list
        results.append({'Model': model_name, 'Sampler': sampler_name, 'Accuracy': accuracy})

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [21]:
# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

In [22]:
results_df

Unnamed: 0,Model,Sampler,Accuracy
0,Logistic Regression,Random Over-Sampling,0.870690
1,Logistic Regression,Random Under-Sampling,0.474138
2,Logistic Regression,SMOTE,0.875000
3,Logistic Regression,Stratified Sampling,0.987069
4,Logistic Regression,Systematic Sampling,0.987069
...,...,...,...
70,XGBoost,Random Over-Sampling,0.987069
71,XGBoost,Random Under-Sampling,0.448276
72,XGBoost,SMOTE,0.982759
73,XGBoost,Stratified Sampling,0.987069


In [26]:
# Reshape the DataFrame to have models as rows and samplers as columns
table = pd.pivot_table(results_df, values='Accuracy',index=['Model']  ,columns=['Sampler'])

In [27]:
table

Sampler,Random Over-Sampling,Random Under-Sampling,SMOTE,Stratified Sampling,Systematic Sampling
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
K-Nearest Neighbors,0.948276,0.522989,0.803161,0.987069,0.987069
Logistic Regression,0.875,0.538793,0.875,0.987069,0.979885
Random Forest,0.987069,0.58046,0.987069,0.987069,0.987069
Support Vector Machine,0.547414,0.416667,0.428161,0.987069,0.987069
XGBoost,0.987069,0.488506,0.982759,0.987069,0.987069
