In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, f1_score

In [29]:
# Load your dataset
df = pd.read_csv('https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv')

In [30]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = {'Time', 'Class'}, axis = 1), df['Class'], test_size=0.3, random_state=50)

In [31]:
y_train.value_counts()

0    534
1      6
Name: Class, dtype: int64

In [32]:
y_test.value_counts()

0    229
1      3
Name: Class, dtype: int64

In [33]:
# Define a dictionary of models to use
models = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'XGBoost': XGBClassifier()
}

In [34]:
# Define a dictionary of sampling techniques to use
samplers = {
    'Random Over-Sampling': RandomOverSampler(),
    'Random Under-Sampling': RandomUnderSampler(),
    'SMOTE': SMOTE(),
    'Stratified Sampling': StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42),
    'Systematic Sampling': None,  # Replace None with the appropriate implementation
    # 'Cluster Sampling': KMeans(n_clusters=2, random_state=42)
}

In [35]:
# Create an empty list to store the results
results = []

In [36]:
# Loop through each model and each sampling technique
for model_name, model in models.items():
    for sampler_name, sampler in samplers.items():
        # Apply the sampling technique to the training data
        if sampler_name == 'Stratified Sampling':
            # Stratified Sampling
            for train_idx, val_idx in sampler.split(X_train, y_train):
                X_resampled, y_resampled = X_train.iloc[train_idx], y_train.iloc[train_idx]
                X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

        elif sampler_name == 'Systematic Sampling':
            # Systematic Sampling
            step = 4
            indices = list(range(0, len(X_train), step))
            X_resampled, y_resampled = X_train.iloc[indices], y_train.iloc[indices]
       
        else:
            # All other resampling techniques
            X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
        # Fit the model to the resampled data
        model.fit(X_resampled, y_resampled)
      
        # Make predictions on the test data
        y_pred = model.predict(X_test)
        
        # Compute the accuracy score of the model on the test data
        accuracy = accuracy_score(y_test, y_pred)
        # Append the results to the results list
        results.append({'Model': model_name, 'Sampler': sampler_name, 'Accuracy': accuracy})

In [37]:
# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

In [38]:
results_df

Unnamed: 0,Model,Sampler,Accuracy
0,Decision Tree,Random Over-Sampling,0.974138
1,Decision Tree,Random Under-Sampling,0.762931
2,Decision Tree,SMOTE,0.982759
3,Decision Tree,Stratified Sampling,0.961207
4,Decision Tree,Systematic Sampling,0.956897
5,Random Forest,Random Over-Sampling,0.987069
6,Random Forest,Random Under-Sampling,0.646552
7,Random Forest,SMOTE,0.987069
8,Random Forest,Stratified Sampling,0.987069
9,Random Forest,Systematic Sampling,0.987069


In [40]:
# Reshape the DataFrame to have models as rows and samplers as columns
final = pd.pivot_table(results_df, values='Accuracy',index=['Model']  ,columns=['Sampler'])

In [41]:
final

Sampler,Random Over-Sampling,Random Under-Sampling,SMOTE,Stratified Sampling,Systematic Sampling
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Decision Tree,0.974138,0.762931,0.982759,0.961207,0.956897
K-Nearest Neighbors,0.948276,0.491379,0.840517,0.987069,0.987069
Random Forest,0.987069,0.646552,0.987069,0.987069,0.987069
Support Vector Machine,0.551724,0.948276,0.418103,0.987069,0.987069
XGBoost,0.987069,0.396552,0.982759,0.987069,0.987069
