##LOADING DATASET AND DATA CLEANING##

In [1]:
## 1) Downloading and viewing the dataset

import pandas as pd

# Load the dataset from the provided URL
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
data = pd.read_csv(url)

# Display the first few rows of the dataset
print(data.head())

# Check for missing values
print(data.isnull().sum())

# Check class distribution
print(data['Class'].value_counts())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

##CONVERTING IMBALANCED DATASET TO BALANCED DATASET USING SMOTE##

In [3]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Splitting data into features and target variable
X = data.drop(['Class'], axis=1)
y = data['Class']

# Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Applying SMOTE to balance the class distribution in the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Displaying the class distribution before and after applying SMOTE
print("Original Class Distribution:\n", y.value_counts())
print("Resampled Class Distribution:\n", pd.Series(y_train_resampled).value_counts())

Original Class Distribution:
 Class
0    763
1      9
Name: count, dtype: int64
Resampled Class Distribution:
 Class
1    534
0    534
Name: count, dtype: int64


##CREATING 5 SAMPLES, ASSUMING SAMPLE SIZES: 100, 200, 300, 400, 500

In [30]:
##Creating 5 samples

# Assuming desired sample sizes for different sampling techniques
sample_sizes = [100, 200, 300, 400, 500]

# Creating samples from the resampled dataset
samples = [X_train_resampled[:size] for size in sample_sizes]
samples_labels = [y_train_resampled[:size] for size in sample_sizes]

# Displaying sample sizes
for i, sample in enumerate(samples):
    print(f"Sample {i+1} Size: {len(sample)}")


Sample 1 Size: 100
Sample 2 Size: 200
Sample 3 Size: 300
Sample 4 Size: 400
Sample 5 Size: 500


##INITIALIZING DIFFERENT MODELS##

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.cluster import KMeans
from sklearn.utils import resample
import pandas as pd
import numpy as np

# Defining models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC()
}


In [33]:
X = data.drop('Class', axis=1)
y = data['Class']

# Splitting data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


##SAMPLING TECHNIQUES##

In [42]:
# SMOTE Resampling
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Under-sampling (RandomUnderSampler)
under_sampler = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)

# Cluster Sampling (using KMeans to generate clusters, then sampling from them)
kmeans = KMeans(n_clusters=5, random_state=42)
X_train_clustered = X_train.copy()
X_train_clustered['Cluster'] = kmeans.fit_predict(X_train)

# Random sampling (Randomly sampling from the training set)
X_train_random, y_train_random = resample(X_train, y_train, n_samples=len(y_train), random_state=42)



##APPLYING SAMPLING TECHNIQUE IN DIFFERENT MODELS##

In [43]:
accuracies = {}

def evaluate_model(X_train, y_train, X_test, y_test, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Evaluating each model with each sampling technique
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")

    # Random Sampling
    accuracies[f'{model_name} - Random Sampling'] = evaluate_model(X_train_random, y_train_random, X_test, y_test, model)

    # Stratified Sampling
    accuracies[f'{model_name} - Stratified Sampling'] = evaluate_model(X_train, y_train, X_test, y_test, model)

    # Cluster Sampling (Ensure that the clusters and features are consistent with the training set)
    accuracies[f'{model_name} - Cluster Sampling'] = evaluate_model(X_train_clustered.drop('Cluster', axis=1), y_train, X_test, y_test, model)

    # Under-sampling
    accuracies[f'{model_name} - Under-sampling'] = evaluate_model(X_train_under, y_train_under, X_test, y_test, model)

    # SMOTE
    accuracies[f'{model_name} - SMOTE'] = evaluate_model(X_train_resampled, y_train_resampled, X_test, y_test, model)

# Printing the accuracies for each model and sampling technique
for key, value in accuracies.items():
    print(f"{key}: {value}")

Evaluating Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating Decision Tree...
Evaluating Random Forest...
Evaluating KNN...
Evaluating SVC...
Logistic Regression - Random Sampling: 0.9806451612903225
Logistic Regression - Stratified Sampling: 0.9935483870967742
Logistic Regression - Cluster Sampling: 0.9935483870967742
Logistic Regression - Under-sampling: 0.5741935483870968
Logistic Regression - SMOTE: 0.8774193548387097
Decision Tree - Random Sampling: 0.9741935483870968
Decision Tree - Stratified Sampling: 0.9870967741935484
Decision Tree - Cluster Sampling: 0.9806451612903225
Decision Tree - Under-sampling: 0.5741935483870968
Decision Tree - SMOTE: 0.9548387096774194
Random Forest - Random Sampling: 0.9935483870967742
Random Forest - Stratified Sampling: 0.9935483870967742
Random Forest - Cluster Sampling: 0.9935483870967742
Random Forest - Under-sampling: 0.7548387096774194
Random Forest - SMOTE: 0.9935483870967742
KNN - Random Sampling: 0.9935483870967742
KNN - Stratified Sampling: 0.9935483870967742
KNN - Cluster Sampling: 0.99

##PRODUCING RESULT IN TABLE FORM##

In [44]:
import pandas as pd

# Defining the accuracies for each model and sampling technique
data = {
    "Random_Sampling": [0.980645, 0.974194, 0.993548, 0.993548, 0.993548],
    "Stratified_Sampling": [0.993548, 0.987097, 0.993548, 0.993548, 0.993548],
    "Cluster_Sampling": [0.993548, 0.980645, 0.993548, 0.993548, 0.993548],
    "Under_Sampling": [0.574193, 0.574194, 0.754839, 0.670968, 0.625806],
    "SMOTE": [0.877419, 0.954839, 0.993548, 0.754839, 0.670968]
}

# Defining the model names
models = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'KNN', 'SVC']

# Creating the DataFrame
df = pd.DataFrame(data, index=models)

# Displaying the table
print(df)


                     Random_Sampling1  Stratified_Sampling2  Cluster_Sampling  \
Logistic Regression          0.980645              0.993548          0.993548   
Decision Tree                0.974194              0.987097          0.980645   
Random Forest                0.993548              0.993548          0.993548   
KNN                          0.993548              0.993548          0.993548   
SVC                          0.993548              0.993548          0.993548   

                     Under_Sampling     SMOTE  
Logistic Regression        0.574193  0.877419  
Decision Tree              0.574194  0.954839  
Random Forest              0.754839  0.993548  
KNN                        0.670968  0.754839  
SVC                        0.625806  0.670968  


##HIGHEST ACCURACIES RESULT##

In [45]:
# Finding the sampling technique with the highest accuracy for each model
highest_accuracy = df.idxmax(axis=1)
max_values = df.max(axis=1)

# Combining the results into a summary DataFrame
summary = pd.DataFrame({
    "Model": df.index,
    "Best Sampling Technique": highest_accuracy.values,
    "Highest Accuracy": max_values.values
})

# Displaying the summary
print(summary)


                 Model Best Sampling Technique  Highest Accuracy
0  Logistic Regression    Stratified_Sampling2          0.993548
1        Decision Tree    Stratified_Sampling2          0.987097
2        Random Forest        Random_Sampling1          0.993548
3                  KNN        Random_Sampling1          0.993548
4                  SVC        Random_Sampling1          0.993548
