In [1]:
!pip install -q imbalanced-learn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from imblearn.over_sampling import RandomOverSampler,SMOTE
from imblearn.under_sampling import RandomUnderSampler,TomekLinks
from imblearn.combine import SMOTETomek

import warnings
warnings.filterwarnings('ignore')

# Load the dataset
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
df = pd.read_csv(url)

print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nClass Distribution:")
print(df['Class'].value_counts())
print("\nClass Distribution Percentage:")
print(df['Class'].value_counts(normalize=True) * 100)

Dataset Shape: (772, 31)

First few rows:
   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.2

In [2]:
X = df.drop('Class', axis=1)
y = df['Class']
print("\nImbalance Ratio:", y.value_counts()[0] / y.value_counts()[1])


Imbalance Ratio: 84.77777777777777


In [4]:
# Balance the dataset using SMOTE
smote = SMOTE(random_state=42)
X_balanced,y_balanced = smote.fit_resample(X, y)

print("\nBalanced class distribution:")
print(y_balanced.value_counts())
print("\nBalanced dataset shape:", X_balanced.shape)


Balanced class distribution:
Class
0    763
1    763
Name: count, dtype: int64

Balanced dataset shape: (1526, 30)


In [5]:
# Calculate sample size using formula
# Sample size formula: n = (Z^2 * p * (1-p)) / E^2
# Where: Z = 1.96 (for 95% confidence), p = 0.5, E = 0.05 (margin of error)

import math
Z = 1.96  # 95% confidence level
p = 0.5   # proportion (maximum variability)
E = 0.05  # margin of error

sample_size = int((Z**2 * p * (1-p)) / E**2)
print(f"Calculated sample size: {sample_size}")

# Adjust if sample size is larger than our dataset
if sample_size > len(X_balanced):
    sample_size = int(len(X_balanced) * 0.3)  # Use 30% of balanced data

print(f"Sample size to be used: {sample_size}")

Calculated sample size: 384
Sample size to be used: 384


In [8]:
# Create 5 different samples using different sampling techniques

# Sampling Technique 1: Simple Random Sampling
def simple_random_sampling(X, y, n):
    indices = np.random.choice(len(X), size=n, replace=False)
    return X.iloc[indices], y.iloc[indices]

# Sampling Technique 2: Systematic Sampling
def systematic_sampling(X, y, n):
    k = len(X) // n
    start = np.random.randint(0, k)
    indices = np.arange(start, len(X), k)[:n]
    return X.iloc[indices], y.iloc[indices]

# Sampling Technique 3: Stratified Sampling
def stratified_sampling(X, y, n):
    from sklearn.model_selection import train_test_split
    X_sample, _, y_sample, _ = train_test_split(X, y, train_size=n, stratify=y, random_state=42)
    return X_sample, y_sample

# Sampling Technique 4: Cluster Sampling
def cluster_sampling(X, y, n):
    from sklearn.cluster import KMeans
    n_clusters = 5
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(X)

    samples_per_cluster = n // n_clusters
    indices = []
    for i in range(n_clusters):
        cluster_indices = np.where(clusters == i)[0]
        if len(cluster_indices) > 0:
            selected = np.random.choice(cluster_indices,
                                       size=min(samples_per_cluster, len(cluster_indices)),
                                       replace=False)
            indices.extend(selected)

    indices = indices[:n]
    return X.iloc[indices], y.iloc[indices]

# Sampling Technique 5: Bootstrap Sampling (with replacement)
def bootstrap_sampling(X, y, n):
    indices = np.random.choice(len(X), size=n, replace=True)
    return X.iloc[indices], y.iloc[indices]

# Create 5 samples
print("Creating 5 different samples...\n")

samples = {}
samples['Sample1_SimpleRandom'] = simple_random_sampling(X_balanced, y_balanced, sample_size)
samples['Sample2_Systematic'] = systematic_sampling(X_balanced, y_balanced, sample_size)
samples['Sample3_Stratified'] = stratified_sampling(X_balanced, y_balanced, sample_size)
samples['Sample4_Cluster'] = cluster_sampling(X_balanced, y_balanced, sample_size)
samples['Sample5_Bootstrap'] = bootstrap_sampling(X_balanced, y_balanced, sample_size)

for name, (X_sample, y_sample) in samples.items():
    print(f"{name}: Shape = {X_sample.shape}, Class distribution:")
    print(pd.Series(y_sample).value_counts())
    print()

Creating 5 different samples...

Sample1_SimpleRandom: Shape = (384, 30), Class distribution:
Class
0    197
1    187
Name: count, dtype: int64

Sample2_Systematic: Shape = (384, 30), Class distribution:
Class
0    255
1    129
Name: count, dtype: int64

Sample3_Stratified: Shape = (384, 30), Class distribution:
Class
0    192
1    192
Name: count, dtype: int64

Sample4_Cluster: Shape = (303, 30), Class distribution:
Class
1    153
0    150
Name: count, dtype: int64

Sample5_Bootstrap: Shape = (384, 30), Class distribution:
Class
1    195
0    189
Name: count, dtype: int64



In [9]:
# Define 5 ML Models

models = {
    'M1_LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'M2_DecisionTree': DecisionTreeClassifier(random_state=42),
    'M3_RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'M4_SVM': SVC(kernel='rbf', random_state=42),
    'M5_GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

print("Models defined:")
for model_name in models.keys():
    print(f"  - {model_name}")

Models defined:
  - M1_LogisticRegression
  - M2_DecisionTree
  - M3_RandomForest
  - M4_SVM
  - M5_GradientBoosting


In [11]:
# Train and evaluate all models on all samples

results = {}
print("Training and evaluating models...\n")
for sample_name, (X_sample, y_sample) in samples.items():
    print(f"\nProcessing {sample_name}...")
    results[sample_name] = {}
    # Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, random_state=42, stratify=y_sample)
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred) * 100
        results[sample_name][model_name] = accuracy

        print(f"  {model_name}: {accuracy:.2f}%")

print("\nTraining complete!")

Training and evaluating models...


Processing Sample1_SimpleRandom...
  M1_LogisticRegression: 91.38%
  M2_DecisionTree: 92.24%
  M3_RandomForest: 99.14%
  M4_SVM: 55.17%
  M5_GradientBoosting: 96.55%

Processing Sample2_Systematic...
  M1_LogisticRegression: 88.79%
  M2_DecisionTree: 90.52%
  M3_RandomForest: 99.14%
  M4_SVM: 67.24%
  M5_GradientBoosting: 98.28%

Processing Sample3_Stratified...
  M1_LogisticRegression: 91.38%
  M2_DecisionTree: 91.38%
  M3_RandomForest: 98.28%
  M4_SVM: 68.10%
  M5_GradientBoosting: 96.55%

Processing Sample4_Cluster...
  M1_LogisticRegression: 92.31%
  M2_DecisionTree: 92.31%
  M3_RandomForest: 98.90%
  M4_SVM: 67.03%
  M5_GradientBoosting: 95.60%

Processing Sample5_Bootstrap...
  M1_LogisticRegression: 92.24%
  M2_DecisionTree: 93.10%
  M3_RandomForest: 99.14%
  M4_SVM: 67.24%
  M5_GradientBoosting: 97.41%

Training complete!


In [12]:
# Create results table

results_df = pd.DataFrame(results).T
results_df.columns = ['M1', 'M2', 'M3', 'M4', 'M5']
results_df.index = ['Sampling1', 'Sampling2', 'Sampling3', 'Sampling4', 'Sampling5']
results_df = results_df.round(2)

print("\n" + "="*60)
print("RESULTS TABLE: Accuracy (%) of Models on Different Samples")
print("="*60)
print(results_df)
print("="*60)


RESULTS TABLE: Accuracy (%) of Models on Different Samples
              M1     M2     M3     M4     M5
Sampling1  91.38  92.24  99.14  55.17  96.55
Sampling2  88.79  90.52  99.14  67.24  98.28
Sampling3  91.38  91.38  98.28  68.10  96.55
Sampling4  92.31  92.31  98.90  67.03  95.60
Sampling5  92.24  93.10  99.14  67.24  97.41


In [13]:
# Find best sampling technique for each model

print("\n" + "="*60)
print("BEST SAMPLING TECHNIQUE FOR EACH MODEL")
print("="*60)

for model in results_df.columns:
    best_sampling = results_df[model].idxmax()
    best_accuracy = results_df[model].max()
    print(f"{model}: {best_sampling} with accuracy = {best_accuracy:.2f}%")

print("="*60)


BEST SAMPLING TECHNIQUE FOR EACH MODEL
M1: Sampling4 with accuracy = 92.31%
M2: Sampling5 with accuracy = 93.10%
M3: Sampling1 with accuracy = 99.14%
M4: Sampling3 with accuracy = 68.10%
M5: Sampling2 with accuracy = 98.28%


In [14]:
# Find best model for each sampling technique

print("\n" + "="*60)
print("BEST MODEL FOR EACH SAMPLING TECHNIQUE")
print("="*60)

for sampling in results_df.index:
    best_model = results_df.loc[sampling].idxmax()
    best_accuracy = results_df.loc[sampling].max()
    print(f"{sampling}: {best_model} with accuracy = {best_accuracy:.2f}%")

print("="*60)


BEST MODEL FOR EACH SAMPLING TECHNIQUE
Sampling1: M3 with accuracy = 99.14%
Sampling2: M3 with accuracy = 99.14%
Sampling3: M3 with accuracy = 98.28%
Sampling4: M3 with accuracy = 98.90%
Sampling5: M3 with accuracy = 99.14%


In [20]:
# Overall best combination

best_overall = results_df.max().max()
best_model_overall = results_df.max().idxmax()
best_sampling_overall = results_df[best_model_overall].idxmax()

print("\n" + "="*60)
print("OVERALL BEST COMBINATION")
print("="*60)
print(f"Best Model: {best_model_overall}")
print(f"Best Sampling: {best_sampling_overall}")
print(f"Accuracy: {best_overall:.2f}%")
print("="*60)


OVERALL BEST COMBINATION
Best Model: M3
Best Sampling: Sampling1
Accuracy: 99.14%


In [23]:
# Save results table
results_df.to_csv('sampling_results.csv')
print("\nResults saved to 'sampling_results.csv'")


Results saved to 'sampling_results.csv'
