In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
url="https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
df=pd.read_csv(url)

In [3]:
print("Dataset:")
print(df.head())

Dataset:
   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27  

In [5]:
print("Class Distribution:")
print(df['Class'].value_counts())

Class Distribution:
Class
0    763
1      9
Name: count, dtype: int64


In [6]:
df_class_0=df[df['Class']==0]
df_class_1=df[df['Class']==1]
df_class_0_sampled=df_class_0.sample(n=len(df_class_1),random_state=42)
df_balanced=pd.concat([df_class_0_sampled,df_class_1])
df_balanced=df_balanced.sample(frac=1,random_state=42)
print("Balanced class distribution:")
print(df_balanced['Class'].value_counts())

Balanced class distribution:
Class
0    9
1    9
Name: count, dtype: int64


In [7]:
X=df_balanced.drop('Class',axis=1)
y=df_balanced['Class']

In [8]:
models={
"M1_Logistic":LogisticRegression(max_iter=1000),
"M2_DecisionTree":DecisionTreeClassifier(),
"M3_RandomForest":RandomForestClassifier(),
"M4_KNN":KNeighborsClassifier(),
"M5_NaiveBayes":GaussianNB()
}

In [9]:
def evaluate_models(X,y):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
    result={}
    for name,model in models.items():
        model.fit(X_train,y_train)
        pred=model.predict(X_test)
        result[name]=round(accuracy_score(y_test,pred)*100,2)
    return result

S1:Simple Random

In [12]:
df_random=df_balanced.sample(frac=0.5,random_state=42)
X_r=df_random.drop('Class',axis=1)
y_r=df_random['Class']

S2: Bootstrap

In [13]:
df_bootstrap=df_balanced.sample(n=len(df_balanced),replace=True,random_state=42)
X_boot=df_bootstrap.drop('Class',axis=1)
y_boot=df_bootstrap['Class']

S3:Stratified

In [14]:
X_strat,_,y_strat,_=train_test_split(X,y,train_size=0.5,stratify=y,random_state=42)

S4: Cluster

In [15]:
df_balanced['Cluster']=df_balanced['Amount'].apply(lambda x:0 if x<df_balanced['Amount'].median() else 1)
df_cluster=df_balanced[df_balanced['Cluster']==1].drop('Cluster',axis=1)
X_c=df_cluster.drop('Class',axis=1)
y_c=df_cluster['Class']

S5:Cross Validation

In [16]:
cv_results={}
for name,model in models.items():
    scores=cross_val_score(model,X,y,cv=5,scoring='accuracy')
    cv_results[name]=round(scores.mean()*100,2)

In [20]:
results=pd.DataFrame({
"Simple Random":evaluate_models(X_r,y_r),
"Bootstrap":evaluate_models(X_boot,y_boot),
"Stratified":evaluate_models(X_strat,y_strat),
"Cluster":evaluate_models(X_c,y_c),
"Cross Validation":cv_results
})

In [21]:
print("Final Accuracy Table in percentage:")
print(results)

Final Accuracy Table in percentage:
                 Simple Random  Bootstrap  Stratified  Cluster  \
M1_Logistic              66.67      83.33        0.00    100.0   
M2_DecisionTree          33.33      83.33       66.67    100.0   
M3_RandomForest          66.67      50.00       66.67    100.0   
M4_KNN                   66.67      83.33       33.33    100.0   
M5_NaiveBayes            66.67      83.33       33.33    100.0   

                 Cross Validation  
M1_Logistic                 43.33  
M2_DecisionTree             68.33  
M3_RandomForest             63.33  
M4_KNN                      50.00  
M5_NaiveBayes               45.00  
