In [1]:
import pandas as pd
from sklearn.preprocessing import normalize
import numpy as np


## DATA

In [2]:
df = pd.read_csv('/content/credit.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
from imblearn.combine import SMOTEENN
import pandas as pd

In [4]:
# Normalize the 'Amount' feature
amount = normalize([df['Amount']])[0]
df['Amount'] = amount

# Remove the 'Time' feature and set 'Class' as the target variable
df = df.drop('Time', axis=1)


# CALCULATING FRAUDS IN DATASET

In [5]:
#Calculate the percentage of fraud and non-fraud cases in the dataset
num_no_frauds = df['Class'].value_counts()[0]
num_frauds = df['Class'].value_counts()[1]

percent_no_frauds = round(num_no_frauds / len(df) * 100, 2)
percent_frauds = round(num_frauds / len(df) * 100, 2)

# Print the results
print('No Frauds:', percent_no_frauds, '% of the dataset')
print('Frauds:', percent_frauds, '% of the dataset')

No Frauds: 98.83 % of the dataset
Frauds: 1.17 % of the dataset


In [6]:
# Print the number of classes and their frequencies
class_freq = df['Class'].value_counts()
print(f"Number of classes: {len(class_freq)}")
print("Class frequencies:")
print(class_freq)

Number of classes: 2
Class frequencies:
0    763
1      9
Name: Class, dtype: int64


## RESAMPLING

In [7]:
# Resample using SMOTEENN
resample = SMOTEENN()
Xsmee, ysmee = resample.fit_resample(df.drop('Class', axis=1), df['Class'])
df_smee = pd.DataFrame(np.hstack((Xsmee, ysmee[:, None])), columns=df.columns)
df_smee['Class'] = df_smee['Class'].astype(int)

  df_smee = pd.DataFrame(np.hstack((Xsmee, ysmee[:, None])), columns=df.columns)


In [8]:
# Print the percentage of frauds and non-frauds
print('Percentage of Non-Frauds: {:.2f}%'.format(df_smee['Class'].value_counts(normalize=True)[0]*100))
print('Percentage of Frauds: {:.2f}%'.format(df_smee['Class'].value_counts(normalize=True)[1]*100))


Percentage of Non-Frauds: 48.48%
Percentage of Frauds: 51.52%


In [9]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Separate features and target
X = df_smee.drop('Class', axis=1)
y = df_smee['Class']
# Separate features and target
X = df_smee.drop('Class', axis=1)
y = df_smee['Class']

# Instantiate oversampler and undersampler
oversampler = RandomOverSampler()
undersampler = RandomUnderSampler()

# Resample using both oversampling and undersampling
X_over, y_over = oversampler.fit_resample(X, y)
X_resampled, y_resampled = undersampler.fit_resample(X_over, y_over)

# Print the number of samples in each class
print("Number of samples in each class after resampling:")
print(y_resampled.value_counts())

Number of samples in each class after resampling:
0    763
1    763
Name: Class, dtype: int64


In [10]:
new_df = pd.concat([X_resampled, y_resampled], axis=1)
new_df

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.025729,0
1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0.065115,0
2,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.021237,0
3,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,0.012036,0
4,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,-0.371407,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.081080,0.000631,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521,0.654212,0.459679,0.380802,0.397261,0.243939,-0.395308,0.197239,-0.032036,-0.055621,-0.212935,...,-0.206286,-0.597073,0.139562,-0.526847,-0.599474,0.084719,0.107224,0.132475,0.000196,1
1522,0.066989,0.232210,0.653516,0.157479,0.141278,-0.761332,0.159125,-0.066536,0.030515,-0.196818,...,-0.244715,-0.578451,-0.026668,0.110491,-0.002706,0.152490,-0.151046,-0.133579,0.000347,1
1523,0.785682,0.430019,0.361827,0.463486,0.102925,-0.545293,0.170004,-0.066869,-0.027681,-0.227012,...,-0.224507,-0.653654,0.137190,-0.337122,-0.421534,0.087042,0.077967,0.110041,0.000346,1
1524,-0.019976,0.231695,0.682233,0.116121,0.187355,-0.736707,0.168816,-0.060655,0.018297,-0.192367,...,-0.241096,-0.552598,-0.038447,0.095544,-0.016040,0.156620,-0.159899,-0.145812,0.000201,1


# Simple random sampling

In [11]:
# Simple random sampling
n = int((1.96*1.96 * 0.5*0.5)/(0.05**2))
sampled_df = new_df.sample(n=n, random_state=42)
print(sampled_df)

            V1        V2        V3        V4        V5        V6        V7  \
1439 -1.453300  1.570100 -1.148676  3.202179 -0.482634 -1.340155 -1.904212   
76   -0.671709  0.594503  0.216416 -0.867311  2.966908  3.632495  0.563295   
1010 -1.883300  0.531879  0.481458  0.517591  0.556633 -0.578063 -0.438928   
660  -0.866655  1.029621 -0.410810  0.159946  2.372236  3.801113 -0.088801   
1132  0.102582  0.546457  0.448330  0.129103  0.793947  0.219086  0.291356   
...        ...       ...       ...       ...       ...       ...       ...   
1481 -0.107222  0.971613 -0.425941  1.955146 -0.420647 -1.204767 -0.911922   
756  -0.284446  1.069099  0.930511 -0.129010  0.387819 -0.513675  0.664176   
1074 -0.024386  0.378362  1.145573  0.393981  0.412552 -0.971069  0.551053   
867   1.252860  0.350636  0.302747  0.692110 -0.369394 -1.067959  0.087145   
485   1.248022  0.367821  0.058192  0.933295  0.443929  0.226403  0.115654   

            V8        V9       V10  ...       V21       V22    

In [12]:
print('Percentage of No Frauds: {}%'.format(round(sampled_df.Class.value_counts()[0]/len(sampled_df) * 100.0,2)))
print('Percentage of Frauds: {}%'.format(round(sampled_df.Class.value_counts()[1]/len(sampled_df) * 100.0,2)))

Percentage of No Frauds: 46.09%
Percentage of Frauds: 53.91%


# Systematic sampling

In [13]:
# Systematic sampling
interval = 2
systematic_df = new_df.iloc[::interval]
print(systematic_df)

            V1        V2        V3        V4        V5        V6        V7  \
0    -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
2    -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4    -0.425966  0.960523  1.141109 -0.168252  0.420987 -0.029728  0.476201   
6    -0.644269  1.417964  1.074380 -0.492199  0.948934  0.428118  1.120631   
8    -0.338262  1.119593  1.044367 -0.222187  0.499361 -0.246761  0.651583   
...        ...       ...       ...       ...       ...       ...       ...   
1516  0.418670  0.492381  0.408240  0.283881  0.473975 -0.134690  0.235098   
1518 -1.833034 -0.865950  1.648558 -0.008902  1.476326  0.027652 -0.588693   
1520  1.200905  0.279694  0.185773  0.481465  0.002622 -0.217689 -0.055027   
1522  0.066989  0.232210  0.653516  0.157479  0.141278 -0.761332  0.159125   
1524 -0.019976  0.231695  0.682233  0.116121  0.187355 -0.736707  0.168816   

            V8        V9       V10  ...       V21       V22    

In [14]:
print('Percentage of No Frauds: {}%'.format(round(systematic_df.Class.value_counts()[0]/len(systematic_df) * 100.0,2)))
print('Percentage of Frauds: {}%'.format(round(systematic_df.Class.value_counts()[1]/len(systematic_df) * 100.0,2)))

Percentage of No Frauds: 50.07%
Percentage of Frauds: 49.93%


# Stratified sampling

In [15]:
# Stratified sampling
n = int((1.96*1.96 * 0.5*0.5)/((0.05)**2))
strata = new_df.groupby('Class')
stratified_df = strata.apply(lambda x: x.sample(n))
print(stratified_df)

                  V1        V2        V3        V4        V5        V6  \
Class                                                                    
0     113  -1.165722  1.485337  0.156418  0.924378  0.131746  0.996965   
      318  -0.216867  0.900896  1.502850  0.812492  0.193952 -0.031488   
      73   -0.549626  0.418949  1.729833  0.203065 -0.187012  0.253878   
      548  -0.421336  0.845373 -0.180053 -1.194077  2.737800  3.293114   
      582   1.040781  0.109569  0.357987  1.118998 -0.105373 -0.056837   
...              ...       ...       ...       ...       ...       ...   
1     1367 -1.289945 -1.880754  1.955028  0.986039  1.927335  0.443648   
      1135  0.994701  0.251793  0.235143  0.373824  0.117918 -0.096451   
      972   1.219374  0.302866  0.225832  0.555051 -0.128276 -0.513461   
      1144 -2.080222 -2.546029  2.361856  1.227774  2.377832  0.467554   
      1340 -1.078457 -0.007520  1.842924  0.316904  1.176084 -0.691357   

                  V7        V8       

In [16]:
print('Percentage of No Frauds: {}%'.format(round(stratified_df.Class.value_counts()[0]/len(stratified_df) * 100.0,2)))
print('Percentage of Frauds: {}%'.format(round(stratified_df.Class.value_counts()[1]/len(stratified_df) * 100.0,2)))

Percentage of No Frauds: 50.0%
Percentage of Frauds: 50.0%


# Cluster sampling

In [17]:
# Cluster sampling
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=10, random_state=42).fit(new_df.drop('Class', axis=1))
cluster_assignments = kmeans.labels_

# Select the clusters you want to include in the sample
selected_clusters = [0, 2, 4, 5, 8]

cluster_series = pd.Series(cluster_assignments)

# Create the new DataFrame containing only the rows from the selected clusters
df_cluster_sample = new_df[cluster_series.isin(selected_clusters)]

# Print the resulting DataFrame
print(df_cluster_sample)


            V1        V2        V3        V4        V5        V6        V7  \
0    -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1    -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
7    -0.894286  0.286157 -0.113192 -0.271526  2.669599  3.721818  0.370145   
9     1.449044 -1.176339  0.913860 -1.375667 -1.971383 -0.629152 -1.423236   
10    0.384978  0.616109 -0.874300 -0.094019  2.924584  3.317027  0.470455   
...        ...       ...       ...       ...       ...       ...       ...   
1517 -2.490681 -2.806488  1.819278  1.679798  1.944063 -0.185374 -0.965573   
1518 -1.833034 -0.865950  1.648558 -0.008902  1.476326  0.027652 -0.588693   
1520  1.200905  0.279694  0.185773  0.481465  0.002622 -0.217689 -0.055027   
1522  0.066989  0.232210  0.653516  0.157479  0.141278 -0.761332  0.159125   
1524 -0.019976  0.231695  0.682233  0.116121  0.187355 -0.736707  0.168816   

            V8        V9       V10  ...       V21       V22    

In [18]:
print('Percentage of No Frauds: {}%'.format(round(df_cluster_sample.Class.value_counts()[0]/len(df_cluster_sample) * 100.0,2)))
print('Percentage of Frauds: {}%'.format(round(df_cluster_sample.Class.value_counts()[1]/len(df_cluster_sample) * 100.0,2)))

Percentage of No Frauds: 47.08%
Percentage of Frauds: 52.92%


# **Convenience sampling**

In [19]:
# Convenience sampling
convenience_sample = pd.concat([new_df.head(380), new_df.tail(380)])
print(convenience_sample)

            V1        V2        V3        V4        V5        V6        V7  \
0    -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1    -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
2    -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
3    -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   
4    -0.425966  0.960523  1.141109 -0.168252  0.420987 -0.029728  0.476201   
...        ...       ...       ...       ...       ...       ...       ...   
1521  0.654212  0.459679  0.380802  0.397261  0.243939 -0.395308  0.197239   
1522  0.066989  0.232210  0.653516  0.157479  0.141278 -0.761332  0.159125   
1523  0.785682  0.430019  0.361827  0.463486  0.102925 -0.545293  0.170004   
1524 -0.019976  0.231695  0.682233  0.116121  0.187355 -0.736707  0.168816   
1525 -0.208624  0.507982  0.815037  0.134036  0.863350 -0.073145  0.460450   

            V8        V9       V10  ...       V21       V22    

In [20]:
print('Percentage of No Frauds: {}%'.format(round(convenience_sample.Class.value_counts()[0]/len(convenience_sample) * 100.0,2)))
print('Percentage of Frauds: {}%'.format(round(convenience_sample.Class.value_counts()[1]/len(convenience_sample) * 100.0,2)))

Percentage of No Frauds: 50.0%
Percentage of Frauds: 50.0%


In [21]:
!pip install pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [22]:
 !pip uninstall scikit-learn
!pip install scikit-learn==0.23.2

Found existing installation: scikit-learn 0.23.2
Uninstalling scikit-learn-0.23.2:
  Would remove:
    /usr/local/lib/python3.8/dist-packages/scikit_learn-0.23.2.dist-info/*
    /usr/local/lib/python3.8/dist-packages/scikit_learn.libs/libgomp-3300acd3.so.1.0.0
    /usr/local/lib/python3.8/dist-packages/sklearn/*
Proceed (Y/n)? y
  Successfully uninstalled scikit-learn-0.23.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn==0.23.2
  Using cached scikit_learn-0.23.2-cp38-cp38-manylinux1_x86_64.whl (6.8 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-0.23.2


## **COMPARING MODELS ON EVERY SAMPLING TECHNIQUES**

In [23]:
# Use Pycaret to compare models on each sampling technique
from pycaret.classification import *
 

sampling_techniques = [sampled_df, systematic_df, stratified_df, df_cluster_sample, convenience_sample]

for sample in sampling_techniques:
    setup(data=sample, target='Class', silent=True)
    cm = compare_models() 

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9981,1.0,1.0,0.9963,0.9981,0.9962,0.9963,0.187
qda,Quadratic Discriminant Analysis,0.9962,1.0,0.9925,1.0,0.9962,0.9925,0.9926,0.024
rf,Random Forest Classifier,0.9943,0.9999,0.9963,0.9927,0.9944,0.9887,0.9889,0.46
lightgbm,Light Gradient Boosting Machine,0.9887,0.9999,0.9963,0.9821,0.9889,0.9774,0.978,0.139
gbc,Gradient Boosting Classifier,0.9868,0.9997,0.9926,0.9819,0.9869,0.9737,0.9743,0.337
ada,Ada Boost Classifier,0.9831,0.9989,0.9962,0.9715,0.9835,0.9661,0.9668,0.165
dt,Decision Tree Classifier,0.9567,0.9568,0.9815,0.9359,0.9578,0.9134,0.9151,0.031
lr,Logistic Regression,0.9473,0.9663,1.0,0.9075,0.9505,0.8947,0.9016,0.037
svm,SVM - Linear Kernel,0.9437,0.0,0.9704,0.926,0.9438,0.8876,0.8954,0.021
knn,K Neighbors Classifier,0.9228,0.9833,1.0,0.8691,0.9289,0.846,0.858,0.046


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=3676, verbose=0,
                     warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


In [None]:
TREE CLASSIFIER ON CLUSTER SAMPLING IS GIVING BETTER RESULT AMONG THESE MODELS