In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

## Loading Data

In [2]:
data = pd.read_csv('Creditcard_data.csv')
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Understanding the Data

In [3]:
data.shape

(772, 31)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772 entries, 0 to 771
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    772 non-null    int64  
 1   V1      772 non-null    float64
 2   V2      772 non-null    float64
 3   V3      772 non-null    float64
 4   V4      772 non-null    float64
 5   V5      772 non-null    float64
 6   V6      772 non-null    float64
 7   V7      772 non-null    float64
 8   V8      772 non-null    float64
 9   V9      772 non-null    float64
 10  V10     772 non-null    float64
 11  V11     772 non-null    float64
 12  V12     772 non-null    float64
 13  V13     772 non-null    float64
 14  V14     772 non-null    float64
 15  V15     772 non-null    float64
 16  V16     772 non-null    float64
 17  V17     772 non-null    float64
 18  V18     772 non-null    float64
 19  V19     772 non-null    float64
 20  V20     772 non-null    float64
 21  V21     772 non-null    float64
 22  V2

In [5]:
data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [6]:
data['Class'].value_counts()

0    763
1      9
Name: Class, dtype: int64

In [7]:
X = data.drop('Class', axis = 1)
y = data['Class']

## Data balancing by SMOTE

In [8]:
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [9]:
data_new = X.join(y)
data_new

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.620000,0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.690000,1
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.660000,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.500000,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.990000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521,424,0.825484,0.432735,0.359835,0.480520,0.073172,-0.586067,0.168018,-0.082626,-0.035734,...,-0.229151,-0.660525,0.136185,-0.279953,-0.366595,0.087464,0.070497,0.103661,1.180502,1
1522,150,0.856967,0.528972,0.089149,1.061794,-0.388284,-1.110003,-0.205969,-0.024955,-0.277796,...,-0.197807,-0.743699,0.061896,0.337282,0.196820,0.103978,0.008384,0.011462,2.389906,1
1523,428,-1.906013,1.495987,-0.626412,2.878220,-0.085324,-1.272311,-1.534749,0.937281,-1.998838,...,0.384909,0.057815,-0.404389,0.339329,0.146238,-0.007548,0.153852,-0.138137,0.293478,1
1524,492,-1.130091,-0.146836,1.877878,0.362997,1.248091,-0.619367,0.315994,0.021391,0.065244,...,0.133425,0.453534,-0.153280,0.151359,0.276140,-0.206194,-0.108248,-0.134297,1.094176,1


In [10]:
data_new['Class'].value_counts()

0    763
1    763
Name: Class, dtype: int64

In [11]:
import math

confidence_level = 1.96 
margin_of_error = 0.05  
n = math.ceil((confidence_level**2 * 0.5 * 0.5) / margin_of_error**2)
print("Sample size:", n)

Sample size: 385


## CREATING SAMPLES

In [12]:
#1. Simple Random Sampling

random_sample = data_new.sample(n = math.ceil(n), random_state = 42 , replace = False)
print("Simple Random Sample: ", random_sample.head())

Simple Random Sample:        Time        V1        V2        V3        V4        V5        V6  \
1439   485 -0.619418  1.199343 -0.700949  2.429655 -0.444234 -1.256283   
76      49 -0.549626  0.418949  1.729833  0.203065 -0.187012  0.253878   
1010   468 -0.836482  0.396184  1.680761  0.204092  0.910246 -0.908106   
660    499  1.255439  0.307729  0.292700  0.699873 -0.428876 -1.088456   
1132   527 -2.030893 -2.514727  2.427062  1.173450  2.429962  0.546022   

            V7        V8        V9  ...       V21       V22       V23  \
1439 -1.289498  0.632036 -1.449705  ...  0.135947 -0.407572 -0.184080   
76    0.500894  0.251256 -0.227985  ...  0.115062  0.418529 -0.065133   
1010  0.845771 -0.158530 -0.134679  ...  0.051501  0.234628 -0.241766   
660   0.043840 -0.167739  0.128854  ... -0.294795 -0.882126  0.136846   
1132 -2.039360  0.763454  0.923053  ...  0.429408  1.173298  0.329184   

           V24       V25       V26       V27       V28     Amount  Class  
1439  0.331071  0.

In [13]:
#2. Stratified Random Sampling

stratify_column = 'Class'
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('Class', axis = 1), 
    data['Class'], 
    test_size = 0.2, 
    random_state = 42, 
    stratify = data[stratify_column]
)
print("Stratified Random Sample - Training Set:", X_train.join(y_train).head())

print("\nStratified Random Sample - Testing Set:", X_test.join(y_test).head())

Stratified Random Sample - Training Set:      Time        V1        V2        V3        V4        V5        V6  \
378   275 -0.363519  0.055464  1.857571 -1.085421 -0.981918 -0.473025   
52     36 -1.169422  1.158314  1.406800  0.860189 -0.103810  0.122035   
674   510  1.163271  0.141760  0.124579  0.958551 -0.159554 -0.461529   
582   434 -0.679293  1.120837  1.319394  1.249827  1.147786 -0.086534   
358   265 -0.293839 -0.044369  1.093146 -1.576473 -0.107492 -0.791217   

           V7        V8        V9  ...       V21       V22       V23  \
378  0.210565  0.022670 -1.641689  ... -0.408214 -0.907776  0.248037   
52   0.264451 -0.108767 -0.181977  ...  0.024498 -0.120153  0.212986   
674  0.090759 -0.023257 -0.125187  ...  0.066320  0.089322 -0.169921   
582  1.001436 -0.039752 -1.374497  ...  0.067521  0.030112 -0.296954   
358  0.291465 -0.093164 -1.406366  ... -0.235571 -0.286207  0.069303   

          V24       V25       V26       V27       V28  Amount  Class  
378  0.492936 -0

In [14]:
#3. Systematic Random Sampling

population_size = data_new.shape[0]
k = int(population_size / math.ceil(n))
start = np.random.randint(0, k)
systematic_sample = data_new.iloc[start::k]
print("Systematic Random Sample:", systematic_sample.head())


Systematic Random Sample:     Time        V1        V2        V3        V4        V5        V6  \
0      0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388   
3      1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203   
6      4  1.229658  0.141004  0.045371  1.202613  0.191881  0.272708   
9      9 -0.338262  1.119593  1.044367 -0.222187  0.499361 -0.246761   
12    10  1.249999 -1.221637  0.383930 -1.234899 -1.485419 -0.753230   

          V7        V8        V9  ...       V21       V22       V23       V24  \
0   0.239599  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928   
3   0.237609  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575   
6  -0.005159  0.081213  0.464960  ... -0.167716 -0.270710 -0.154104 -0.780055   
9   0.651583  0.069539 -0.736727  ... -0.246914 -0.633753 -0.120794 -0.385050   
12 -0.689405 -0.227487 -2.094011  ... -0.231809 -0.483285  0.084668  0.392831   

         V25       V26       V27       V28  Amount  Cl

In [15]:
#4. Cluster Sampling

num_clusters = 5 
data_new['Cluster'] = np.random.randint(1, num_clusters + 1, data_new.shape[0])
selected_clusters = np.random.choice(data_new['Cluster'].unique(), size=math.ceil(num_clusters / 2), replace=False)
cluster_sample = data_new[data_new['Cluster'].isin(selected_clusters)]
cluster_sample = cluster_sample.drop('Cluster', axis=1)
print("Cluster Sample:", cluster_sample.head())

Cluster Sample:    Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
6     4  1.229658  0.141004  0.045371  1.202613  0.191881  0.272708 -0.005159   
8     7 -0.894286  0.286157 -0.113192 -0.271526  2.669599  3.721818  0.370145   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
6  0.081213  0.464960  ... -0.167716 -0.270710 -0.154104 -0.780055  0.750137   
8  0.851084 -0.392048  ... -0.073425 -0.268092 -0.204233  1.011592  0.373205   

        V26     

In [16]:
#5. Boostrap sampling

data_bootstrap_sample = data_new.sample(n = math.ceil(n), replace = True, random_state = 42)
data_bootstrap_sample.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,Class,Cluster
1126,221,-0.495785,0.88533,-0.040083,1.041513,0.50197,-0.148931,-0.379861,0.438288,-0.754688,...,-0.299159,0.003968,-0.962091,-1.046993,0.099854,0.238764,0.141059,0.753766,1,3
1459,283,-0.790778,1.268837,-0.794205,2.588354,-0.457904,-1.274582,-1.418133,0.711578,-1.573603,...,-0.375254,-0.212161,0.3284,0.117635,0.142381,0.139801,-0.06899,1.147332,1,5
860,484,-2.113897,-0.878234,0.984616,2.179264,1.376856,-0.140571,-2.265528,1.007731,-0.397143,...,0.747964,0.020367,-0.428998,-0.123381,0.612592,0.015656,-0.160957,0.95455,1,4
1294,368,-1.94319,1.735284,-1.290929,3.397271,-0.314115,-1.166979,-2.099054,1.197966,-2.360631,...,-0.088707,-0.36989,0.059682,-0.177238,0.161996,0.256598,-0.085509,0.153139,1,2
1130,511,-1.331706,0.226161,1.465208,-0.235303,0.943597,-0.59762,0.575275,-0.008577,-0.05966,...,0.045622,-0.259915,0.075205,0.029498,-0.108161,-0.224305,-0.254146,1.0,1,1


## Model Evaluation

#### 1. Logistic Regression

In [17]:
#1. Simple Random Sampling

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

scaler = StandardScaler()

X_random_sample = random_sample.drop('Class', axis = 1)
y_random_sample = random_sample['Class']
X_random_scaled = scaler.fit_transform(X_random_sample)
X_train_random, X_test_random, y_train_random, y_test_random = train_test_split(X_random_scaled, y_random_sample, test_size=0.3, random_state=42)

model = LogisticRegression()
model.fit(X_train_random, y_train_random)
y_pred = model.predict(X_test_random)
random_accuracy = accuracy_score(y_test_random, y_pred)
print("Accuracy for Simple Random Sampling using Logistic Regression:", random_accuracy)

Accuracy for Simple Random Sampling using Logistic Regression: 0.8793103448275862


In [18]:
#2. Stratified Random Sampling

stratify_column = 'Class'
X_stratified_sample, _, y_stratified_sample, _ = train_test_split(
    data_new.drop('Class', axis=1),
    data_new['Class'],
    test_size=0.3,
    random_state=42,
    stratify=data_new[stratify_column]
)
X_stratified_scaled = scaler.fit_transform(X_stratified_sample)
X_train_stratified, X_test_stratified, y_train_stratified, y_test_stratified = train_test_split(
    X_stratified_scaled, y_stratified_sample, test_size=0.3, random_state=42
)
model_stratified = LogisticRegression(random_state=42)
model_stratified.fit(X_train_stratified, y_train_stratified)
y_pred_stratified = model_stratified.predict(X_test_stratified)
stratified_accuracy = accuracy_score(y_test_stratified, y_pred_stratified)
print("Accuracy for Stratified Random Sampling using Logistic Regression:", stratified_accuracy)

Accuracy for Stratified Random Sampling using Logistic Regression: 0.9158878504672897


In [19]:
#3. Systematic Random Sampling

X_systematic_sample = systematic_sample.drop('Class', axis = 1)
y_systematic_sample = systematic_sample['Class']
X_systematic_scaled = scaler.fit_transform(X_systematic_sample)
X_train_systematic, X_test_systematic, y_train_systematic, y_test_systematic = train_test_split(X_systematic_scaled, y_systematic_sample, test_size=0.3, random_state=42)

model.fit(X_train_systematic, y_train_systematic)
y_pred = model.predict(X_test_systematic)
systematic_accuracy = accuracy_score(y_test_systematic, y_pred)
print("Accuracy for Systematic Random Sampling using Logistic Regression:", systematic_accuracy)

Accuracy for Systematic Random Sampling using Logistic Regression: 0.9084967320261438


In [20]:
#4. Cluster Sampling

X_cluster_sample = cluster_sample.drop('Class', axis = 1)
y_cluster_sample = cluster_sample['Class']
X_cluster_scaled = scaler.fit_transform(X_cluster_sample)
X_train_cluster, X_test_cluster, y_train_cluster, y_test_cluster = train_test_split(X_cluster_scaled, y_cluster_sample, test_size=0.3, random_state=42)

model.fit(X_train_cluster, y_train_cluster)
y_pred = model.predict(X_test_cluster)
cluster_accuracy = accuracy_score(y_test_cluster, y_pred)
print("Accuracy for Cluster Sampling using Logistic Regression :", cluster_accuracy)

Accuracy for Cluster Sampling using Logistic Regression : 0.9184397163120568


In [21]:
#5. Bootstrap Sampling

X_bootstrap_sample = data_bootstrap_sample.drop('Class', axis = 1)
y_bootstrap_sample = data_bootstrap_sample['Class']
X_bootstrap_scaled = scaler.fit_transform(X_bootstrap_sample)
X_train_bootstrap, X_test_bootstrap, y_train_bootstrap, y_test_bootstrap = train_test_split(X_bootstrap_scaled, y_bootstrap_sample, test_size=0.3, random_state=42)

model.fit(X_train_bootstrap, y_train_bootstrap)
y_pred = model.predict(X_test_bootstrap)

bootstrap_accuracy = accuracy_score(y_test_bootstrap, y_pred)
print("Accuracy for Bootstrap sampling using Logistic Regression :", bootstrap_accuracy)

Accuracy for Bootstrap sampling using Logistic Regression : 0.9568965517241379


#### 2. Random Forest

In [22]:
#1. Simple Random Sampling

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=20, random_state=42)
model.fit(X_train_random, y_train_random)
y_pred = model.predict(X_test_random)
random_accuracy = accuracy_score(y_test_random, y_pred)
print("Accuracy of Simple Random Sampling using Random Forest:", random_accuracy)

Accuracy of Simple Random Sampling using Random Forest: 0.9913793103448276


In [23]:
#2. Stratified Random Sampling

model_stratified = RandomForestClassifier(n_estimators=20, random_state=42)
model_stratified.fit(X_train_stratified, y_train_stratified)
y_pred_stratified = model_stratified.predict(X_test_stratified)
stratified_accuracy_rf = accuracy_score(y_test_stratified, y_pred_stratified)
print("Accuracy for Stratified Random Sampling using Random Forest:", stratified_accuracy)

Accuracy for Stratified Random Sampling using Random Forest: 0.9158878504672897


In [24]:
#3. Systematic Random Sampling

model_systematic = RandomForestClassifier(n_estimators=20, random_state=42)
model_systematic.fit(X_train_systematic, y_train_systematic)
y_pred_systematic = model_systematic.predict(X_test_systematic)
systematic_accuracy = accuracy_score(y_test_systematic, y_pred_systematic)
print("Accuracy for Systematic Random Sampling using Random Forest:", systematic_accuracy)

Accuracy for Systematic Random Sampling using Random Forest: 0.9934640522875817


In [25]:
#4. Cluster Sampling

model_cluster = RandomForestClassifier(n_estimators=20, random_state=42)
model_cluster.fit(X_train_cluster, y_train_cluster)
y_pred_cluster = model_cluster.predict(X_test_cluster)
cluster_accuracy = accuracy_score(y_test_cluster, y_pred_cluster)
print("Accuracy for Cluster Sampling using Random Forest:", cluster_accuracy)

Accuracy for Cluster Sampling using Random Forest: 1.0


In [26]:
#4. Bootstrap Sampling

model_bootstrap = RandomForestClassifier(n_estimators=20, random_state=42)
model_bootstrap.fit(X_train_bootstrap, y_train_bootstrap)
y_pred_bootstrap = model_bootstrap.predict(X_test_bootstrap)
bootstrap_accuracy = accuracy_score(y_test_bootstrap, y_pred_bootstrap)
print("Accuracy for Boostrap Sampling using Random Forest:", bootstrap_accuracy)

Accuracy for Boostrap Sampling using Random Forest: 1.0


#### 3. KNN

In [27]:
from sklearn.neighbors import KNeighborsClassifier

#1. Simple Random Sampling

model_knn_random = KNeighborsClassifier(n_neighbors=5)
model_knn_random.fit(X_train_random, y_train_random)
y_pred_knn_random = model_knn_random.predict(X_test_random)
random_accuracy = accuracy_score(y_test_random, y_pred_knn_random)
print("Accuracy for Simple Random Sampling using KNN:", random_accuracy)

Accuracy for Simple Random Sampling using KNN: 0.853448275862069


In [28]:
#2. Stratified Random Sampling 

model_knn_stratified = KNeighborsClassifier(n_neighbors=5)
model_knn_stratified.fit(X_train_stratified, y_train_stratified)
y_pred_knn_stratified = model_knn_stratified.predict(X_test_stratified)
stratified_accuracy = accuracy_score(y_test_stratified, y_pred_knn_stratified)
print("Accuracy for Stratified Random Sampling using KNN:", stratified_accuracy)

Accuracy for Stratified Random Sampling using KNN: 0.8753894080996885


In [29]:
#3. Systematic Random Sampling 

model_knn_systematic = KNeighborsClassifier(n_neighbors=5)
model_knn_systematic.fit(X_train_systematic, y_train_systematic)
y_pred_knn_systematic = model_knn_systematic.predict(X_test_systematic)
systematic_accuracy = accuracy_score(y_test_systematic, y_pred_knn_systematic)
print("Accuracy for Systematic Random Sampling using KNN:", systematic_accuracy)


Accuracy for Systematic Random Sampling using KNN: 0.8562091503267973


In [30]:
#4. Cluster Sampling 

model_knn_cluster = KNeighborsClassifier(n_neighbors=5)
model_knn_cluster.fit(X_train_cluster, y_train_cluster)
y_pred_knn_cluster = model_knn_cluster.predict(X_test_cluster)
cluster_accuracy = accuracy_score(y_test_cluster, y_pred_knn_cluster)
print("Accuracy for Cluster Sampling using KNN:", cluster_accuracy)

Accuracy for Cluster Sampling using KNN: 0.9042553191489362


In [31]:
#5. Bootstrap Sampling

model_knn_bootstrap = KNeighborsClassifier(n_neighbors=5)
model_knn_bootstrap.fit(X_train_bootstrap, y_train_bootstrap)
y_pred_knn_bootstrap = model_knn_bootstrap.predict(X_test_bootstrap)
bootstrap_accuracy = accuracy_score(y_test_bootstrap, y_pred_knn_bootstrap)
print("Accuracy for Bootstrap Sampling using KNN:", bootstrap_accuracy)

Accuracy for Bootstrap Sampling using KNN: 0.8879310344827587


#### 4. XGBoost Classifier

In [32]:
from xgboost import XGBClassifier

#1. Simple Random Sampling

model_xgb_random = XGBClassifier(random_state=42)
model_xgb_random.fit(X_train_random, y_train_random)
y_pred_xgb_random = model_xgb_random.predict(X_test_random)
xgb_accuracy = accuracy_score(y_test_random, y_pred_xgb_random)
print("Accuracy for Simple Random Sampling using XGBoost:", xgb_accuracy)

Accuracy for Simple Random Sampling using XGBoost: 0.9396551724137931


In [33]:
#2. Stratified Random Sampling

model_xgb_stratified = XGBClassifier(random_state=42)
model_xgb_stratified.fit(X_train_stratified, y_train_stratified)
y_pred_xgb_stratified = model_xgb_stratified.predict(X_test_stratified)
stratified_accuracy = accuracy_score(y_test_stratified, y_pred_xgb_stratified)
print("Accuracy for Stratified Random Sampling using XGBoost:", stratified_accuracy)

Accuracy for Stratified Random Sampling using XGBoost: 0.9875389408099688


In [34]:
#3. Systematic Random Sampling

model_xgb_systematic = XGBClassifier(random_state=42)
model_xgb_systematic.fit(X_train_systematic, y_train_systematic)
y_pred_xgb_systematic = model_xgb_systematic.predict(X_test_systematic)
systematic_accuracy = accuracy_score(y_test_systematic, y_pred_xgb_systematic)
print("Accuracy for Systematic Random Sampling using XGBoost:", systematic_accuracy)

Accuracy for Systematic Random Sampling using XGBoost: 0.9738562091503268


In [35]:
#4. Cluster Sampling

model_xgb_cluster = XGBClassifier(random_state=42)
model_xgb_cluster.fit(X_train_cluster, y_train_cluster)
y_pred_xgb_cluster = model_xgb_cluster.predict(X_test_cluster)
cluster_accuracy = accuracy_score(y_test_cluster, y_pred_xgb_cluster)
print("Accuracy for Cluster Sampling using XGBoost:", cluster_accuracy)

Accuracy for Cluster Sampling using XGBoost: 0.9787234042553191


In [36]:
#5. Bootstrap Sampling

model_xgb_bootstrap = XGBClassifier(random_state=42)
model_xgb_bootstrap.fit(X_train_bootstrap, y_train_bootstrap)
y_pred_xgb_bootstrap = model_xgb_bootstrap.predict(X_test_bootstrap)
bootstrap_accuracy = accuracy_score(y_test_bootstrap, y_pred_xgb_bootstrap)
print("Accuracy for Bootstrap Sampling using XGBoost:", bootstrap_accuracy)

Accuracy for Bootstrap Sampling using XGBoost: 0.9568965517241379


#### 5. SVC (Support Vector Classifier)

In [37]:
from sklearn.svm import SVC

#1. Simple Random Sampling

model_svc_random = SVC(random_state=42)
model_svc_random.fit(X_train_random, y_train_random)
y_pred_svc_random = model_svc_random.predict(X_test_random)
random_accuracy = accuracy_score(y_test_random, y_pred_svc_random)
print("Accuracy for Simple Random Sampling using SVC:", random_accuracy)

Accuracy for Simple Random Sampling using SVC: 0.9224137931034483


In [38]:
#2. Stratified Random Sampling 

model_svc_stratified = SVC(random_state=42)
model_svc_stratified.fit(X_train_stratified, y_train_stratified)
y_pred_svc_stratified = model_svc_stratified.predict(X_test_stratified)
stratified_accuracy = accuracy_score(y_test_stratified, y_pred_svc_stratified)
print("Accuracy for Stratified Random Sampling using SVC:", stratified_accuracy)

Accuracy for Stratified Random Sampling using SVC: 0.9750778816199377


In [39]:
#3. Systematic Random Sampling 

model_svc_systematic = SVC(random_state=42)
model_svc_systematic.fit(X_train_systematic, y_train_systematic)
y_pred_svc_systematic = model_svc_systematic.predict(X_test_systematic)
systematic_accuracy = accuracy_score(y_test_systematic, y_pred_svc_systematic)
print("Accuracy for Systematic Random Sampling using SVC:", systematic_accuracy)

Accuracy for Systematic Random Sampling using SVC: 0.9411764705882353


In [40]:
#4. Cluster Sampling

model_svc_cluster = SVC(random_state=42)
model_svc_cluster.fit(X_train_cluster, y_train_cluster)
y_pred_svc_cluster = model_svc_cluster.predict(X_test_cluster)
cluster_accuracy = accuracy_score(y_test_cluster, y_pred_svc_cluster)
print("Accuracy for Cluster Sampling using SVC:", cluster_accuracy)

Accuracy for Cluster Sampling using SVC: 0.9716312056737588


In [41]:
#5. Bootstrap Sampling

model_svc_bootstrap = SVC(random_state=42)
model_svc_bootstrap.fit(X_train_bootstrap, y_train_bootstrap)
y_pred_svc_bootstrap = model_svc_bootstrap.predict(X_test_bootstrap)
bootstrap_accuracy = accuracy_score(y_test_bootstrap, y_pred_svc_bootstrap)
print("Accuracy for Bootstrap Sampling using SVC:", bootstrap_accuracy)

Accuracy for Bootstrap Sampling using SVC: 0.9568965517241379
