In [60]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [61]:
df = pd.read_csv('Creditcard_data.csv')

X = df.drop('Class', axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

smote = SMOTE(random_state=42)

X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X_train.columns), pd.Series(y_resampled, name='Class')], axis=1)

print("Class distribution after SMOTE:\n", df_resampled['Class'].value_counts())

df_resampled.to_csv('resampled_data.csv', index=False)


Class distribution after SMOTE:
 0    609
1    609
Name: Class, dtype: int64


In [62]:
#1 Simple random sampling

In [63]:
df_resampled = pd.read_csv('resampled_data.csv')

random_samples = df_resampled.sample(n=5, random_state=42)

print("Randomly selected 5 samples:")
print(random_samples)



Randomly selected 5 samples:
      Time        V1        V2        V3        V4        V5        V6  \
541    103 -1.409009  0.563493  3.350717 -0.436893 -1.123467  0.256345   
259     15  1.492936 -1.029346  0.454795 -1.438026 -1.555434 -0.720961   
43     457 -0.469790 -0.009723  1.759079 -0.086506 -0.508940  0.757607   
1008   542 -1.401380  0.087852  1.088007 -0.505029  0.776751 -0.379084   
584    351  1.205444  0.008467  0.953782  1.141093 -0.491215  0.297303   

            V7        V8        V9  ...       V21       V22       V23  \
541  -0.253390 -0.795814  1.421954  ...  0.665585  0.170281 -0.231557   
259  -1.080664 -0.053127 -1.978682  ... -0.177650 -0.175074  0.040002   
43    0.256744  0.208422  0.414208  ... -0.030373  0.050121  0.248683   
1008  0.249410  0.101138  0.022450  ... -0.191622 -0.262781 -0.218040   
584  -0.503913  0.084948  0.796497  ... -0.103663 -0.046173 -0.123765   

           V24       V25       V26       V27       V28      Amount  Class  
541   1.067

In [64]:
X_samples = random_samples.drop('Class', axis=1)
y_samples = random_samples['Class']

X_train, X_test, y_train, y_test = train_test_split(X_samples, y_samples, test_size=0.2, random_state=42)

models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=1),  # Set n_neighbors to a smaller value
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

overall_accuracies = {}

for model_name, model in models.items():
    # Training the model
    model.fit(X_train, y_train)

    # Making predictions
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Store accuracy for each model
    overall_accuracies[model_name] = accuracy

# Display overall accuracies for each model
print("Overall accuracies:")
for model_name, accuracy in overall_accuracies.items():
    print(f"{model_name}: {accuracy:.2f}")


Overall accuracies:
Logistic Regression: 1.00
Random Forest: 1.00
KNN: 1.00
Naive Bayes: 1.00
Decision Tree: 1.00


In [65]:
#2 systematic sampling

In [66]:
df_resampled = pd.read_csv('resampled_data.csv')

np.random.seed(42)

sampling_interval = len(df_resampled) // 5

start_index = np.random.randint(0, sampling_interval)
systematic_samples = df_resampled.iloc[start_index::sampling_interval]

print("Systematically selected 5 samples:")
print(systematic_samples)



Systematically selected 5 samples:
      Time        V1        V2        V3        V4        V5        V6  \
102    344 -3.495984 -4.088420  2.024845 -0.740363  1.128135 -1.231702   
345    266  0.982539 -0.229085  0.003051  1.444009  0.200645  0.636756   
588    225 -0.608831  0.876837  2.495715  3.138674  0.161264 -0.107099   
831    471 -1.548626 -2.100421  2.166640  1.078007  2.068653  0.363186   
1074   261 -0.386075 -1.098853  1.303641  0.790142  1.247260  0.252076   

            V7        V8        V9  ...       V21       V22       V23  \
102  -0.086554  0.157807  1.677621  ...  0.361562 -0.173006  1.280446   
345   0.012166  0.128519 -0.361986  ... -0.315217 -1.347024 -0.033817   
588   0.515854 -0.138226 -1.035070  ... -0.262866 -0.439237 -0.006775   
831  -1.805190  0.650746  0.830583  ...  0.323880  0.913847  0.274270   
1074 -1.082861  0.432697  0.344739  ...  0.094627  0.267859  0.198437   

           V24       V25       V26       V27       V28      Amount  Class  
102  

In [67]:
X_samples = systematic_samples.drop('Class', axis=1)
y_samples = systematic_samples['Class']

X_train, X_test, y_train, y_test = train_test_split(X_samples, y_samples, test_size=0.2, random_state=42)

models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=1),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

overall_accuracies = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    overall_accuracies[model_name] = accuracy

print("Overall accuracies:")
for model_name, accuracy in overall_accuracies.items():
    print(f"{model_name}: {accuracy:.2f}")


Overall accuracies:
Logistic Regression: 1.00
Random Forest: 0.00
KNN: 1.00
Naive Bayes: 1.00
Decision Tree: 1.00


In [68]:
#3 Cluster sampling

In [69]:
df_resampled = pd.read_csv('resampled_data.csv')

np.random.seed(42)

unique_clusters = df_resampled['Class'].unique()

num_clusters = len(unique_clusters)

selected_clusters = np.random.choice(unique_clusters, num_clusters, replace=False)

cluster_samples = pd.DataFrame()

for cluster in selected_clusters:
    cluster_data = df_resampled[df_resampled['Class'] == cluster]
    cluster_samples = pd.concat([cluster_samples, cluster_data.sample(frac=0.2, random_state=42)])

print("Cluster-sampled 5 samples:")
print(cluster_samples)



Cluster-sampled 5 samples:
      Time        V1        V2        V3        V4        V5        V6  \
900    222  1.255554  0.353584  0.303499  0.692550 -0.368350 -1.069623   
974    476 -2.133580 -0.597351  0.727131  2.359753  1.188388 -0.268196   
1076   559  0.903852  0.370155  0.539111  0.608402 -0.143416 -1.040530   
1176   476 -2.952155 -2.932508  1.095367  2.082385  1.329049 -1.010687   
727    504 -1.427409 -0.949035  2.079148  0.628407  1.662714 -0.204843   
...    ...       ...       ...       ...       ...       ...       ...   
381    290  1.355157 -1.318940  0.166357 -1.574822 -1.279267 -0.182794   
255     88  1.287226 -0.824683  1.346423 -0.525628 -1.833007 -0.477715   
215    221 -1.177731  0.318501  1.727123 -0.549578  0.737292 -0.050963   
537    333 -2.977214  0.781748  2.881724 -1.627798 -1.368067  1.656876   
204    394 -0.553092  1.667591 -0.047357  0.514249  0.589388 -0.635411   

            V7        V8        V9  ...       V21       V22       V23  \
900   0.088

In [70]:
X_samples = cluster_samples.drop('Class', axis=1)
y_samples = cluster_samples['Class']

X_train, X_test, y_train, y_test = train_test_split(X_samples, y_samples, test_size=0.2, random_state=42)

models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

overall_accuracies = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    overall_accuracies[model_name] = accuracy

print("Overall accuracies:")
for model_name, accuracy in overall_accuracies.items():
    print(f"{model_name}: {accuracy:.2f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Overall accuracies:
Logistic Regression: 0.90
Random Forest: 0.96
KNN: 0.78
Naive Bayes: 0.94
Decision Tree: 0.94


In [71]:
#4 Stratified Sampling

In [72]:
df_resampled = pd.read_csv('resampled_data.csv')

seed = 42

unique_strata = df_resampled['Class'].unique()

stratified_samples = pd.DataFrame()

for stratum in unique_strata:
    stratum_data = df_resampled[df_resampled['Class'] == stratum]

    stratum_train, stratum_test = train_test_split(stratum_data, test_size=0.2, random_state=seed)

    stratified_samples = pd.concat([stratified_samples, stratum_test])

print("Stratified-sampled 5 samples:")
print(stratified_samples)


Stratified-sampled 5 samples:
      Time        V1        V2        V3        V4        V5        V6  \
297     13 -0.436905  0.918966  0.924591 -0.727219  0.915679 -0.127867   
371    446 -1.146103  1.350274  0.907209 -0.040682 -0.242920 -1.099859   
473    164  1.212057  0.246405  0.309166  0.584744 -0.381581 -0.764968   
574    164 -0.433211  1.020835  2.019730  3.003261  0.031308  0.187063   
122     60  1.107029  0.216441  0.538378  1.476398 -0.251942 -0.340680   
...    ...       ...       ...       ...       ...       ...       ...   
984    334 -0.462834 -1.151278  1.444666  0.928941  1.123609 -0.191977   
858    460 -2.076061  1.170082 -0.458382  2.081655  0.071808 -0.959373   
818     27  1.206500  0.285689  0.198064  0.505039 -0.040183 -0.311770   
1140   520 -1.008141 -1.490639  0.718630  1.532451  0.547021 -1.066081   
808    467 -2.045503  1.068909 -0.309391  1.833707  0.148666 -0.898925   

            V7        V8        V9  ...       V21       V22       V23  \
297   0.

In [73]:
X_samples = stratified_samples.drop('Class', axis=1)
y_samples = stratified_samples['Class']

X_train, X_test, y_train, y_test = train_test_split(X_samples, y_samples, test_size=0.2, random_state=seed)

models = {
    'Logistic Regression': LogisticRegression(random_state=seed),
    'Random Forest': RandomForestClassifier(random_state=seed),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=seed)
}

overall_accuracies = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    overall_accuracies[model_name] = accuracy

print("Overall accuracies:")
for model_name, accuracy in overall_accuracies.items():
    print(f"{model_name}: {accuracy:.2f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Overall accuracies:
Logistic Regression: 0.96
Random Forest: 1.00
KNN: 0.86
Naive Bayes: 0.84
Decision Tree: 0.98


In [74]:
#5 Bootstrap Sampling

In [75]:
df_resampled = pd.read_csv('resampled_data.csv')

np.random.seed(42)

bootstrap_samples = pd.DataFrame()

for _ in range(5):
    bootstrap_sample = df_resampled.sample(frac=1, replace=True, random_state=42)
    bootstrap_samples = pd.concat([bootstrap_samples, bootstrap_sample])

print("Bootstrap-sampled 5 samples:")
print(bootstrap_samples)


Bootstrap-sampled 5 samples:
      Time        V1        V2        V3        V4        V5        V6  \
1126   503 -1.220717  0.273467  1.541082 -0.120366  0.949864 -0.681046   
860    478 -1.997840  0.911104 -0.077002  1.446969  0.268547 -0.804640   
1130   570  1.188379  0.395569  0.269693  0.754876 -0.360985 -1.074455   
1095   196 -0.503261  1.081685 -0.692829  2.165364 -0.221627 -0.732617   
1044   485 -0.896104  0.397705  1.720145  0.190106  0.947011 -0.903440   
...    ...       ...       ...       ...       ...       ...       ...   
882    529 -1.999963 -2.489605  2.464195  1.135908  2.458454  0.592218   
687     46  1.216570  0.299125  0.219783  0.544157 -0.109088 -0.469529   
826    343 -0.531053 -1.210912  1.490028  0.938306  1.182986 -0.157096   
726    295 -0.416120  0.149049  0.726486 -0.158070  0.532245 -0.197277   
593    282 -0.426030  1.041184  1.063793 -0.407921  0.077615 -0.810862   

            V7        V8        V9  ...       V21       V22       V23  \
1126  0.6

In [76]:
X_samples = bootstrap_samples.drop('Class', axis=1)
y_samples = bootstrap_samples['Class']

X_train, X_test, y_train, y_test = train_test_split(X_samples, y_samples, test_size=0.2, random_state=seed)

models = {
    'Logistic Regression': LogisticRegression(random_state=seed),
    'Random Forest': RandomForestClassifier(random_state=seed),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=seed)
}

overall_accuracies = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    overall_accuracies[model_name] = accuracy

print("Overall accuracies:")
for model_name, accuracy in overall_accuracies.items():
    print(f"{model_name}: {accuracy:.2f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Overall accuracies:
Logistic Regression: 0.94
Random Forest: 1.00
KNN: 0.99
Naive Bayes: 0.93
Decision Tree: 1.00


In [77]:
accuracy_data = {
    'Logistic Regression': {
        'Simple Random Sampling': 1.00,
        'Systematic Sampling': 1.00,
        'Cluster Sampling': 0.90,
        'Stratified Sampling': 0.96,
        'Bootstrap Sampling': 0.94
    },
    'Random Forest': {
        'Simple Random Sampling': 1.00,
        'Systematic Sampling': 0.00,
        'Cluster Sampling': 0.96,
        'Stratified Sampling': 1.00,
        'Bootstrap Sampling': 1.00
    },
    'KNN': {
        'Simple Random Sampling': 1.00,
        'Systematic Sampling': 1.00,
        'Cluster Sampling': 0.78,
        'Stratified Sampling': 0.86,
        'Bootstrap Sampling': 0.99
    },
    'Naive Bayes': {
        'Simple Random Sampling': 1.00,
        'Systematic Sampling': 1.00,
        'Cluster Sampling': 0.94,
        'Stratified Sampling': 0.84,
        'Bootstrap Sampling': 0.93
    },
    'Decision Tree': {
        'Simple Random Sampling': 1.00,
        'Systematic Sampling': 1.00,
        'Cluster Sampling': 0.94,
        'Stratified Sampling': 0.98,
        'Bootstrap Sampling': 1.00
    }
}

accuracy_table = pd.DataFrame(accuracy_data)

print("Accuracy Table:")
print(accuracy_table)


Accuracy Table:
                        Logistic Regression  Random Forest   KNN  Naive Bayes  \
Simple Random Sampling                 1.00           1.00  1.00         1.00   
Systematic Sampling                    1.00           0.00  1.00         1.00   
Cluster Sampling                       0.90           0.96  0.78         0.94   
Stratified Sampling                    0.96           1.00  0.86         0.84   
Bootstrap Sampling                     0.94           1.00  0.99         0.93   

                        Decision Tree  
Simple Random Sampling           1.00  
Systematic Sampling              1.00  
Cluster Sampling                 0.94  
Stratified Sampling              0.98  
Bootstrap Sampling               1.00  


In [78]:
accuracy_table.to_csv('accuracy_table.csv')
print("Accuracy table has been saved to 'accuracy_table.csv'")
