In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import Perceptron
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE

# Load the dataset
data = pd.read_csv('/content/Creditcard_data.csv')
print(data.head())
print(data.info())

# Separate features and target variable
X = data.drop("Class", axis=1)
y = data["Class"]

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=10)
X_balanced, y_balanced = smote.fit_resample(X, y)

# Display class distributions
print("Original class distribution:")
print(y.value_counts())
print("Balanced class distribution:")
print(y_balanced.value_counts())

# Define sample size
sample_size = 1000

# Generate different samples
sampled_data = {
    "Technique1": X_balanced.sample(n=sample_size, random_state=15),
    "Technique2": X_balanced.sample(n=sample_size, random_state=25),
    "Technique3": X_balanced.iloc[::len(X_balanced) // sample_size, :],
    "Technique4": X_balanced.sample(n=sample_size, random_state=35),
    "Technique5": X_balanced.sample(n=sample_size, random_state=45),
}

# Link samples with their target labels
sample_targets = {
    name: (sample, y_balanced.loc[sample.index])
    for name, sample in sampled_data.items()
}

# Define alternative models
ml_models = {
    "NaiveBayes": GaussianNB(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "Perceptron": Perceptron(random_state=42),
    "GradBoost": GradientBoostingClassifier(random_state=42),
}

# Store results
accuracy_results = {}

# Train and evaluate models
for technique_name, (X_sample, y_sample) in sample_targets.items():
    print(f"Processing sampling technique: {technique_name}...")
    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, random_state=20)

    for model_name, model in ml_models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        acc = accuracy_score(y_test, predictions)

        if technique_name not in accuracy_results:
            accuracy_results[technique_name] = {}
        accuracy_results[technique_name][model_name] = acc

# Convert results to DataFrame
accuracy_matrix = pd.DataFrame.from_dict(accuracy_results, orient='index')

print("Accuracy Matrix:")
print(accuracy_matrix)

# Save accuracy matrix to CSV
accuracy_matrix.to_csv("accuracy_matrix.csv")

# Find the best sampling technique for each model
best_methods = accuracy_matrix.idxmax()
print("Best Sampling Technique for Each Model:")
print(best_methods)

# Save best combinations to a CSV file
best_methods.to_csv("best_combinations.csv")


   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 354, number of negative: 346
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000443 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7011
[LightGBM] [Info] Number of data points in the train set: 700, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505714 -> initscore=0.022858
[LightGBM] [Info] Start training from score 0.022858
Processing sampling technique: Technique2...
[LightGBM] [Info] Number of positive: 358, number of negative: 342
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001231 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6997
[LightGBM] [Info] Number of data points in the train set: 700, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511429 -> initscore=0.045722
[LightGBM] [Info] Start training fro

Parameters: { "use_label_encoder" } are not used.



Processing sampling technique: Technique3...
[LightGBM] [Info] Number of positive: 550, number of negative: 518
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000668 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7638
[LightGBM] [Info] Number of data points in the train set: 1068, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.514981 -> initscore=0.059943
[LightGBM] [Info] Start training from score 0.059943


Parameters: { "use_label_encoder" } are not used.



Processing sampling technique: Technique4...
[LightGBM] [Info] Number of positive: 337, number of negative: 363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000486 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7011
[LightGBM] [Info] Number of data points in the train set: 700, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.481429 -> initscore=-0.074320
[LightGBM] [Info] Start training from score -0.074320


Parameters: { "use_label_encoder" } are not used.



Processing sampling technique: Technique5...
[LightGBM] [Info] Number of positive: 338, number of negative: 362
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000484 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7004
[LightGBM] [Info] Number of data points in the train set: 700, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.482857 -> initscore=-0.068598
[LightGBM] [Info] Start training from score -0.068598


Parameters: { "use_label_encoder" } are not used.



Accuracy Matrix:
            NaiveBayes   XGBoost  LightGBM  Perceptron  GradBoost
Technique1    0.773333  0.980000  0.990000    0.716667   0.986667
Technique2    0.876667  0.970000  0.983333    0.503333   0.973333
Technique3    0.838428  0.980349  0.986900    0.462882   0.982533
Technique4    0.866667  0.986667  0.993333    0.600000   0.990000
Technique5    0.783333  0.990000  0.986667    0.620000   0.980000
Best Sampling Technique for Each Model:
NaiveBayes    Technique2
XGBoost       Technique5
LightGBM      Technique4
Perceptron    Technique1
GradBoost     Technique4
dtype: object
