In [2]:
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [4]:
file_path = "Creditcard_data.csv"  # Replace with your file path
dataset = pd.read_csv(file_path)

In [5]:
majority = dataset[dataset['Class'] == 0]
minority = dataset[dataset['Class'] == 1]

In [6]:
minority_oversampled = resample(
    minority, replace=True, n_samples=len(majority), random_state=42
)


In [7]:
balanced_dataset = pd.concat([majority, minority_oversampled]).sample(frac=1, random_state=42)


In [8]:
sample_size = int(0.2 * len(balanced_dataset))
samples = {
    f"Sampling{i+1}": balanced_dataset.sample(n=sample_size, random_state=42+i)
    for i in range(5)
}


In [13]:
models = {
    "M1 (Logistic Regression)": LogisticRegression(max_iter=2000, random_state=42),
    "M2 (Random Forest)": RandomForestClassifier(random_state=42),
    "M3 (SVM)": SVC(random_state=42),
    "M4 (Decision Tree)": DecisionTreeClassifier(random_state=42),
    "M5 (K-Nearest Neighbors)": KNeighborsClassifier()
}

In [14]:
results = {sample_name: {} for sample_name in samples.keys()}

for sample_name, sample in samples.items():
    # Split into features and target
    X = sample.drop(columns=["Class"])
    y = sample["Class"]
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Train each model
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results[sample_name][model_name] = accuracy

# Step 5: Display results
results_df = pd.DataFrame(results)
print("Accuracy Results (Columns are Samples, Rows are Models):")
print(results_df)

Accuracy Results (Columns are Samples, Rows are Models):
                          Sampling1  Sampling2  Sampling3  Sampling4  \
M1 (Logistic Regression)   0.923913   0.913043   0.891304   0.880435   
M2 (Random Forest)         1.000000   0.989130   0.989130   0.989130   
M3 (SVM)                   0.641304   0.684783   0.706522   0.717391   
M4 (Decision Tree)         0.978261   0.923913   0.934783   0.989130   
M5 (K-Nearest Neighbors)   0.923913   0.956522   0.891304   0.945652   

                          Sampling5  
M1 (Logistic Regression)   0.891304  
M2 (Random Forest)         1.000000  
M3 (SVM)                   0.684783  
M4 (Decision Tree)         0.989130  
M5 (K-Nearest Neighbors)   0.902174  
