In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Load dataset
df = pd.read_csv('/content/Creditcard_data.csv')

# Function to balance the dataset using different techniques
def balance_dataset(features, labels, technique="SMOTE"):
    """
    Balances the dataset using the specified technique (SMOTE or undersampling).
    """
    if technique == "SMOTE":
        smote = SMOTE(random_state=42)
        features_resampled, labels_resampled = smote.fit_resample(features, labels)
    elif technique == "undersampling":
        rus = RandomUnderSampler(random_state=42)
        features_resampled, labels_resampled = rus.fit_resample(features, labels)
    else:
        raise ValueError("Unknown balancing technique specified")
    return features_resampled, labels_resampled

# Separate features and labels
X_features = df.drop("Class", axis=1)
y_labels = df["Class"]

# Apply balancing
X_resampled, y_resampled = balance_dataset(X_features, y_labels)

# Create different sample sizes
sample_sizes = [int(len(X_resampled) * 0.1 * i) for i in range(1, 6)]
sampled_data = [X_resampled.sample(n=size, random_state=42) for size in sample_sizes]
sampled_labels = [y_resampled.iloc[sample.index] for sample in sampled_data]

# Define models
models_dict = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest (default)": RandomForestClassifier(),
    "SVC (linear kernel)": SVC(kernel="linear"),
    "SVC (RBF kernel)": SVC(kernel="rbf"),
    "Random Forest (200 estimators)": RandomForestClassifier(n_estimators=200)
}

# Store results
accuracy_results = {}

# Loop over the sampled data and models to calculate accuracy
for index, (X_sample, y_sample) in enumerate(zip(sampled_data, sampled_labels)):
    sample_name = f"Sample_{index+1}"
    for model_name, model in models_dict.items():

        # Split into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

        # Train the model
        model.fit(X_train, y_train)

        # Make predictions and calculate accuracy
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)

        # Store the result in a dictionary
        if model_name not in accuracy_results:
            accuracy_results[model_name] = {}
        accuracy_results[model_name][sample_name] = accuracy

# Convert results to DataFrame for easier readability
accuracy_df = pd.DataFrame(accuracy_results).T
print("Model Performance Accuracy:\n", accuracy_df)

# Identify the best model and its performance across samples
best_model = accuracy_df.max(axis=1).idxmax()
print(f"The best performing model is: {best_model} with maximum accuracy across all samples.")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model Performance Accuracy:
                                 Sample_1  Sample_2  Sample_3  Sample_4  \
Logistic Regression             0.935484  0.918033  0.869565  0.844262   
Random Forest (default)         0.967742  0.983607  0.956522  1.000000   
SVC (linear kernel)             0.967742  0.885246  0.847826  0.868852   
SVC (RBF kernel)                0.548387  0.721311  0.673913  0.655738   
Random Forest (200 estimators)  0.967742  0.983607  0.956522  0.991803   

                                Sample_5  
Logistic Regression             0.875817  
Random Forest (default)         1.000000  
SVC (linear kernel)             0.921569  
SVC (RBF kernel)                0.660131  
Random Forest (200 estimators)  1.000000  
The best performing model is: Random Forest (default) with maximum accuracy across all samples.
