In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Load dataset
data = pd.read_csv('Creditcard_data.csv')

def balance_data(X, y, technique="SMOTE"):
    if technique == "SMOTE":
        smote = SMOTE(random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X, y)
    elif technique == "undersampling":
        rus = RandomUnderSampler(random_state=42)
        X_resampled, y_resampled = rus.fit_resample(X, y)
    else:
        raise ValueError("Invalid balancing technique")
    return X_resampled, y_resampled

X = data.drop("Class", axis=1)
y = data["Class"]

# Apply balancing technique
X_balanced, y_balanced = balance_data(X, y)

# Define sample sizes and sample data
sample_sizes = [int(len(X_balanced) * 0.1 * i) for i in range(1, 6)]
samples = [X_balanced.sample(n=size, random_state=42) for size in sample_sizes]
sample_labels = [y_balanced.iloc[sample.index] for sample in samples]

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVC (Linear)": SVC(kernel="linear"),
    "SVC (RBF)": SVC(kernel="rbf"),
    "Random Forest (200 Estimators)": RandomForestClassifier(n_estimators=200)
}

# Define results dictionary to store accuracies
results = {}

# Train models and record accuracies
for i, (X_sample, y_sample) in enumerate(zip(samples, sample_labels)):
    technique_name = f"{(i + 1) * 10}%"  # Use more descriptive names
    for model_name, model in models.items():

        # Split data into train/test sets
        X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

        # Train model
        model.fit(X_train, y_train)

        # Predict and calculate accuracy
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Store results
        if model_name not in results:
            results[model_name] = {}
        results[model_name][technique_name] = accuracy

# Convert results dictionary into DataFrame
results_df = pd.DataFrame(results).T
print("Accuracy Results:\n", results_df)

# Find the model and technique with the highest accuracy
best_model = results_df.max().idxmax()  # Get the column name of the highest accuracy
best_technique = results_df[best_model].idxmax()  # Get the row name of the highest accuracy
best_accuracy = results_df[best_model].max()  # Get the highest accuracy score

print(f"\nBest Model: {best_model}")
print(f"Best Sampling Technique: {best_technique}")
print(f"Best Accuracy: {best_accuracy:.4f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy Results:
                                      10%       20%       30%       40%  \
Logistic Regression             0.935484  0.918033  0.869565  0.844262   
Random Forest                   0.967742  0.983607  0.956522  1.000000   
SVC (Linear)                    0.967742  0.885246  0.847826  0.868852   
SVC (RBF)                       0.548387  0.721311  0.673913  0.655738   
Random Forest (200 Estimators)  0.967742  0.983607  0.956522  0.991803   

                                     50%  
Logistic Regression             0.875817  
Random Forest                   1.000000  
SVC (Linear)                    0.921569  
SVC (RBF)                       0.660131  
Random Forest (200 Estimators)  1.000000  

Best Model: 40%
Best Sampling Technique: Random Forest
Best Accuracy: 1.0000
