In [1]:
!pip install imblearn
!pip install optuna


Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn, imblearn

   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 2/2 [imblearn]

Successfully installed


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report, roc_auc_score

# --- Base Models ---
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

# --- Ensemble & Meta-Model ---
from sklearn.ensemble import StackingClassifier

# --- Pipeline Tools ---
# Use the imblearn pipeline to handle SMOTE
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [5]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report, roc_auc_score

# --- Base Models ---
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

# --- Ensemble & Meta-Model ---
from sklearn.ensemble import StackingClassifier

# --- Pipeline Tools ---
# SMOTE is removed, so we use the standard sklearn Pipeline
from sklearn.pipeline import Pipeline
# from imblearn.over_sampling import SMOTE # <-- SMOTE REMOVED

# --- 1. Load Data ---
try:
    creditcard = pd.read_csv('creditcard.csv', sep=',')
except FileNotFoundError:
    print("Error: 'creditcard.csv' not found.")
    print("Please download the dataset from Kaggle and place it in the same directory.")
    # As a fallback, create dummy data to allow the script to run
    print("Using dummy data to proceed...")
    X_raw = np.random.rand(1000, 30)
    y_raw = np.random.randint(0, 2, 1000)
    # Ensure at least one of each class for stratify
    y_raw[0] = 0
    y_raw[1] = 1
else:
    X_raw = creditcard.drop('Class', axis=1)
    y_raw = creditcard['Class']

# --- 2. Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.3, stratify=y_raw, random_state=42)


# --- 3. Define the Optuna Objective Function ---
def objective(trial):
    """
    This function defines the entire ML pipeline and the
    hyperparameters that Optuna will tune.
    """
    
    # --- A. Define the parameter search space ---
    n_features = trial.suggest_int('n_features_to_select', 10, 25)
    # smote_k = trial.suggest_int('smote_k_neighbors', 3, 7) # <-- SMOTE REMOVED
    knn_n = trial.suggest_int('knn_n_neighbors', 3, 15, step=2)
    rf_n_est = trial.suggest_int('rf_n_estimators', 100, 300)
    rf_max_depth = trial.suggest_int('rf_max_depth', 10, 30)
    meta_c = trial.suggest_float('meta_C', 1e-2, 1e1, log=True)

    
    # --- B. Build the FULL pipeline with these parameters ---
    scaler = RobustScaler()
    
    rfe_estimator = LogisticRegression(solver='liblinear', class_weight='balanced')
    feature_selector = RFE(estimator=rfe_estimator, n_features_to_select=n_features, step=1)

    # smote = SMOTE(k_neighbors=smote_k, random_state=42) # <-- SMOTE REMOVED
    
    # --- FIX 1: Give parallel jobs to the base estimator ---
    clf_knn = KNeighborsClassifier(n_neighbors=knn_n)
    clf_rf = RandomForestClassifier(
        n_estimators=rf_n_est, 
        max_depth=rf_max_depth, 
        random_state=42,
        class_weight='balanced', # <-- Added to compensate for no SMOTE
        n_jobs=-1  # <--- TELL RANDOM FOREST TO USE ALL CORES
    )
    clf_ada = AdaBoostClassifier(random_state=42)
    
    meta_model_lr = LogisticRegression(
        C=meta_c, 
        solver='liblinear',
        class_weight='balanced' # <-- Added to compensate for no SMOTE
    )
    
    # --- FIX 2: Give parallel jobs to the stacker ---
    stacking_clf = StackingClassifier(
        estimators=[
            ('knn', clf_knn),
            ('rf', clf_rf),
            ('ada', clf_ada)
        ],
        final_estimator=meta_model_lr,
        cv=3,
        n_jobs=-1  # <--- TELL STACKER TO BUILD MODELS IN PARALLEL
    )
    
    # --- C. Create the Final Pipeline ---
    pipeline_stack = Pipeline([
        ('scaler', scaler),
        ('feature_selector', feature_selector),
        # ('smote', smote), # <-- SMOTE REMOVED
        ('stacker', stacking_clf)
    ])
    
    # --- D. Evaluate the pipeline ---
    # --- FIX 3: Make the outer loop serial ---
    score = cross_val_score(
        pipeline_stack, 
        X_train, 
        y_train, 
        n_jobs=1,  # <--- RUN 1 CV-FOLD AT A TIME (WAS -1)
        cv=3, 
        scoring='roc_auc'
    )
    
    return np.mean(score)

# --- 4. Create and Run the Optuna Study ---
print("Starting Optuna Hyperparameter Search (with parallel fix)...")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5, show_progress_bar=True) # Using 50 trials as per your code

# --- 5. Get the Best Results ---
print("\n--- Optuna Tuning Complete! ---")
print(f"Best ROC-AUC Score (from CV): {study.best_value:.4f}")
print("Best Parameters Found:")
print(study.best_params)

# --- 6. Train the Final Model with the Best Parameters ---
print("\nTraining final model with best parameters...")
best_params = study.best_params

# Re-build the *entire* pipeline using the best params
final_scaler = RobustScaler()
final_rfe_estimator = LogisticRegression(solver='liblinear', class_weight='balanced')
final_feature_selector = RFE(
    estimator=final_rfe_estimator,
    n_features_to_select=best_params['n_features_to_select'],
    step=1
)
# final_smote = SMOTE(k_neighbors=best_params['smote_k_neighbors'], random_state=42) # <-- SMOTE REMOVED
final_knn = KNeighborsClassifier(n_neighbors=best_params['knn_n_neighbors'])
final_rf = RandomForestClassifier(
    n_estimators=best_params['rf_n_estimators'],
    max_depth=best_params['rf_max_depth'],
    random_state=42,
    class_weight='balanced', # <-- Added to compensate for no SMOTE
    n_jobs=-1 # <-- Use all cores in final model
)
final_ada = AdaBoostClassifier(random_state=42)
final_meta_lr = LogisticRegression(
    C=best_params['meta_C'], 
    solver='liblinear',
    class_weight='balanced' # <-- Added to compensate for no SMOTE
)

final_stacker = StackingClassifier(
    estimators=[
        ('knn', final_knn),
        ('rf', final_rf),
        ('ada', final_ada)
    ],
    final_estimator=final_meta_lr,
    cv=5, 
    n_jobs=-1 # <-- Use all cores in final model
)

final_pipeline = Pipeline([
    ('scaler', final_scaler),
    ('feature_selector', final_feature_selector),
    # ('smote', final_smote), # <-- SMOTE REMOVED
    ('stacker', final_stacker)
])

final_pipeline.fit(X_train, y_train)

# --- 7. Evaluate the Final Model on Unseen Test Data ---
print("\n--- Final Tuned Model Evaluation on Test Set ---")
y_pred_final = final_pipeline.predict(X_test)
y_proba_final = final_pipeline.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred_final))
print(f"Test Set ROC-AUC Score: {roc_auc_score(y_test, y_proba_final):.4f}")

[I 2025-11-15 19:46:56,145] A new study created in memory with name: no-name-e4fe671d-4934-4d95-8264-7ba0d98f8023


Starting Optuna Hyperparameter Search (with parallel fix)...


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-11-15 19:51:36,009] Trial 0 finished with value: 0.9780653230961383 and parameters: {'n_features_to_select': 24, 'knn_n_neighbors': 15, 'rf_n_estimators': 201, 'rf_max_depth': 20, 'meta_C': 3.841218645050506}. Best is trial 0 with value: 0.9780653230961383.
[I 2025-11-15 19:55:57,741] Trial 1 finished with value: 0.9746155791524925 and parameters: {'n_features_to_select': 17, 'knn_n_neighbors': 13, 'rf_n_estimators': 188, 'rf_max_depth': 18, 'meta_C': 0.010446709893457029}. Best is trial 0 with value: 0.9780653230961383.
[I 2025-11-15 20:00:26,796] Trial 2 finished with value: 0.9786504990011697 and parameters: {'n_features_to_select': 13, 'knn_n_neighbors': 7, 'rf_n_estimators': 134, 'rf_max_depth': 11, 'meta_C': 0.49342831877226245}. Best is trial 2 with value: 0.9786504990011697.
[I 2025-11-15 20:05:22,906] Trial 3 finished with value: 0.971380975590462 and parameters: {'n_features_to_select': 14, 'knn_n_neighbors': 7, 'rf_n_estimators': 147, 'rf_max_depth': 26, 'meta_C': 9.

In [8]:
import joblib

In [9]:
joblib.dump(final_pipeline, 'fraud_detection_pipeline.joblib')

# Save the list of feature names
joblib.dump(FEATURE_NAMES, 'feature_names.joblib')

print("✅ Model and feature names saved successfully.")
print("Run 'streamlit run app.py' to start the web app.")

NameError: name 'FEATURE_NAMES' is not defined

In [10]:
import pandas as pd
import numpy as np
import optuna
import joblib  # For saving the model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report, roc_auc_score

# --- Base Models ---
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

# --- Ensemble & Meta-Model ---
from sklearn.ensemble import StackingClassifier

# --- Pipeline Tools ---
# We MUST use the imblearn Pipeline to include SMOTE
from imblearn.pipeline import Pipeline 
from imblearn.over_sampling import SMOTE

# --- 1. Load Data ---
try:
    creditcard = pd.read_csv('creditcard.csv', sep=',')
    X_raw = creditcard.drop('Class', axis=1)
    y_raw = creditcard['Class']
except FileNotFoundError:
    print("Error: 'creditcard.csv' not found.")
    print("Please download the dataset from Kaggle and place it in the same directory.")
    # As a fallback, create dummy data to allow the script to run
    print("Using dummy data to proceed...")
    X_raw = pd.DataFrame(np.random.rand(1000, 30), columns=[f'V{i}' for i in range(29)] + ['Amount'])
    X_raw.rename(columns={'V0':'Time'}, inplace=True)
    y_raw = pd.Series(np.random.randint(0, 2, 1000))
    y_raw.iloc[0] = 0
    y_raw.iloc[1] = 1


# --- 2. Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.3, stratify=y_raw, random_state=42)

# Save feature names - this is CRITICAL for deployment
FEATURE_NAMES = X_train.columns.to_list()


# --- 3. Define the Optuna Objective Function ---
def objective(trial):
    """
    This function defines the entire ML pipeline and the
    hyperparameters that Optuna will tune.
    """
    
    # --- A. Define the parameter search space ---
    n_features = trial.suggest_int('n_features_to_select', 10, 25)
    smote_k = trial.suggest_int('smote_k_neighbors', 3, 7) # <-- SMOTE parameter
    knn_n = trial.suggest_int('knn_n_neighbors', 3, 15, step=2)
    rf_n_est = trial.suggest_int('rf_n_estimators', 100, 300)
    rf_max_depth = trial.suggest_int('rf_max_depth', 10, 30)
    meta_c = trial.suggest_float('meta_C', 1e-2, 1e1, log=True)

    
    # --- B. Build the FULL pipeline with these parameters ---
    scaler = RobustScaler()
    
    # RFE estimator (class_weight='balanced' helps RFE itself)
    rfe_estimator = LogisticRegression(solver='liblinear', class_weight='balanced')
    feature_selector = RFE(estimator=rfe_estimator, n_features_to_select=n_features, step=1)

    # SMOTE object
    smote = SMOTE(k_neighbors=smote_k, random_state=42)

    # Base Models (n_jobs=-1 for parallelism)
    clf_knn = KNeighborsClassifier(n_neighbors=knn_n)
    clf_rf = RandomForestClassifier(
        n_estimators=rf_n_est, 
        max_depth=rf_max_depth, 
        random_state=42,
        n_jobs=-1  # <-- Use all cores
    )
    clf_ada = AdaBoostClassifier(random_state=42)
    
    # Meta Model (no class_weight needed, SMOTE handles it)
    meta_model_lr = LogisticRegression(
        C=meta_c, 
        solver='liblinear'
    )
    
    # Stacking Classifier (n_jobs=-1 for parallelism)
    stacking_clf = StackingClassifier(
        estimators=[
            ('knn', clf_knn),
            ('rf', clf_rf),
            ('ada', clf_ada)
        ],
        final_estimator=meta_model_lr,
        cv=3,
        n_jobs=-1 # <-- Use all cores
    )
    
    # --- C. Create the Final Pipeline ---
    # The order is critical: Scale -> Select Features -> Oversample -> Stack
    pipeline_stack = Pipeline([
        ('scaler', scaler),
        ('feature_selector', feature_selector),
        ('smote', smote),
        ('stacker', stacking_clf)
    ])
    
    # --- D. Evaluate the pipeline ---
    # n_jobs=1 forces the outer CV to be serial
    score = cross_val_score(
        pipeline_stack, 
        X_train, 
        y_train, 
        n_jobs=1,  # <-- Run 1 CV-fold at a time (THE FIX)
        cv=3, 
        scoring='roc_auc'
    )
    
    return np.mean(score)

# --- 4. Create and Run the Optuna Study ---
print("Starting Optuna Hyperparameter Search (with SMOTE + parallel fix)...")
study = optuna.create_study(direction='maximize')
# Using n_trials=50. Increase if you have time.
study.optimize(objective, n_trials=10, show_progress_bar=True) 

# --- 5. Get the Best Results ---
print("\n--- Optuna Tuning Complete! ---")
print(f"Best ROC-AUC Score (from CV): {study.best_value:.4f}")
print("Best Parameters Found:")
print(study.best_params)

# --- 6. Train the Final Model with the Best Parameters ---
print("\nTraining final model with best parameters...")
best_params = study.best_params

# Re-build the *entire* pipeline using the best params
final_scaler = RobustScaler()
final_rfe_estimator = LogisticRegression(solver='liblinear', class_weight='balanced')
final_feature_selector = RFE(
    estimator=final_rfe_estimator,
    n_features_to_select=best_params['n_features_to_select'],
    step=1
)
final_smote = SMOTE(k_neighbors=best_params['smote_k_neighbors'], random_state=42)
final_knn = KNeighborsClassifier(n_neighbors=best_params['knn_n_neighbors'])
final_rf = RandomForestClassifier(
    n_estimators=best_params['rf_n_estimators'],
    max_depth=best_params['rf_max_depth'],
    random_state=42,
    n_jobs=-1 # <-- Use all cores
)
final_ada = AdaBoostClassifier(random_state=42)
final_meta_lr = LogisticRegression(
    C=best_params['meta_C'], 
    solver='liblinear'
)

final_stacker = StackingClassifier(
    estimators=[
        ('knn', final_knn),
        ('rf', final_rf),
        ('ada', final_ada)
    ],
    final_estimator=final_meta_lr,
    cv=5, 
    n_jobs=-1 # <-- Use all cores
)

final_pipeline = Pipeline([
    ('scaler', final_scaler),
    ('feature_selector', final_feature_selector),
    ('smote', final_smote),
    ('stacker', final_stacker)
])

final_pipeline.fit(X_train, y_train)

# --- 7. Evaluate the Final Model on Unseen Test Data ---
print("\n--- Final Tuned Model Evaluation on Test Set ---")
y_pred_final = final_pipeline.predict(X_test)
y_proba_final = final_pipeline.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred_final))
print(f"Test Set ROC-AUC Score: {roc_auc_score(y_test, y_proba_final):.4f}")


# --- 8. SAVE THE FINAL MODEL AND FEATURE NAMES ---
print("\n--- Saving final model and feature names... ---")

# Save the trained pipeline
joblib.dump(final_pipeline, 'fraud_detection_pipeline.joblib')

# Save the list of feature names
joblib.dump(FEATURE_NAMES, 'feature_names.joblib')

print("✅ Model and feature names saved successfully.")
print("Run 'streamlit run app.py' to start the web app.")

[I 2025-11-15 20:32:46,634] A new study created in memory with name: no-name-7c8885c8-b8c4-409c-8619-628892757317


Starting Optuna Hyperparameter Search (with SMOTE + parallel fix)...


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-11-15 20:44:27,599] Trial 0 finished with value: 0.9544549654864901 and parameters: {'n_features_to_select': 11, 'smote_k_neighbors': 5, 'knn_n_neighbors': 15, 'rf_n_estimators': 261, 'rf_max_depth': 29, 'meta_C': 0.032202295300139204}. Best is trial 0 with value: 0.9544549654864901.
[I 2025-11-15 20:55:21,814] Trial 1 finished with value: 0.9519918259272111 and parameters: {'n_features_to_select': 20, 'smote_k_neighbors': 4, 'knn_n_neighbors': 9, 'rf_n_estimators': 175, 'rf_max_depth': 20, 'meta_C': 3.5931387691795122}. Best is trial 0 with value: 0.9544549654864901.
[I 2025-11-15 21:30:24,916] Trial 2 finished with value: 0.9661210371322914 and parameters: {'n_features_to_select': 21, 'smote_k_neighbors': 7, 'knn_n_neighbors': 13, 'rf_n_estimators': 256, 'rf_max_depth': 20, 'meta_C': 0.03409615297401364}. Best is trial 2 with value: 0.9661210371322914.
[I 2025-11-15 21:40:59,972] Trial 3 finished with value: 0.9732313316179869 and parameters: {'n_features_to_select': 22, 'smo