In [22]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree   import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    StratifiedKFold,
    cross_validate,
    GridSearchCV,
    RandomizedSearchCV
)
from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    precision_score,
    accuracy_score,
    recall_score
)
warnings.filterwarnings('ignore')

### 1. Loading Data

In [23]:
X_train = np.load('Artifacts/X_train.npz')['arr_0']
Y_train = np.load('Artifacts/Y_train.npz')['arr_0']
X_test = np.load('Artifacts/X_test.npz')['arr_0']
Y_test = np.load('Artifacts/Y_test.npz')['arr_0']

### 2. Define Parameter Grid

In [24]:
# Define parameter grids for hyperparameter tuning

lr_param_grid ={
    'max_iter': [500, 1000, 5000, 10000],
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear']
    
}

dt_param_grid = {
    'max_depth': [4, 6, 8, 12, 16, 20],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [6, 8, 12, 16],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

xgb_param_grid = {
    'n_estimators': [100, 200, 250, 300],
    'max_depth': [3, 4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2]
}

cat_param_grid ={
    'iterations': [200, 500, 1000],
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'l2_leaf_reg': [1, 3, 5, 7],
    'border_count': [32, 64, 128]
}

param_grids ={
    'logistic_regression': lr_param_grid,
    'decision_tree': dt_param_grid,
    'random_forest': rf_param_grid,
    'xgboost' : xgb_param_grid,
    'catboost' : cat_param_grid
}

### 3. Define Multiple Models

In [25]:
models ={
    'logistic_regression': LogisticRegression(),
    'decision_tree': DecisionTreeClassifier(),
    'random_forest': RandomForestClassifier(),
    'xgboost' : XGBClassifier(),
    'catboost' : CatBoostClassifier(verbose=0, random_state=42)    
}

### 4. Configure K-Fold CV

In [26]:
cv = StratifiedKFold(
    n_splits=6,
    random_state=42,
    shuffle=True

)

In [29]:
grid_search_results = {}
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")

    param_grid = param_grids[model_name]

    grid_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=20,            # more iterations for wider search
        scoring='f1',   # change to 'f1', 'roc_auc' if needed
        cv=cv,
        random_state=42,
        n_jobs=-1
    )
    


    print(f"Fitting grid search for {model_name}")
    grid_search.fit(X_train, Y_train)

    grid_search_results[model_name] = grid_search

    print(f"{model_name} best parameters: {grid_search.best_params_}")
    print(f"{model_name} best score: {grid_search.best_score_:.4f}")


Training logistic_regression...
Fitting grid search for logistic_regression
logistic_regression best parameters: {'solver': 'lbfgs', 'max_iter': 500, 'C': 10}
logistic_regression best score: 0.7986

Training decision_tree...
Fitting grid search for decision_tree
decision_tree best parameters: {'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 12, 'criterion': 'entropy'}
decision_tree best score: 0.8014

Training random_forest...
Fitting grid search for random_forest
random_forest best parameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 16, 'criterion': 'log_loss'}
random_forest best score: 0.8481

Training xgboost...
Fitting grid search for xgboost
xgboost best parameters: {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.8}
xgboost best score: 0.8484

Training catboost...
Fitting grid search for catboost
catboost best parameters: {'learning_rate': 0.1, 'l2_leaf_reg': 3

In [28]:
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
# from sklearn.metrics import classification_report, confusion_matrix
# from sklearn.model_selection import train_test_split

# # Models
# models = {
#     'logistic_regression': LogisticRegression(),
#     'decision_tree': DecisionTreeClassifier(),
#     'random_forest': RandomForestClassifier(),
#     'xgboost': XGBClassifier(),
#     'catboost': CatBoostClassifier(verbose=0, random_state=42)
# }

# # Expanded parameter grids
# param_grids = {
#     'logistic_regression': {
#         'max_iter': [500, 1000, 5000, 10000],
#         'C': [0.01, 0.1, 1, 10],          # regularization strength
#         'solver': ['lbfgs', 'liblinear']  # solver options
#     },
#     'decision_tree': {
#         'max_depth': [4, 6, 8, 12, 16, 20],
#         'criterion': ['gini', 'entropy', 'log_loss'],
#         'min_samples_split': [2, 5, 10],
#         'min_samples_leaf': [1, 2, 4]
#     },
#     'random_forest': {
#         'n_estimators': [100, 200, 300],
#         'max_depth': [6, 8, 12, 16],
#         'criterion': ['gini', 'entropy', 'log_loss'],
#         'min_samples_split': [2, 5, 10],
#         'min_samples_leaf': [1, 2, 4]
#     },
#     'xgboost': {
#         'n_estimators': [100, 200, 250, 300],
#         'max_depth': [3, 4, 6, 8, 10],
#         'learning_rate': [0.01, 0.05, 0.1, 0.2],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'gamma': [0, 0.1, 0.2]
#     },
#     'catboost': {
#         'iterations': [200, 500, 1000],
#         'depth': [4, 6, 8, 10],
#         'learning_rate': [0.01, 0.05, 0.1, 0.2],
#         'l2_leaf_reg': [1, 3, 5, 7],
#         'border_count': [32, 64, 128]
#     }
# }

# # Split dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Randomized search for each model
# best_models = {}

# for name, model in models.items():
#     print(f"Running RandomizedSearchCV for {name}...")
    
#     search = RandomizedSearchCV(
#         estimator=model,
#         param_distributions=param_grids[name],
#         n_iter=20,            # more iterations for wider search
#         scoring='accuracy',   # change to 'f1', 'roc_auc' if needed
#         cv=3,
#         random_state=42,
#         n_jobs=-1
#     )
    
#     search.fit(X_train, y_train)
#     best_models[name] = search.best_estimator_
    
#     # Evaluate
#     y_pred = search.predict(X_test)
#     print(f"Best Parameters: {search.best_params_}")
#     print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
#     print("Classification Report:\n", classification_report(y_test, y_pred))
#     print("--------------------------------------------------")
