In [4]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    mean_squared_error, 
    mean_absolute_error, 
    r2_score,
    accuracy_score, 
    recall_score, 
    precision_score, 
    f1_score,
    confusion_matrix
)

from sklearn.linear_model import (
    LinearRegression, 
    PoissonRegressor, 
    GammaRegressor, 
    TweedieRegressor, 
    LogisticRegression)

import matplotlib.pyplot as plt
import seaborn as sns



In [None]:
# Load the dataset and separate the predictors from the response column.
df = pd.read_csv("your_data.csv")

X = df.drop("target", axis=1)
y = df["target"]

# Heuristic task detection: a target with at most 10 distinct values that is
# not floating point is treated as a class label; anything else is regression.
# `is_float_dtype` is used instead of `y.dtype != float` because the latter
# only matches float64, so float32 or nullable Float64 targets would be
# misread as class labels.
is_classification = y.nunique() <= 10 and not pd.api.types.is_float_dtype(y)

# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Candidate estimators and their hyper-parameter grids.
# Grid keys carry the "model__" prefix because each estimator is later placed
# in a Pipeline under the step name "model".
if is_classification:
    models = {
        "Logistic Regression (GLM)": {
            "model": LogisticRegression(max_iter=1000),
            "params": {
                "model__C": [0.01, 0.1, 1, 10]
            }
        }
    }
else:
    candidate_models = {
        "Linear Regression (Gaussian GLM)": {"model": LinearRegression(), "params": {}},
        "Poisson GLM": {
            "model": PoissonRegressor(max_iter=500),
            "params": {"model__alpha": [0.01, 0.1, 1.0]}
        },
        "Gamma GLM": {
            "model": GammaRegressor(max_iter=500),
            "params": {"model__alpha": [0.01, 0.1, 1.0]}
        },
        "Tweedie GLM (compound Poisson)": {
            "model": TweedieRegressor(power=1.5, max_iter=500),
            "params": {"model__alpha": [0.01, 0.1, 1.0]}
        },
    }

    # The non-Gaussian families only accept targets inside their support:
    # Poisson and Tweedie(power=1.5) need y >= 0, Gamma needs y > 0.
    # Fitting outside that range raises, which would abort the whole
    # comparison loop, so unsupported families are dropped here instead.
    y_min = y_train.min()
    target_supported = {
        "Poisson GLM": y_min >= 0,
        "Gamma GLM": y_min > 0,
        "Tweedie GLM (compound Poisson)": y_min >= 0,
    }

    models = {}
    for label, spec in candidate_models.items():
        # Families without an entry (plain linear regression) have no
        # restriction on the target range.
        if target_supported.get(label, True):
            models[label] = spec
        else:
            print(f"Skipping {label}: target minimum {y_min} is outside the family's support")

# One record per fitted model is appended here by the training loop.
results = []

In [None]:
# Start from an empty list so that re-running this cell does not append
# duplicate rows to the summary.
results = []

# The cross-validation objective depends only on the task type, so it is
# chosen once rather than on every iteration.
scoring = "accuracy" if is_classification else "neg_mean_squared_error"

for name, cfg in models.items():
    print(f"\nRunning → {name}")

    # Standardise features, then fit the candidate estimator. The step name
    # "model" must match the "model__" prefix used in the parameter grids.
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", cfg["model"])
    ])

    # 5-fold grid search; refit=True retrains the best configuration on the
    # full training split so `tuning` can be used directly for prediction.
    tuning = GridSearchCV(
        pipe, cfg["params"], cv=5,
        scoring=scoring,
        n_jobs=-1, refit=True
    )
    tuning.fit(X_train, y_train)

    y_pred = tuning.predict(X_test)

    # METRICS — CLASSIFICATION
    if is_classification:
        # The scikit-learn default average="binary" raises on targets with
        # more than two classes, which the task heuristic permits. Fall back
        # to support-weighted averaging in the multiclass case.
        average = "binary" if len(tuning.classes_) <= 2 else "weighted"

        results.append({
            "Model": name,
            "Best Params": tuning.best_params_,
            "Accuracy": accuracy_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred, average=average),
            "Precision": precision_score(y_test, y_pred, average=average),
            "F1 Score": f1_score(y_test, y_pred, average=average),
        })

        print(confusion_matrix(y_test, y_pred))
        continue

    # METRICS — REGRESSION (AIC + BIC INCLUDED)
    # AIC/BIC are in-sample criteria: they penalise the likelihood achieved
    # on the data the model was fitted to, so they are computed from training
    # residuals. The held-out split is used only for RMSE / MAE / R².
    # NOTE: this is the least-squares (Gaussian) form of the criteria. For
    # the Poisson / Gamma / Tweedie fits it is not based on their own
    # likelihood, so treat it as a rough comparison only.
    train_residuals = np.asarray(y_train) - tuning.predict(X_train)
    rss = float(np.sum(train_residuals ** 2))
    n = len(y_train)
    k = X_train.shape[1] + 1  # one coefficient per feature plus the intercept

    # Guard against log(0) when the fit is exact.
    mean_sq_resid = max(rss / n, np.finfo(float).tiny)

    AIC = n * np.log(mean_sq_resid) + 2 * k
    BIC = n * np.log(mean_sq_resid) + k * np.log(n)

    results.append({
        "Model": name,
        "Best Params": tuning.best_params_,
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "MAE": mean_absolute_error(y_test, y_pred),
        "R² Score": r2_score(y_test, y_pred),
        "AIC": AIC,
        "BIC": BIC
    })


In [None]:
# Collect the per-model metric records into one table and print it under a
# banner line.
results_df = pd.DataFrame(results)
print("\n===== MODEL PERFORMANCE SUMMARY =====", results_df, sep="\n")

In [None]:
# Visual summary: a confusion-matrix heatmap for classification, or one bar
# chart per information criterion for regression.
if not is_classification:
    for criterion in ("AIC", "BIC"):
        results_df.plot.bar(x="Model", y=criterion, title=f"{criterion} Comparison")
        plt.show()
else:
    # `y_pred` holds the predictions left over from the last iteration of
    # the training loop.
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", ax=ax)
    ax.set_title("Confusion Matrix")
    plt.show()