In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    mean_squared_error, 
    mean_absolute_error, 
    r2_score,
    accuracy_score, 
    recall_score, 
    precision_score, 
    f1_score,
    confusion_matrix
)

from sklearn.linear_model import (
    LinearRegression, 
    PoissonRegressor, 
    GammaRegressor, 
    TweedieRegressor, 
    LogisticRegression)

import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
np.random.seed(42)

# Generate synthetic dataset
n_samples = 500
n_features = 5

# Features
X = np.random.normal(0, 1, size=(n_samples, n_features))

# Coefficients
coef = np.array([1.5, -2.0, 0.5, 3.0, -1.0])

# Linear combination + noise
linear_part = X @ coef

# Target variable: Gaussian (for LinearRegression)
y_gaussian = linear_part + np.random.normal(0, 1, n_samples)

# Target variable: Poisson (counts, for PoissonRegressor)
y_poisson = np.random.poisson(np.exp(linear_part / 3))  # scale to avoid huge counts

# Target variable: Gamma (positive continuous)
y_gamma = np.random.gamma(shape=2.0, scale=np.exp(linear_part / 5))

# Create DataFrame
df_regression = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(n_features)])
df_regression["target_gaussian"] = y_gaussian
df_regression["target_poisson"] = y_poisson
df_regression["target_gamma"] = y_gamma

print(df_regression.head())

   feature_0  feature_1  feature_2  feature_3  feature_4  target_gaussian  \
0   0.496714  -0.138264   0.647689   1.523030  -0.234153         6.719300   
1  -0.234137   1.579213   0.767435  -0.469474   0.542560        -4.108712   
2  -0.463418  -0.465730   0.241962  -1.913280  -1.724918        -3.988917   
3  -0.562288  -1.012831   0.314247  -0.908024  -1.412304        -0.584651   
4   1.465649  -0.225776   0.067528  -1.424748  -0.544383        -2.131223   

   target_poisson  target_gamma  
0               7     14.142504  
1               0      0.116827  
2               0      0.881585  
3               0      2.776460  
4               1      0.892098  


In [None]:
#df = pd.read_csv("your_data.csv")

In [None]:
#add tests for he distribuiton - this can the feed into the model fitting

In [None]:


df = df_regression.copy().drop(['target_poisson', 'target_gamma'], axis =1)

X = df.drop("target_gaussian", axis=1)
y = df["target_gaussian"]

is_classification = y.nunique() <= 10 and y.dtype != float

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
if is_classification:
    models = {
        "Logistic Regression (GLM)": {
            "model": LogisticRegression(max_iter=1000),
            "params": {
                "model__C": [0.01, 0.1, 1, 10]
            }
        }
    }
else:
    models = {
        "Linear Regression (Gaussian GLM)": {"model": LinearRegression(), "params": {}},
        "Poisson GLM": {
            "model": PoissonRegressor(max_iter=500),
            "params": {"model__alpha": [0.01, 0.1, 1.0]}
        },
        "Gamma GLM": {
            "model": GammaRegressor(max_iter=500),
            "params": {"model__alpha": [0.01, 0.1, 1.0]}
        },
        "Tweedie GLM (compound Poisson)": {
            "model": TweedieRegressor(power=1.5, max_iter=500),
            "params": {"model__alpha": [0.01, 0.1, 1.0]}
        },
    }

results = []

In [11]:
for name, cfg in models.items():
    print(f"\nRunning → {name}")

    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", cfg["model"])
    ])

    tuning = GridSearchCV(
        pipe, cfg["params"], cv=5,
        scoring="neg_mean_squared_error" if not is_classification else "accuracy",
        n_jobs=-1, refit=True
    )
    tuning.fit(X_train, y_train)

    y_pred = tuning.predict(X_test)

    # METRICS — CLASSIFICATION
    if is_classification:
        results.append({
            "Model": name,
            "Best Params": tuning.best_params_,
            "Accuracy": accuracy_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "F1 Score": f1_score(y_test, y_pred),
        })

        print(confusion_matrix(y_test, y_pred))
        continue


    # METRICS — REGRESSION (AIC + BIC INCLUDED)
    # Residual sum of squares
    RSS = np.sum((y_test - y_pred) ** 2)
    n = len(y_test)
    k = X_train.shape[1]  # number of parameters

    AIC = n * np.log(RSS / n) + 2 * k
    BIC = n * np.log(RSS / n) + k * np.log(n)

    results.append({
        "Model": name,
        "Best Params": tuning.best_params_,
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "MAE": mean_absolute_error(y_test, y_pred),
        "R² Score": r2_score(y_test, y_pred),
        "AIC": AIC,
        "BIC": BIC
    })



Running → Linear Regression (Gaussian GLM)

Running → Poisson GLM


ValueError: 
All the 15 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Dominic\Documents\Quant\Model Templates\.conda\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Dominic\Documents\Quant\Model Templates\.conda\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Dominic\Documents\Quant\Model Templates\.conda\Lib\site-packages\sklearn\pipeline.py", line 663, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\Dominic\Documents\Quant\Model Templates\.conda\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Dominic\Documents\Quant\Model Templates\.conda\Lib\site-packages\sklearn\linear_model\_glm\glm.py", line 225, in fit
    raise ValueError(
ValueError: Some value(s) of y are out of the valid range of the loss 'HalfPoissonLoss'.


In [None]:
results_df = pd.DataFrame(results)
print("\n===== MODEL PERFORMANCE SUMMARY =====")
print(results_df)

In [None]:
if is_classification:
    plt.figure(figsize=(10, 4))
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d")
    plt.title("Confusion Matrix")
    plt.show()
else:
    results_df.plot(x="Model", y="AIC", kind="bar", title="AIC Comparison")
    plt.show()

    results_df.plot(x="Model", y="BIC", kind="bar", title="BIC Comparison")
    plt.show()