In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

from Data_Setup_Main import prepare_data
X, y, preprocessor = prepare_data()


   fixture_id     season season_start_date season_end_date country league  \
0     1320059  2014-2015        2014-08-01      2015-05-22  France  FraL2   
1     1320060  2014-2015        2014-08-01      2015-05-22  France  FraL2   
2     1320061  2014-2015        2014-08-01      2015-05-22  France  FraL2   
3     1320062  2014-2015        2014-08-01      2015-05-22  France  FraL2   
4     1320063  2014-2015        2014-08-01      2015-05-22  France  FraL2   

   competition_level     kick_off_datetime       team1_name    team2_name  \
0                  2  2014-08-01T18:00:00Z            Arles       Ajaccio   
1                  2  2014-08-01T18:00:00Z       AJ Auxerre      Le Havre   
2                  2  2014-08-04T18:30:00Z            Brest      Clermont   
3                  2  2014-08-01T18:00:00Z  Gazelec Ajaccio  Valenciennes   
4                  2  2014-08-01T18:00:00Z      Chateauroux        Troyes   

   ...  stadium_surface  stadium_runningtrack  stadium_capacity  \
0  ... 

In [34]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [35]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=123)

alpha_grid = [1e-6, 3e-6, 1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 1e-2, 1e-1]
results = []

for a in alpha_grid:
    pipe_try = Pipeline(steps=[
        ("prep", preprocessor),
        ("reg", MultiOutputRegressor(PoissonRegressor(alpha=a, max_iter=3000, tol=1e-6)))
    ])
    pipe_try.fit(X_tr, y_tr)
    y_val_pred = np.clip(pipe_try.predict(X_val), 0, None)

    # macro MSE across all outputs 
    macro_mse = mean_squared_error(y_val, y_val_pred, multioutput="uniform_average")
    results.append((a, macro_mse))

best_alpha, best_macro_mse = min(results, key=lambda t: t[1])
print("Alpha sweep (alpha, macro MSE):")
print(pd.DataFrame(results, columns=["alpha","macro_MSE"]).sort_values("macro_MSE"))
print(f"\nBest alpha on validation: {best_alpha} (macro MSE={best_macro_mse:.6f})")


Alpha sweep (alpha, macro MSE):
      alpha  macro_MSE
8  0.100000   0.860828
7  0.010000   0.865284
6  0.001000   0.872425
5  0.000300   0.875388
4  0.000100   0.877538
3  0.000030   0.879142
2  0.000010   0.880000
1  0.000003   0.880462
0  0.000001   0.880672

Best alpha on validation: 0.1 (macro MSE=0.860828)


In [36]:
# Pipeline: preprocess -> Poisson (wrapped for multi-output)
pipe = Pipeline(steps=[
    ("prep", preprocessor),
    ("reg", MultiOutputRegressor(
        PoissonRegressor(alpha=best_alpha, max_iter=1000, tol=1e-6)  
    ))
])

# Fit
pipe.fit(X_train, y_train)

# Predict
y_pred = pipe.predict(X_test)
# Clip negatives to 0
y_pred = np.clip(y_pred, 0, None)

In [37]:
targets = y.columns.tolist()
rows = []
for i, name in enumerate(targets):
    ytest = y_test.iloc[:, i].values
    ypred = y_pred[:, i]

    mse  = mean_squared_error(ytest, ypred)
    mae  = mean_absolute_error(ytest, ypred)
    rmse = np.sqrt(mse)
    r2   = r2_score(ytest, ypred)

    rows.append({"target": name, "MSE": mse, "MAE": mae, "RMSE": rmse, "R2": r2})

metrics_df = pd.DataFrame(rows)

print("\n=== Per-target metrics ===")
print(metrics_df.round(4))


=== Per-target metrics ===
     target     MSE     MAE    RMSE      R2
0  team1_yc  1.3643  0.9534  1.1680  0.0721
1  team2_yc  1.5319  0.9984  1.2377  0.0184
2  team1_rc  0.0456  0.1007  0.2136  0.0024
3  team2_rc  0.0770  0.1420  0.2776  0.0022


In [38]:
# Quick peek at first 5 predictions for sanity check
print("\nSample predictions (first 5 rows, order:", targets, ")")
print(np.round(y_pred[:5], 3))


Sample predictions (first 5 rows, order: ['team1_yc', 'team2_yc', 'team1_rc', 'team2_rc'] )
[[1.804 1.819 0.057 0.064]
 [1.745 1.512 0.068 0.07 ]
 [1.213 1.424 0.057 0.059]
 [1.565 1.713 0.057 0.072]
 [1.743 1.793 0.048 0.081]]


In [39]:

w = y_test.var(axis=0, ddof=0)   # variance per target (Series aligned with columns)

weights = w / w.sum()

print("\n=== Weights used (sum to 1) ===")
print(weights.round(4))

# compute weighted averages
weighted_mse  = np.average(metrics_df["MSE"].values,  weights=weights.values)
weighted_mae  = np.average(metrics_df["MAE"].values,  weights=weights.values)
weighted_rmse = np.average(metrics_df["RMSE"].values, weights=weights.values)
weighted_r2   = np.average(metrics_df["R2"].values,   weights=weights.values)

print("\n=== Weighted average across targets ===")
print(pd.Series({
    "MSE":  weighted_mse,
    "MAE":  weighted_mae,
    "RMSE": weighted_rmse,
    "R2":   weighted_r2
}).round(4))



=== Weights used (sum to 1) ===
team1_yc    0.4662
team2_yc    0.4948
team1_rc    0.0145
team2_rc    0.0245
dtype: float64

=== Weighted average across targets ===
MSE     1.3966
MAE     0.9435
RMSE    1.1669
R2      0.0428
dtype: float64


In [40]:
# True vs Predicted side-by-side (first 10 rows) 
y_pred_rounded = np.rint(y_pred).clip(min=0).astype(int)

y_pred_df = pd.DataFrame(y_pred_rounded, columns=[f"{c}_pred" for c in targets], index=y_test.index)

comparison = pd.concat([y_test.reset_index(drop=True), y_pred_df.reset_index(drop=True)], axis=1)

print("\n=== True vs Predicted (first 10 rows) ===")
print(comparison.head(10).round(2))



=== True vs Predicted (first 10 rows) ===
   team1_yc  team2_yc  team1_rc  team2_rc  team1_yc_pred  team2_yc_pred  \
0         6         2         0         0              2              2   
1         0         0         0         0              2              2   
2         2         2         0         0              1              1   
3         1         1         0         0              2              2   
4         3         1         0         0              2              2   
5         0         1         0         0              1              2   
6         2         5         0         0              1              2   
7         2         1         0         0              2              2   
8         0         0         0         0              1              2   
9         2         5         0         0              2              2   

   team1_rc_pred  team2_rc_pred  
0              0              0  
1              0              0  
2              0             