In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


import warnings
# Suppress every UserWarning and every pandas DtypeWarning raised from here on,
# so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# prepare_data lives in the project-local Data_Setup_Main module. It returns the
# feature table X, the target table y (its columns are used as target names
# further down) and a preprocessor that becomes the first step of the Pipeline.
# NOTE(review): the DataFrame preview shown under this cell is presumably
# printed inside prepare_data itself -- confirm against Data_Setup_Main.
from Data_Setup_Main import prepare_data
X, y, preprocessor = prepare_data()


   fixture_id     season season_start_date season_end_date country league  \
0     1320059  2014-2015        2014-08-01      2015-05-22  France  FraL2   
1     1320060  2014-2015        2014-08-01      2015-05-22  France  FraL2   
2     1320061  2014-2015        2014-08-01      2015-05-22  France  FraL2   
3     1320062  2014-2015        2014-08-01      2015-05-22  France  FraL2   
4     1320063  2014-2015        2014-08-01      2015-05-22  France  FraL2   

   competition_level     kick_off_datetime       team1_name    team2_name  \
0                  2  2014-08-01T18:00:00Z            Arles       Ajaccio   
1                  2  2014-08-01T18:00:00Z       AJ Auxerre      Le Havre   
2                  2  2014-08-04T18:30:00Z            Brest      Clermont   
3                  2  2014-08-01T18:00:00Z  Gazelec Ajaccio  Valenciennes   
4                  2  2014-08-01T18:00:00Z      Chateauroux        Troyes   

   ...  stadium_surface  stadium_runningtrack  stadium_capacity  \
0  ... 

In [38]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Full pipeline: preprocessing followed by an ordinary least-squares regressor.
pipe = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("reg", LinearRegression()),
    ]
)

# Fit on the training rows, then predict the held-out rows. fit() returns the
# pipeline itself, so the two calls can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# A linear model can predict below zero; floor the predictions at 0.
y_pred = np.clip(y_pred, 0, None)

In [39]:
# Score each target column separately on the held-out split.
# Column i of y_pred is matched to column i of y_test by position.
targets = y.columns.tolist()

rows = []
for col_pos, target_name in enumerate(targets):
    actual = y_test.iloc[:, col_pos].values
    predicted = y_pred[:, col_pos]

    squared_err = mean_squared_error(actual, predicted)
    rows.append(
        {
            "target": target_name,
            "MSE": squared_err,
            "MAE": mean_absolute_error(actual, predicted),
            "RMSE": np.sqrt(squared_err),  # RMSE is derived from the MSE above
            "R2": r2_score(actual, predicted),
        }
    )

# One row per target, one column per metric.
metrics_df = pd.DataFrame(rows)

print("\n=== Per-target metrics ===")
print(metrics_df.round(4))


=== Per-target metrics ===
     target     MSE     MAE    RMSE      R2
0  team1_yc  1.3573  0.9412  1.1650  0.0769
1  team2_yc  1.6093  1.0164  1.2686 -0.0311
2  team1_rc  0.0471  0.1059  0.2171 -0.0302
3  team2_rc  0.0809  0.1448  0.2844 -0.0478


In [40]:
# Sanity check: show the first five prediction rows, rounded to 3 decimals.
# Columns follow the order of `targets`.
print("\nSample predictions (first 5 rows, order:", targets, ")")
print(y_pred[:5].round(3))


Sample predictions (first 5 rows, order: ['team1_yc', 'team2_yc', 'team1_rc', 'team2_rc'] )
[[2.136 1.885 0.02  0.097]
 [1.835 1.139 0.071 0.029]
 [0.937 1.313 0.081 0.047]
 [1.642 1.842 0.071 0.041]
 [1.95  1.681 0.058 0.096]]


In [41]:

# Population variance (ddof=0) of each target on the test split; a Series
# indexed by target column name.
target_variance = y_test.var(axis=0, ddof=0)

# Normalise so the weights sum to 1: higher-variance targets count for more.
weights = target_variance / target_variance.sum()

print("\n=== Weights used (sum to 1) ===")
print(weights.round(4))

# Weighted average of every per-target metric. The weights are applied by
# position, which relies on metrics_df rows being in the same order as the
# y_test columns.
weighted_summary = {}
for metric_name in ("MSE", "MAE", "RMSE", "R2"):
    weighted_summary[metric_name] = np.average(
        metrics_df[metric_name].values, weights=weights.values
    )

# Keep the individual results available under their own names.
weighted_mse = weighted_summary["MSE"]
weighted_mae = weighted_summary["MAE"]
weighted_rmse = weighted_summary["RMSE"]
weighted_r2 = weighted_summary["R2"]

print("\n=== Weighted average across targets ===")
print(pd.Series(weighted_summary).round(4))



=== Weights used (sum to 1) ===
team1_yc    0.4662
team2_yc    0.4948
team1_rc    0.0145
team2_rc    0.0245
dtype: float64

=== Weighted average across targets ===
MSE     1.4317
MAE     0.9468
RMSE    1.1810
R2      0.0188
dtype: float64


In [42]:
# True vs Predicted side-by-side (first 10 rows).
# Round predictions to the nearest integer, floor at 0 and cast to int so they
# can be compared with the integer targets.
y_pred_rounded = np.clip(np.rint(y_pred), 0, None).astype(int)

# Label each prediction column "<target>_pred" and reuse the test-set index.
pred_columns = [f"{c}_pred" for c in targets]
y_pred_df = pd.DataFrame(y_pred_rounded, index=y_test.index, columns=pred_columns)

# Drop both indices before concatenating so rows line up by position (0..n-1).
comparison = pd.concat(
    [frame.reset_index(drop=True) for frame in (y_test, y_pred_df)],
    axis=1,
)

print("\n=== True vs Predicted (first 10 rows) ===")
print(comparison.head(10).round(2))



=== True vs Predicted (first 10 rows) ===
   team1_yc  team2_yc  team1_rc  team2_rc  team1_yc_pred  team2_yc_pred  \
0         6         2         0         0              2              2   
1         0         0         0         0              2              1   
2         2         2         0         0              1              1   
3         1         1         0         0              2              2   
4         3         1         0         0              2              2   
5         0         1         0         0              1              2   
6         2         5         0         0              1              2   
7         2         1         0         0              2              1   
8         0         0         0         0              1              1   
9         2         5         0         0              2              2   

   team1_rc_pred  team2_rc_pred  
0              0              0  
1              0              0  
2              0             