In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import warnings
# From this point on, ignore every UserWarning and every pandas DtypeWarning.
# NOTE(review): the UserWarning filter is broad and also hides such warnings
# raised by any other library used below — confirm this is intended.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# Project-local loader. It returns three objects that the cells below use as
# the feature table (X), the target table (y) and a preprocessing step placed
# at the front of each Pipeline.
# NOTE(review): the table preview in this cell's output is presumably printed
# inside prepare_data() — confirm against Data_Setup_Main.
from Data_Setup_Main import prepare_data
X, y, preprocessor = prepare_data()


   fixture_id     season season_start_date season_end_date country league  \
0     1320059  2014-2015        2014-08-01      2015-05-22  France  FraL2   
1     1320060  2014-2015        2014-08-01      2015-05-22  France  FraL2   
2     1320061  2014-2015        2014-08-01      2015-05-22  France  FraL2   
3     1320062  2014-2015        2014-08-01      2015-05-22  France  FraL2   
4     1320063  2014-2015        2014-08-01      2015-05-22  France  FraL2   

   competition_level     kick_off_datetime       team1_name    team2_name  \
0                  2  2014-08-01T18:00:00Z            Arles       Ajaccio   
1                  2  2014-08-01T18:00:00Z       AJ Auxerre      Le Havre   
2                  2  2014-08-04T18:30:00Z            Brest      Clermont   
3                  2  2014-08-01T18:00:00Z  Gazelec Ajaccio  Valenciennes   
4                  2  2014-08-01T18:00:00Z      Chateauroux        Troyes   

   ...  stadium_surface  stadium_runningtrack  stadium_capacity  \
0  ... 

In [30]:
# Train/test split: hold out 20% of the rows as the final test set.
# random_state=42 fixes the shuffle so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [31]:
# Carve a validation set (20% of the training rows) out of the training split;
# the held-out test set is not touched during model selection.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.20, random_state=123
)

# The preprocessing does not depend on the KNN hyperparameters, so fit it once
# on the sweep-training rows and reuse the transformed matrices for every
# candidate instead of refitting it inside the loop (26 times).
# The validation rows are only transformed, never used for fitting.
X_tr_prep = preprocessor.fit_transform(X_tr, y_tr)
X_val_prep = preprocessor.transform(X_val)

# Manual sweep over K and weights; pick best on VAL macro-MSE
k_grid = [1, 3, 5, 7, 9, 11, 15, 21, 31, 41, 51, 61, 71]
weight_opts = ["uniform", "distance"]
results = []

for k in k_grid:
    for w in weight_opts:
        knn_try = KNeighborsRegressor(n_neighbors=k, weights=w, n_jobs=-1)
        knn_try.fit(X_tr_prep, y_tr)
        # Clip negatives to 0, exactly as is done for the final test predictions
        y_val_pred = np.clip(knn_try.predict(X_val_prep), 0, None)

        # "uniform_average" = unweighted mean of the per-target MSEs
        macro_mse = mean_squared_error(y_val, y_val_pred, multioutput="uniform_average")
        results.append((k, w, macro_mse))

# Pick best (lowest macro MSE); on ties min() keeps the first candidate tried
best_k, best_w, best_macro_mse = min(results, key=lambda t: t[2])

# NOTE(review): in the recorded run the winner is k=71, the largest value in
# k_grid, and the validation MSE is still decreasing there — consider extending
# the grid upwards before trusting this choice.
results_df = pd.DataFrame(results, columns=["k","weights","macro_MSE"])

print("KNN sweep (k, weights, macro_MSE on VAL):")
print(results_df.sort_values("macro_MSE"))
print(f"\nBest on validation: k={best_k}, weights='{best_w}'  (macro MSE={best_macro_mse:.6f})")


KNN sweep (k, weights, macro_MSE on VAL):
     k   weights  macro_MSE
24  71   uniform   0.881727
25  71  distance   0.882044
22  61   uniform   0.882139
23  61  distance   0.882625
20  51   uniform   0.888324
21  51  distance   0.888647
18  41   uniform   0.894067
19  41  distance   0.894187
16  31   uniform   0.900300
17  31  distance   0.900493
15  21  distance   0.916260
14  21   uniform   0.917055
12  15   uniform   0.919417
13  15  distance   0.919672
10  11   uniform   0.931295
11  11  distance   0.932637
8    9   uniform   0.946564
9    9  distance   0.947190
6    7   uniform   0.975748
7    7  distance   0.976369
4    5   uniform   1.017044
5    5  distance   1.018331
2    3   uniform   1.081914
3    3  distance   1.085731
1    1  distance   1.563333
0    1   uniform   1.563333

Best on validation: k=71, weights='uniform'  (macro MSE=0.881727)


In [32]:
# Echo the hyperparameters selected by the validation sweep (best_k / best_w).
print(f"Chosen KNN: n_neighbors={best_k}, weights='{best_w}'")


Chosen KNN: n_neighbors=71, weights='uniform'


In [33]:
# Final model: the shared preprocessing step followed by a KNN regressor that
# uses the hyperparameters picked on the validation split. The regressor is
# fitted directly on the multi-column target table; no separate multi-output
# wrapper is involved.
final_knn = KNeighborsRegressor(n_neighbors=best_k, weights=best_w, n_jobs=-1)
pipe = Pipeline(steps=[("prep", preprocessor), ("reg", final_knn)])

# Fit on the full training split, predict the held-out test rows, and floor
# any negative prediction at 0.
y_pred = np.clip(pipe.fit(X_train, y_train).predict(X_test), 0, None)

In [34]:
def _target_scores(name, truth, guess):
    """Return one metrics row (MSE, MAE, RMSE, R2) for a single target column."""
    mse = mean_squared_error(truth, guess)
    return {
        "target": name,
        "MSE": mse,
        "MAE": mean_absolute_error(truth, guess),
        "RMSE": np.sqrt(mse),
        "R2": r2_score(truth, guess),
    }


# Score every target column separately; column `col` of y_pred is paired with
# column `col` of y_test, following the column order of y.
targets = y.columns.tolist()
rows = [
    _target_scores(name, y_test.iloc[:, col].values, y_pred[:, col])
    for col, name in enumerate(targets)
]

metrics_df = pd.DataFrame(rows)

print("\n=== Per-target metrics ===")
print(metrics_df.round(4))


=== Per-target metrics ===
     target     MSE     MAE    RMSE      R2
0  team1_yc  1.4382  0.9856  1.1993  0.0218
1  team2_yc  1.5578  1.0124  1.2481  0.0019
2  team1_rc  0.0467  0.1032  0.2161 -0.0211
3  team2_rc  0.0787  0.1401  0.2806 -0.0195


In [35]:
# Quick peek at first 5 predictions for sanity check
print("\nSample predictions (first 5 rows, order:", targets, ")")
print(np.round(y_pred[:5], 3))


Sample predictions (first 5 rows, order: ['team1_yc', 'team2_yc', 'team1_rc', 'team2_rc'] )
[[1.606 1.634 0.056 0.028]
 [1.662 1.648 0.127 0.056]
 [1.408 1.521 0.07  0.07 ]
 [1.592 1.817 0.056 0.056]
 [1.408 1.986 0.07  0.07 ]]


In [36]:

# Weight each target by its share of the total test-set variance
# (population variance, ddof=0), so the weights sum to 1.
w = y_test.var(axis=0, ddof=0)
weights = w.div(w.sum())

print("\n=== Weights used (sum to 1) ===")
print(weights.round(4))

# Variance-weighted mean of each per-target metric column. Weights are matched
# to the rows of metrics_df by position; both follow the column order of y.
weight_vec = weights.values
weighted_mse, weighted_mae, weighted_rmse, weighted_r2 = (
    np.average(metrics_df[metric].values, weights=weight_vec)
    for metric in ("MSE", "MAE", "RMSE", "R2")
)

weighted_summary = pd.Series(
    [weighted_mse, weighted_mae, weighted_rmse, weighted_r2],
    index=["MSE", "MAE", "RMSE", "R2"],
)

print("\n=== Weighted average across targets ===")
print(weighted_summary.round(4))



=== Weights used (sum to 1) ===
team1_yc    0.4662
team2_yc    0.4948
team1_rc    0.0145
team2_rc    0.0245
dtype: float64

=== Weighted average across targets ===
MSE     1.4439
MAE     0.9654
RMSE    1.1867
R2      0.0103
dtype: float64


In [37]:
# True vs Predicted side-by-side (first 10 rows)
# Round the predictions to the nearest whole number, never below 0, so they
# can be read next to the integer-valued true targets.
y_pred_rounded = np.clip(np.rint(y_pred), 0, None).astype(int)

pred_columns = [f"{name}_pred" for name in targets]
y_pred_df = pd.DataFrame(y_pred_rounded, index=y_test.index, columns=pred_columns)

# Drop both indexes so the two frames line up row by row when concatenated
truth_part = y_test.reset_index(drop=True)
pred_part = y_pred_df.reset_index(drop=True)
comparison = pd.concat([truth_part, pred_part], axis=1)

print("\n=== True vs Predicted (first 10 rows) ===")
print(comparison.head(10).round(2))



=== True vs Predicted (first 10 rows) ===
   team1_yc  team2_yc  team1_rc  team2_rc  team1_yc_pred  team2_yc_pred  \
0         6         2         0         0              2              2   
1         0         0         0         0              2              2   
2         2         2         0         0              1              2   
3         1         1         0         0              2              2   
4         3         1         0         0              1              2   
5         0         1         0         0              1              2   
6         2         5         0         0              2              2   
7         2         1         0         0              2              2   
8         0         0         0         0              1              1   
9         2         5         0         0              1              2   

   team1_rc_pred  team2_rc_pred  
0              0              0  
1              0              0  
2              0             