In [1]:
import pandas as pd
import numpy as np
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform
import os
import joblib



In [2]:
def find_backend_dir(start_path=None):
    """
    Walk up directories from start_path (or cwd) until a folder named 'backend' is found.
    Returns the absolute path to the 'backend' folder.
    """
    if start_path is None:
        start_path = os.getcwd()
    curr_path = os.path.abspath(start_path)
    while True:
        # Check if 'backend' exists in this directory
        candidate = os.path.join(curr_path, "backend")
        if os.path.isdir(candidate):
            return candidate
        # If at filesystem root, stop
        parent = os.path.dirname(curr_path)
        if curr_path == parent:
            break
        curr_path = parent
    raise FileNotFoundError(f"No 'backend' directory found upward from {start_path}")

# Find the backend directory and CSV folder
backend_dir = find_backend_dir()
csv_dir = os.path.join(backend_dir, "CSVs")
models_dir = os.path.join(backend_dir, "Models")


In [3]:
train = pd.read_csv(csv_dir+"/train_season_features.csv")
test  = pd.read_csv(csv_dir+"/test_season_features.csv")

In [4]:
target_cols   = [c for c in train.columns if c.startswith("next_")]
feature_cols  = [c for c in train.columns if c not in ["PERSON_ID","SEASON_ID"] + target_cols]

In [5]:
X_train, X_val, y_train, y_val = train_test_split(
    train[feature_cols], train[target_cols], test_size=0.2, random_state=42
)

In [6]:
X_test, y_test = test[feature_cols], test[target_cols]

In [7]:
# Replace all inf/-inf with NaN for ALL splits
for df_ in [X_train, X_val, y_train, y_val, X_test, y_test]:
    df_.replace([np.inf, -np.inf], np.nan, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_.replace([np.inf, -np.inf], np.nan, inplace=True)


In [8]:
imputer = SimpleImputer(strategy='median')

In [9]:
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_val_imputed = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns, index=X_val.index)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

y_train_imputed = pd.DataFrame(imputer.fit_transform(y_train), columns=y_train.columns, index=y_train.index)
y_val_imputed = pd.DataFrame(imputer.transform(y_val), columns=y_val.columns, index=y_val.index)
y_test_imputed = pd.DataFrame(imputer.transform(y_test), columns=y_test.columns, index=y_test.index)


In [10]:
ridge = MultiOutputRegressor(Ridge(alpha=1.0, random_state=42))
ridge.fit(X_train_imputed, y_train_imputed)
ridge_pred_val = ridge.predict(X_val_imputed)
print("Ridge Validation MAE:", round(mean_absolute_error(y_val_imputed, ridge_pred_val),3),
      "R2:", round(r2_score(y_val_imputed, ridge_pred_val),3))


Ridge Validation MAE: 1.927 R2: 0.692


In [11]:
enet = MultiOutputRegressor(ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42, max_iter=10000))
enet.fit(X_train_imputed, y_train_imputed)
enet_pred_val = enet.predict(X_val_imputed)
print("ElasticNet Validation MAE:", round(mean_absolute_error(y_val_imputed, enet_pred_val),3),
      "R2:", round(r2_score(y_val_imputed, enet_pred_val),3))

ElasticNet Validation MAE: 2.016 R2: 0.605


In [12]:
ridge_pred_test = ridge.predict(X_test_imputed)
print("Ridge Test MAE:", round(mean_absolute_error(y_test_imputed, ridge_pred_test),3),
      "R2:", round(r2_score(y_test_imputed, ridge_pred_test),3))

Ridge Test MAE: 2.027 R2: 0.66


In [13]:
enet_pred_test = enet.predict(X_test_imputed)
print("ElasticNet Test MAE:", round(mean_absolute_error(y_test_imputed, enet_pred_test),3),
      "R2:", round(r2_score(y_test_imputed, enet_pred_test),3))        

ElasticNet Test MAE: 2.087 R2: 0.572


In [14]:
param_dist = {"estimator__alpha": loguniform(1e-3, 1e3)}
search = RandomizedSearchCV(
    MultiOutputRegressor(Ridge(random_state=42)),
    param_dist, n_iter=30, cv=3, scoring="neg_mean_absolute_error", n_jobs=-1, random_state=42
)
search.fit(X_train_imputed, y_train_imputed)
print("Best alpha:", search.best_params_)
best_ridge = search.best_estimator_


Best alpha: {'estimator__alpha': np.float64(98.77700294007911)}


In [15]:
# Rebuild Ridge with best α
best_ridge = MultiOutputRegressor(Ridge(alpha=98.77700294007911, random_state=42))
best_ridge.fit(X_train_imputed, y_train_imputed)

0,1,2
,estimator,Ridge(alpha=9...ndom_state=42)
,n_jobs,

0,1,2
,alpha,98.77700294007911
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [16]:
# Validation performance
val_pred = best_ridge.predict(X_val_imputed)
print("Tuned Ridge Val MAE:", round(mean_absolute_error(y_val_imputed, val_pred),3),
      "R2:", round(r2_score(y_val_imputed, val_pred),3))


Tuned Ridge Val MAE: 1.934 R2: 0.688


In [17]:

# Test performance
test_pred = best_ridge.predict(X_test_imputed)
print("Tuned Ridge Test MAE:", round(mean_absolute_error(y_test_imputed, test_pred),3),
      "R2:", round(r2_score(y_test_imputed, test_pred),3))

Tuned Ridge Test MAE: 2.026 R2: 0.657


In [18]:
# Extract coefficient matrix: shape (n_stats, n_features)
coefs = np.vstack([est.coef_ for est in best_ridge.estimators_])
# Wrap in DataFrame
coef_df = pd.DataFrame(coefs.T, index=feature_cols, columns=target_cols)

# Show top positive/negative drivers for each target
for target in target_cols:
    print(f"\nTop +ve coefficients for {target}:")
    print(coef_df[target].sort_values(ascending=False).head(10))
    print(f"Top -ve coefficients for {target}:")
    print(coef_df[target].sort_values().head(10))


Top +ve coefficients for next_Points:
IS_WIN              0.597786
GAME_EFFICIENCY     0.382867
DREB                0.300442
Points_10G_AVG      0.278911
Points              0.274020
SEASON_Autumn       0.228646
FGA                 0.217500
EXPERIENCE_YEARS    0.209826
Minutes_10G_AVG     0.192692
TO_10G_AVG          0.188936
Name: next_Points, dtype: float64
Top -ve coefficients for next_Points:
PF                        -0.349065
REB_10G_AVG               -0.267286
AGE                       -0.230859
OREB                      -0.224842
3PA                       -0.222320
Minutes                   -0.202214
GAME_EFFICIENCY_10G_AVG   -0.166952
BMI                       -0.156837
GAME_EFFICIENCY_5G_AVG    -0.143642
AST_5G_AVG                -0.130776
Name: next_Points, dtype: float64

Top +ve coefficients for next_FTM:
FTM                 0.368246
FTA                 0.262048
IS_WIN              0.100056
Points_10G_AVG      0.072979
GAME_EFFICIENCY     0.069585
DREB                0.06

In [20]:
joblib.dump(imputer, models_dir+"/median_imputer.joblib")
joblib.dump(best_ridge, models_dir+"/ridge_regression_season_model.joblib")

['/Users/jeevanparmar/Uni/MSE 436/Project-Mono-Repo/backend/Models/ridge_regression_season_model.joblib']