In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [9]:
# 1) Read the merged all-seasons NBA CSV into a single frame
df = pd.read_csv('/content/sample_data/merged_nba_data_all_seasons.csv')

# 2) Partition rows on the `season` column:
#    - seasons up to and including 2024 are used for modelling
#    - season 2025 rows are held back and only scored at the end
train_df = df.loc[df['season'] <= 2024]
pred_df  = df.loc[df['season'] == 2025]

# 3) Build the feature matrix / target from train_df and carve out a test split

# Seven named columns followed by the nine per-player VORP columns
# (player1vorp ... player9vorp), in that order.
feature_cols = [
    'seed',
    'win_pct',
    'off_rtg',
    'def_rtg',
    'net_rtg',
    'srs',
    'total_playoff_games',
] + [f'player{slot}vorp' for slot in range(1, 10)]

X = train_df[feature_cols]
# Rows with a missing target are treated as a score of 0.
y = train_df['champ_scr'].fillna(0)

# Hold out 20% of the historical rows, stratified by season.
# NOTE(review): this is a shuffled split inside each season, not a
# temporal hold-out, so train and test share seasons.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=train_df['season']
)

# 4) Standardise features; the scaler is fitted on the training split only
#    and then reused unchanged for the test split (and later for 2025 rows).
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

# 100-tree random forest with a fixed seed so refits are repeatable.
reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_s, y_train)

# 5) Report error on the held-out historical rows
y_test_pred = reg.predict(X_test_s)
mse  = mean_squared_error(y_test, y_test_pred)
rmse = mse ** 0.5          # root of the mean squared error
r2   = r2_score(y_test, y_test_pred)
print(f"Historical test RMSE: {rmse:.3f}, R²: {r2:.3f}")

# 6) Score the 2025 rows with the already-fitted scaler and forest
X_2025      = pred_df[feature_cols]
X_2025_s    = scaler.transform(X_2025)
pred_scores = reg.predict(X_2025_s)

# Convert raw predicted scores into each team's share of the total, in percent.
# NOTE(review): if every predicted score is 0 the division is 0/0 and the
# whole column becomes NaN -- confirm that cannot happen for this target.
total = pred_scores.sum()
pred_pct = pred_scores / total * 100

# pred_df was obtained by boolean-filtering df, so adding a column to it in
# place (pred_df[...] = ...) triggers pandas' SettingWithCopyWarning.
# `assign` returns a new, independent frame with the extra column; rebinding
# pred_df to it keeps the name usable by later cells without the warning and
# makes re-running this cell idempotent.
pred_df = pred_df.assign(**{'champion_likelihood_%': pred_pct})

# 7) Rank teams by predicted likelihood (highest first) and display
pred_df.sort_values('champion_likelihood_%', ascending=False)[
    ['team', 'champion_likelihood_%']
]

Historical test RMSE: 0.140, R²: 0.712


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_df['champion_likelihood_%'] = pred_pct


Unnamed: 0,team,champion_likelihood_%
331,Boston Celtics,12.108041
350,Oklahoma City Thunder,10.462589
335,Cleveland Cavaliers,9.096554
340,Houston Rockets,7.978889
343,Los Angeles Lakers,7.326917
342,Los Angeles Clippers,7.202732
337,Denver Nuggets,6.426576
346,Milwaukee Bucks,5.898789
341,Indiana Pacers,5.867743
349,New York Knicks,5.55728
