In [1]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, r2_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from nba_api.stats.endpoints import leagueleaders  # Import leagueleaders

In [2]:
# Features relevant to Most Improved Player (MIP)
FEATURES = ['PTS_DIFF', 'REB_DIFF', 'AST_DIFF', 'MIN_DIFF', 'EFF_DIFF']

# Historical MIP winners for validation
actual_mip_winners = {
    '2022-23': 'Lauri Markkanen',
    '2021-22': 'Ja Morant',
    '2020-21': 'Julius Randle',
    '2019-20': 'Brandon Ingram',
    '2018-19': 'Pascal Siakam',
    '2017-18': 'Victor Oladipo',
    '2016-17': 'Giannis Antetokounmpo',
    '2015-16': 'CJ McCollum',
    '2014-15': 'Jimmy Butler'
}

def get_improvement_data(prev_season, curr_season):
    prev_stats = leagueleaders.LeagueLeaders(
        season=prev_season, stat_category_abbreviation='PTS', per_mode48='PerGame'
    ).get_data_frames()[0]

    curr_stats = leagueleaders.LeagueLeaders(
        season=curr_season, stat_category_abbreviation='PTS', per_mode48='PerGame'
    ).get_data_frames()[0]

    merged = pd.merge(
        prev_stats[['PLAYER', 'PTS', 'REB', 'AST', 'MIN', 'EFF']],
        curr_stats[['PLAYER', 'PTS', 'REB', 'AST', 'MIN', 'EFF']],
        on='PLAYER',
        suffixes=('_PREV', '_CURR')
    )

    for stat in ['PTS', 'REB', 'AST', 'MIN', 'EFF']:
        merged[f'{stat}_DIFF'] = merged[f'{stat}_CURR'] - merged[f'{stat}_PREV']

    merged = merged[~merged['PLAYER'].str.contains("Starter|All-Star", case=False, na=False)]

    curr_year = curr_season.split('-')[0]
    merged['ACTUAL_MIP'] = merged['PLAYER'] == actual_mip_winners.get(curr_year, '')

    return merged

In [None]:

def prepare_mip_data(start_year, end_year):
    all_seasons_data = []
    for year in range(start_year, end_year):
        try:
            prev_season = f"{year}-{str(year+1)[-2:]}"
            curr_season = f"{year+1}-{str(year+2)[-2:]}"
            season_data = get_improvement_data(prev_season, curr_season)
            season_data['SEASON'] = curr_season
            all_seasons_data.append(season_data)
        except Exception as e:
            print(f"Error fetching data for {prev_season} to {curr_season}: {e}")
    return pd.concat(all_seasons_data)




In [4]:
def train_dummy_model(data):
    print("\n--- Dummy Regressor for Most Improved Player ---")

    X = data[FEATURES]
    y = data['ACTUAL_MIP'].astype(int)  

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    dummy_model = DummyRegressor(strategy='mean')
    dummy_model.fit(X_train, y_train)

    y_pred = dummy_model.predict(X_test)

    print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.3f}")
    print(f"R2 Score: {r2_score(y_test, y_pred):.3f}")
    print("\nClassification Report:")
    print(classification_report(y_test, (y_pred > 0.5).astype(int)))

    if '2022-23' in data['SEASON'].values:
        mip_2023_data = data[data['SEASON'] == '2022-23']
        mip_2023_data['PREDICTED_SCORE'] = dummy_model.predict(scaler.transform(mip_2023_data[FEATURES]))
        top_candidates = mip_2023_data.nlargest(5, 'PREDICTED_SCORE')
        print("\nTop 5 Predicted Most Improved Player Candidates for 2023:")
        print(top_candidates[['PLAYER', 'PREDICTED_SCORE']])

    return dummy_model

if __name__ == "__main__":
    mip_data = prepare_mip_data(2014, 2022)
    print(f"Prepared data with {len(mip_data)} rows.")

    dummy_model = train_dummy_model(mip_data)

Prepared data with 1345 rows.

--- Dummy Regressor for Most Improved Player ---
Mean Absolute Error: 0.000
R2 Score: 1.000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       269

    accuracy                           1.00       269
   macro avg       1.00      1.00      1.00       269
weighted avg       1.00      1.00      1.00       269


Top 5 Predicted Most Improved Player Candidates for 2023:
                  PLAYER  PREDICTED_SCORE
0            Joel Embiid              0.0
1  Giannis Antetokounmpo              0.0
2            Luka Dončić              0.0
3             Trae Young              0.0
4          DeMar DeRozan              0.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mip_2023_data['PREDICTED_SCORE'] = dummy_model.predict(scaler.transform(mip_2023_data[FEATURES]))


Random Forest Attempt (Incomplete)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from nba_api.stats.endpoints import leagueleaders 

FEATURES = ['PTS_DIFF', 'REB_DIFF', 'AST_DIFF', 'MIN_DIFF', 'EFF_DIFF']

actual_mip_winners = {
    '2022-23': 'Lauri Markkanen',
    '2021-22': 'Ja Morant',
    '2020-21': 'Julius Randle',
    '2019-20': 'Brandon Ingram',
    '2018-19': 'Pascal Siakam',
    '2017-18': 'Victor Oladipo',
    '2016-17': 'Giannis Antetokounmpo',
    '2015-16': 'CJ McCollum',
    '2014-15': 'Jimmy Butler'
}

def get_improvement_data(prev_season, curr_season):
    prev_stats = leagueleaders.LeagueLeaders(
        season=prev_season, stat_category_abbreviation='PTS', per_mode48='PerGame'
    ).get_data_frames()[0]

    curr_stats = leagueleaders.LeagueLeaders(
        season=curr_season, stat_category_abbreviation='PTS', per_mode48='PerGame'
    ).get_data_frames()[0]

    merged = pd.merge(
        prev_stats[['PLAYER', 'PTS', 'REB', 'AST', 'MIN', 'EFF']],
        curr_stats[['PLAYER', 'PTS', 'REB', 'AST', 'MIN', 'EFF']],
        on='PLAYER',
        suffixes=('_PREV', '_CURR')
    )

    for stat in ['PTS', 'REB', 'AST', 'MIN', 'EFF']:
        merged[f'{stat}_DIFF'] = merged[f'{stat}_CURR'] - merged[f'{stat}_PREV']

    merged = merged[~merged['PLAYER'].str.contains("Starter|All-Star", case=False, na=False)]

    curr_year = curr_season.split('-')[0]
    merged['ACTUAL_MIP'] = merged['PLAYER'] == actual_mip_winners.get(curr_year, '')

    return merged

In [None]:
def prepare_mip_data(start_year, end_year):
    all_seasons_data = []
    for year in range(start_year, end_year):
        try:
            prev_season = f"{year}-{str(year+1)[-2:]}"
            curr_season = f"{year+1}-{str(year+2)[-2:]}"
            season_data = get_improvement_data(prev_season, curr_season)
            season_data['SEASON'] = curr_season
            all_seasons_data.append(season_data)
        except Exception as e:
            print(f"Error fetching data for {prev_season} to {curr_season}: {e}")
    return pd.concat(all_seasons_data)


In [9]:




def train_random_forest_model(data):
    print("\n--- Random Forest for Most Improved Player ---")

    X = data[FEATURES]
    y = data['ACTUAL_MIP'].astype(int)  

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    print("\nClass Distribution:")
    print(y.value_counts())

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    if len(np.unique(y_train)) < 2:
        print("Warning: Only one class present in training data. Random Forest requires both classes to train properly.")
        return None

    rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    rf_model.fit(X_train, y_train)

    y_pred = rf_model.predict(X_test)
    y_proba = rf_model.predict_proba(X_test)[:, 1]

  




Prepared data with 1345 rows.

--- Random Forest for Most Improved Player ---

Class Distribution:
ACTUAL_MIP
0    1345
Name: count, dtype: int64
