In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from nba_api.stats.endpoints import leagueleaders
import numpy as np
from sklearn.svm import SVC





### Gradient Boosting ### 


In [60]:
FEATURES = ['BLK', 'STL', 'DREB', 'GP', 'MIN', 
           'BLK_PER_36', 'STL_PER_36', 'DREB_PER_36', 'DEF_SCORE']

actual_nominees = {
    '2023-24': ['Rudy Gobert', 'Bam Adebayo', 'Victor Wembanyama'],
    '2022-23': ['Jaren Jackson Jr.', 'Brook Lopez', 'Evan Mobley'],
    '2021-22': ['Marcus Smart', 'Mikal Bridges', 'Rudy Gobert'],
    '2020-21': ['Rudy Gobert', 'Ben Simmons', 'Draymond Green'],
    '2019-20': ['Giannis Antetokounmpo', 'Anthony Davis', 'Rudy Gobert'],
    '2018-19': ['Rudy Gobert', 'Paul George', 'Giannis Antetokounmpo'],
    '2017-18': ['Rudy Gobert', 'Joel Embiid', 'Anthony Davis'],
    '2016-17': ['Draymond Green', 'Rudy Gobert', 'Kawhi Leonard'],
    '2015-16': ['Kawhi Leonard', 'Draymond Green', 'Hassan Whiteside'],
    '2014-15': ['Kawhi Leonard', 'Draymond Green', 'DeAndre Jordan']
}

def get_season_data(season):
    blocks = leagueleaders.LeagueLeaders(
        season=season,
        stat_category_abbreviation='BLK',
        per_mode48='PerGame'
    ).get_data_frames()[0]
    
    steals = leagueleaders.LeagueLeaders(
        season=season,
        stat_category_abbreviation='STL',
        per_mode48='PerGame'
    ).get_data_frames()[0]
    
    def_reb = leagueleaders.LeagueLeaders(
        season=season,
        stat_category_abbreviation='DREB',
        per_mode48='PerGame'
    ).get_data_frames()[0]
    
    blocks = blocks[['PLAYER', 'TEAM', 'GP', 'MIN', 'BLK', 'DREB', 'STL']].head(50)
    steals = steals[['PLAYER', 'TEAM', 'GP', 'MIN', 'BLK', 'DREB', 'STL']].head(50)
    def_reb = def_reb[['PLAYER', 'TEAM', 'GP', 'MIN', 'BLK', 'DREB', 'STL']].head(50)
    
    all_players = pd.concat([blocks, steals, def_reb]).drop_duplicates(subset=['PLAYER'])
    
    for stat in ['BLK', 'STL', 'DREB']:
        all_players[f'{stat}_PER_36'] = all_players[stat] * 36 / all_players['MIN']
    
    all_players['DEF_SCORE'] = (
        all_players['BLK'] * 2 + 
        all_players['STL'] * 2 + 
        all_players['DREB'] * 0.5
    ) * (all_players['GP'] / 82)
    
    all_players['ACTUAL_NOMINEE'] = all_players['PLAYER'].isin(actual_nominees.get(season, []))
    
    return all_players


In [61]:
def prepare_full_data():
    all_seasons_data = []
    for season in actual_nominees.keys():
        season_data = get_season_data(season)
        season_data['SEASON'] = season
        all_seasons_data.append(season_data)
    return pd.concat(all_seasons_data)

In [62]:


def evaluate_model(full_data):
    print("\n--- Model Evaluation: Gradient Boosting ---")
    
    X = full_data[FEATURES]
    y = full_data['ACTUAL_NOMINEE']
    
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    print("\nGradient Boosting Classifier:")
    gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
    gb_model.fit(X_train, y_train)
    y_pred_gb = gb_model.predict(X_test)
    y_proba_gb = gb_model.predict_proba(X_test)[:, 1]
    
    print("Classification Report (Gradient Boosting):")
    print(classification_report(y_test, y_pred_gb))
    print(f"ROC-AUC Score (Gradient Boosting): {roc_auc_score(y_test, y_proba_gb):.3f}")
    
    feature_importance = pd.DataFrame({
        'Feature': FEATURES,
        'Importance': gb_model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    print("\nFeature Importance (Gradient Boosting):")
    print(feature_importance)
    
    return gb_model

if __name__ == "__main__":
    full_data = prepare_full_data()
    print(f"Prepared data with {len(full_data)} rows.")
    
    gb_model = evaluate_model(full_data)

Prepared data with 1066 rows.

--- Model Evaluation: Gradient Boosting ---

Gradient Boosting Classifier:
Classification Report (Gradient Boosting):
              precision    recall  f1-score   support

       False       0.96      0.98      0.97       206
        True       0.00      0.00      0.00         8

    accuracy                           0.94       214
   macro avg       0.48      0.49      0.49       214
weighted avg       0.93      0.94      0.93       214

ROC-AUC Score (Gradient Boosting): 0.879

Feature Importance (Gradient Boosting):
       Feature  Importance
8    DEF_SCORE    0.338538
0          BLK    0.116992
3           GP    0.102354
6   STL_PER_36    0.097002
7  DREB_PER_36    0.080040
1          STL    0.079829
4          MIN    0.066466
5   BLK_PER_36    0.062675
2         DREB    0.056103


### Gradient Boosting Era DPOY Predictions ###


In [63]:
FEATURES = ['BLK', 'STL', 'DREB', 'GP', 'MIN', 
           'BLK_PER_36', 'STL_PER_36', 'DREB_PER_36', 'DEF_SCORE']

def get_season_data(season):
    blocks = leagueleaders.LeagueLeaders(
        season=season,
        stat_category_abbreviation='BLK',
        per_mode48='PerGame'
    ).get_data_frames()[0]
    
    steals = leagueleaders.LeagueLeaders(
        season=season,
        stat_category_abbreviation='STL',
        per_mode48='PerGame'
    ).get_data_frames()[0]
    
    def_reb = leagueleaders.LeagueLeaders(
        season=season,
        stat_category_abbreviation='DREB',
        per_mode48='PerGame'
    ).get_data_frames()[0]
    
    blocks = blocks[['PLAYER', 'TEAM', 'GP', 'MIN', 'BLK', 'DREB', 'STL']].head(50)
    steals = steals[['PLAYER', 'TEAM', 'GP', 'MIN', 'BLK', 'DREB', 'STL']].head(50)
    def_reb = def_reb[['PLAYER', 'TEAM', 'GP', 'MIN', 'BLK', 'DREB', 'STL']].head(50)
    
    all_players = pd.concat([blocks, steals, def_reb]).drop_duplicates(subset=['PLAYER'])
    
    for stat in ['BLK', 'STL', 'DREB']:
        all_players[f'{stat}_PER_36'] = all_players[stat] * 36 / all_players['MIN']
    
    all_players['DEF_SCORE'] = (
        all_players['BLK'] * 2 + 
        all_players['STL'] * 2 + 
        all_players['DREB'] * 0.5
    ) * (all_players['GP'] / 82)
    
    return all_players

In [64]:
def prepare_interval_data(start_year, end_year):
    all_seasons_data = []
    for year in range(start_year, end_year + 1):
        try:
            season = f"{year}-{str(year+1)[-2:]}"
            season_data = get_season_data(season)
            season_data['SEASON'] = season
            all_seasons_data.append(season_data)
        except Exception as e:
            print(f"Error fetching data for season {season}: {e}")
    return pd.concat(all_seasons_data)

In [65]:

def train_and_predict_for_interval(interval_name, data_interval, data_2023_24):
    print(f"\n{interval_name} Data ---")
    
    X = data_interval[FEATURES]
    y = (data_interval['DEF_SCORE'] >= data_interval['DEF_SCORE'].quantile(0.95)).astype(int) 
    
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    model = GradientBoostingClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)
    
    X_2023_24 = scaler.transform(data_2023_24[FEATURES])
    data_2023_24['PREDICTED_PROBABILITY'] = model.predict_proba(X_2023_24)[:, 1]
    
    top_5_candidates = data_2023_24.nlargest(5, 'PREDICTED_PROBABILITY')
    print(f"Top 5 Predicted DPOY Candidates for 2023-24 Season ({interval_name} Data):")
    print(top_5_candidates[['PLAYER', 'TEAM', 'PREDICTED_PROBABILITY', 'DEF_SCORE']])
    
    return top_5_candidates

if __name__ == "__main__":
    data_1960_1980 = prepare_interval_data(1960, 1980)
    data_1980_2000 = prepare_interval_data(1980, 2000)
    data_2000_2020 = prepare_interval_data(2000, 2020)
    
    data_2023_24 = prepare_interval_data(2023, 2024)
    
    top_5_candidates_1960_1980 = train_and_predict_for_interval("1960-1980", data_1960_1980, data_2023_24)
    top_5_candidates_1980_2000 = train_and_predict_for_interval("1980-2000", data_1980_2000, data_2023_24)
    top_5_candidates_2000_2020 = train_and_predict_for_interval("2000-2020", data_2000_2020, data_2023_24)







  return pd.concat(all_seasons_data)



1960-1980 Data ---
Top 5 Predicted DPOY Candidates for 2023-24 Season (1960-1980 Data):
              PLAYER TEAM  PREDICTED_PROBABILITY  DEF_SCORE
0  Victor Wembanyama  SAS               0.999966  11.948780
3      Anthony Davis  LAL               0.999966  10.890244
1     Walker Kessler  UTA               0.000002   6.439024
2        Brook Lopez  MIL               0.000002   7.370122
4      Chet Holmgren  OKC               0.000002   8.950000

1980-2000 Data ---
Top 5 Predicted DPOY Candidates for 2023-24 Season (1980-2000 Data):
              PLAYER TEAM  PREDICTED_PROBABILITY  DEF_SCORE
0  Victor Wembanyama  SAS               0.999966  11.948780
3      Anthony Davis  LAL               0.999966  10.890244
1     Walker Kessler  UTA               0.000002   6.439024
2        Brook Lopez  MIL               0.000002   7.370122
4      Chet Holmgren  OKC               0.000002   8.950000

2000-2020 Data ---
Top 5 Predicted DPOY Candidates for 2023-24 Season (2000-2020 Data):
             

In [67]:
def train_and_predict_for_interval(interval_name, data_interval, data_2023_24):
    print(f"\n{interval_name} Data ---")
    
    X = data_interval[FEATURES]
    y = (data_interval['DEF_SCORE'] >= data_interval['DEF_SCORE'].quantile(0.95)).astype(int) 
    
    scaler = StandardScaler()
    X = scaler.fit_transform(X)




#Balanced to ensure for amy imblance within dataset
    lr_model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
    lr_model.fit(X, y)
    


#Normalize features so that each feature has mean of 0 and std of 1. 
    X_2023_24 = scaler.transform(data_2023_24[FEATURES])
    data_2023_24['LR_PROBABILITY'] = lr_model.predict_proba(X_2023_24)[:, 1]
    




    top_5_lr = data_2023_24.nlargest(5, 'LR_PROBABILITY')
    print(f"\nTop 5 Predicted DPOY Candidates for 2023-24 Season ({interval_name} - Logistic Regression):")
    print(top_5_lr[['PLAYER', 'TEAM', 'LR_PROBABILITY', 'DEF_SCORE']])
    
    return top_5_lr





if __name__ == "__main__":
    data_1960_1980 = prepare_interval_data(1960, 1980)
    data_1980_2000 = prepare_interval_data(1980, 2000)
    data_2000_2020 = prepare_interval_data(2000, 2020)
    
    data_2023_24 = prepare_interval_data(2023, 2024)
    
    print("\n--- Predictions for 1960-1980 Era ---")
    top_5_lr_1960_1980 = train_and_predict_for_interval("1960-1980", data_1960_1980, data_2023_24)
    
    print("\n--- Predictions for 1980-2000 Era ---")
    top_5_lr_1980_2000 = train_and_predict_for_interval("1980-2000", data_1980_2000, data_2023_24)
    
    print("\n--- Predictions for 2000-2020 Era ---")
    top_5_lr_2000_2020 = train_and_predict_for_interval("2000-2020", data_2000_2020, data_2023_24)


  return pd.concat(all_seasons_data)



--- Predictions for 1960-1980 Era ---

1960-1980 Data ---

Top 5 Predicted DPOY Candidates for 2023-24 Season (1960-1980 - Logistic Regression):
               PLAYER TEAM  LR_PROBABILITY  DEF_SCORE
0   Victor Wembanyama  SAS        0.999887  11.948780
3       Anthony Davis  LAL        0.977476  10.890244
5         Rudy Gobert  MIN        0.365491   9.453659
4       Chet Holmgren  OKC        0.106773   8.950000
37       Nikola Jokić  DEN        0.031264   9.007927

--- Predictions for 1980-2000 Era ---

1980-2000 Data ---

Top 5 Predicted DPOY Candidates for 2023-24 Season (1980-2000 - Logistic Regression):
               PLAYER TEAM  LR_PROBABILITY  DEF_SCORE
0   Victor Wembanyama  SAS        1.000000  11.948780
3       Anthony Davis  LAL        0.999865  10.890244
5         Rudy Gobert  MIN        0.848058   9.453659
37       Nikola Jokić  DEN        0.565917   9.007927
4       Chet Holmgren  OKC        0.214815   8.950000

--- Predictions for 2000-2020 Era ---

2000-2020 Data ---



### Neural Networks (Incomplete) ###

In [3]:

FEATURES = ['BLK', 'STL', 'DREB', 'GP', 'MIN', 
            'BLK_PER_36', 'STL_PER_36', 'DREB_PER_36', 'DEF_SCORE']

def prepare_data(data):
    X = data[FEATURES]
    y = (data['DEF_SCORE'] >= data['DEF_SCORE'].quantile(0.95)).astype(int)  # Top 5% as DPOY candidates

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

   
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, scaler

def train_and_evaluate_svm(data):
    X_train, X_test, y_train, y_test, scaler = prepare_data(data)

    svm_model = SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42)
    svm_model.fit(X_train, y_train)

    y_pred = svm_model.predict(X_test)
    y_proba = svm_model.predict_proba(X_test)[:, 1] 

    
