In [6]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

dfs = {}

# Define the columns to normalize
cols_to_normalize = ['G', 'PTS/36', 'T_PTS', 'TRB/36', 'T_TRB', 'AST/36', 'T_AST', 'STL/36', 'T_STL', 'BLK/36', 'T_BLK', 'TOV/36', 'T_TOV', 'TS%', 'PER', 'WS/36', 'T_WS', 'BPM', 'VORP', 'On_Off', 'TWR']

for year in range(2013, 2023):
    file_name = f"mvp{year}.csv"
    df_name = f"df_{year}"
    dfs[df_name] = pd.read_csv(file_name)

    # Read in the corresponding totals file 
    totals_file_name = f"mvp{year}totals.csv"
    totals_df = pd.read_csv(totals_file_name)
    
    # Merge the two dataframes on the "Player" column
    dfs[df_name] = pd.merge(dfs[df_name], totals_df, on="Player")
    
    # Create a MinMaxScaler object
    scaler = MinMaxScaler()
    
    # Normalize the selected columns
    dfs[df_name][cols_to_normalize] = scaler.fit_transform(dfs[df_name][cols_to_normalize])
    dfs[df_name][cols_to_normalize] = dfs[df_name][cols_to_normalize].apply(lambda x: round(x, 2))
    
# Read in the CSV file 
mvp_candidates = pd.read_csv("mvp_candidates.csv")

# Create a MinMaxScaler object
scaler = MinMaxScaler()

# Normalize the selected columns
mvp_candidates[cols_to_normalize] = scaler.fit_transform(mvp_candidates[cols_to_normalize])
mvp_candidates[cols_to_normalize] = mvp_candidates[cols_to_normalize].apply(lambda x: round(x, 2))

# Concatenate all the previous years' dataframes
previous_years = pd.concat(dfs.values())
print(previous_years)

# Define the list of features and their names
features_dict = { 'All': ['G', 'PTS/36', 'T_PTS', 'TRB/36', 'T_TRB', 'AST/36', 'T_AST', 'STL/36', 'T_STL', 'BLK/36', 'T_BLK', 'TOV/36', 'T_TOV', 'TS%', 'PER', 'WS/36', 'T_WS', 'BPM', 'VORP', 'On_Off', 'TWR'],
                 'All_without_G': ['PTS/36', 'T_PTS', 'TRB/36', 'T_TRB', 'AST/36', 'T_AST', 'STL/36', 'T_STL', 'BLK/36', 'T_BLK', 'TOV/36', 'T_TOV', 'TS%', 'PER', 'WS/36', 'T_WS', 'BPM', 'VORP', 'On_Off', 'TWR'],
                 'Per36_with_AS': ['G', 'PTS/36', 'TRB/36', 'AST/36', 'STL/36', 'BLK/36', 'TOV/36', 'TS%', 'PER', 'WS/36', 'BPM', 'VORP', 'On_Off', 'TWR'],
                 'Totals_with_AS': ['T_PTS', 'T_TRB', 'T_AST', 'T_STL', 'T_BLK', 'T_TOV', 'TS%', 'PER', 'T_WS', 'BPM', 'VORP', 'On_Off', 'TWR'],
                 'Per36': ['G', 'PTS/36', 'TRB/36', 'AST/36', 'STL/36', 'BLK/36', 'TOV/36', 'TS%', 'TWR'],
                 'AS': ['TS%', 'PER', 'T_WS', 'BPM', 'VORP', 'On_Off', 'TWR'] } 

# Define the list of models to run
# Define the list of models
models_dict = { 'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42),
               'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42), 
               'XGBoost': XGBRegressor(n_estimators=100, max_depth=5, random_state=42), 
               'Linear Regression': LinearRegression(), 
               'Lasso': Lasso(alpha=0.01, max_iter=10000, random_state=42), 
               'Ridge Regression': Ridge(alpha=0.1, random_state=42), 
               'Elastic Net': ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42), 
               'k-NN': KNeighborsRegressor(n_neighbors=5), 
               'SVR': SVR(kernel='rbf', C=1, gamma='scale') } 

# Define an empty dataframe to store the predictions
pred_df = pd.DataFrame({'Player': mvp_candidates['Player']})

# Train and predict using each model
for name, model in models_dict.items(): 
    for features_name, features in features_dict.items(): 
        # Split the data into training and testing sets 
        X_train, X_test, y_train, y_test = train_test_split(previous_years[features], previous_years["VPS"], test_size=0.2, random_state=42)
        
        # Train the model 
        model.fit(X_train, y_train)
        
        # Make predictions on the mvp_candidates dataframe
        predictions = model.predict(mvp_candidates[features])
        pred_df[f"{name}_{features_name}"] = predictions.round(3) 
        
        # Compute the mean squared error and R-squared 
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        # Print the performance metrics
        print(f"Performance with {name} and {features_name}:")
        print(f"MSE: {mse:.3f}")
        print(f"R-squared: {r2:.3f}")
    print()
        
# Sort the candidates by predicted voting score for each list of features and each model
for name in models_dict.keys():
    for features_name in features_dict.keys():
        pred_df[f'{name}_{features_name}_rank'] = pred_df[f'{name}_{features_name}'].rank(ascending=False)
        mvp_candidates[f'{name}_{features_name}_rank'] = pred_df[f'{name}_{features_name}_rank']
        mvp_candidates.sort_values(by=f"{name}_{features_name}_rank", ascending=True, inplace=True)
        
# Print the predicted ranking for each list of features and each model
print(f"Predicted MVP ranking with {name} and {features_name}:")
for j, player in enumerate(mvp_candidates["Player"]):
    print(f"{j+1}. {player}")
print() 

pred_df 

             Player     G  PTS/36  TRB/36  AST/36  STL/36  BLK/36  TOV/36   
0      LeBron James  0.62    0.75    0.51    0.55    0.44    0.26    0.27  \
1      Kevin Durant  0.94    0.84    0.48    0.22    0.28    0.35    0.73   
2   Carmelo Anthony  0.06    1.00    0.40    0.00    0.00    0.13    0.00   
3        Chris Paul  0.25    0.00    0.08    1.00    1.00    0.00    0.00   
4       Kobe Bryant  0.75    0.75    0.22    0.39    0.28    0.06    0.91   
..              ...   ...     ...     ...     ...     ...     ...     ...   
5      Jayson Tatum  1.00    0.66    0.32    0.00    0.08    0.23    0.23   
6         Ja Morant  0.00    0.83    0.14    0.39    0.33    0.08    0.59   
7     Stephen Curry  0.37    0.63    0.06    0.30    0.42    0.08    0.41   
8        Chris Paul  0.42    0.00    0.00    1.00    1.00    0.00    0.09   
9     DeMar DeRozan  1.00    0.70    0.04    0.07    0.00    0.00    0.00   

     TS%   PER  ...  On_Off   TWR    VPS  T_PTS  T_TRB  T_AST  T_STL  T_BLK

  pred_df[f'{name}_{features_name}_rank'] = pred_df[f'{name}_{features_name}'].rank(ascending=False)
  pred_df[f'{name}_{features_name}_rank'] = pred_df[f'{name}_{features_name}'].rank(ascending=False)
  pred_df[f'{name}_{features_name}_rank'] = pred_df[f'{name}_{features_name}'].rank(ascending=False)
  pred_df[f'{name}_{features_name}_rank'] = pred_df[f'{name}_{features_name}'].rank(ascending=False)
  pred_df[f'{name}_{features_name}_rank'] = pred_df[f'{name}_{features_name}'].rank(ascending=False)
  pred_df[f'{name}_{features_name}_rank'] = pred_df[f'{name}_{features_name}'].rank(ascending=False)
  pred_df[f'{name}_{features_name}_rank'] = pred_df[f'{name}_{features_name}'].rank(ascending=False)
  pred_df[f'{name}_{features_name}_rank'] = pred_df[f'{name}_{features_name}'].rank(ascending=False)
  pred_df[f'{name}_{features_name}_rank'] = pred_df[f'{name}_{features_name}'].rank(ascending=False)


Unnamed: 0,Player,Random Forest_All,Random Forest_All_without_G,Random Forest_Per36_with_AS,Random Forest_Totals_with_AS,Random Forest_Per36,Random Forest_AS,Gradient Boosting_All,Gradient Boosting_All_without_G,Gradient Boosting_Per36_with_AS,...,k-NN_Per36_with_AS_rank,k-NN_Totals_with_AS_rank,k-NN_Per36_rank,k-NN_AS_rank,SVR_All_rank,SVR_All_without_G_rank,SVR_Per36_with_AS_rank,SVR_Totals_with_AS_rank,SVR_Per36_rank,SVR_AS_rank
0,Nikola Jokić,0.736,0.744,0.755,0.768,0.598,0.78,0.873,0.878,0.925,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,Joel Embiid,0.526,0.517,0.534,0.492,0.576,0.44,0.63,0.608,0.647,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
2,Giannis Antetokounmpo,0.328,0.323,0.348,0.169,0.454,0.14,0.443,0.37,0.358,...,3.0,3.0,4.0,5.0,3.0,3.0,3.0,3.0,3.0,3.0
3,Jayson Tatum,0.048,0.044,0.054,0.107,0.078,0.154,0.078,0.087,0.047,...,7.0,6.0,6.0,4.0,7.0,6.0,9.0,5.0,5.0,6.0
4,James Harden,0.03,0.03,0.027,0.039,0.133,0.04,0.015,0.023,0.012,...,10.0,10.0,5.0,7.0,8.0,8.0,8.0,9.0,4.0,7.0
5,Domantas Sabonis,0.123,0.144,0.05,0.147,0.145,0.147,0.082,0.156,0.049,...,8.0,4.5,9.0,8.0,10.0,9.0,10.0,8.0,9.0,8.0
6,Luka Dončić,0.202,0.212,0.215,0.214,0.281,0.18,0.093,0.123,0.091,...,5.0,7.0,8.0,3.0,6.0,7.0,5.0,7.0,6.0,5.0
7,Jimmy Butler,0.131,0.123,0.109,0.143,0.113,0.169,0.082,0.163,0.161,...,4.0,4.5,7.0,6.0,4.0,4.0,4.0,4.0,8.0,4.0
8,Shai Gilgeous-Alexander,0.08,0.079,0.086,0.064,0.268,0.062,0.037,0.016,0.022,...,6.0,8.0,3.0,9.0,5.0,5.0,7.0,6.0,7.0,9.0
9,Damian Lillard,0.056,0.059,0.067,0.058,0.144,0.038,0.07,0.071,0.046,...,9.0,9.0,10.0,10.0,9.0,10.0,6.0,10.0,10.0,10.0
