In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import time
import re
from ipykernel import kernelapp as app
import seaborn as sns

In [2]:
## Load the MVP stats CSV into the `stats` DataFrame
# Render every column when a frame is displayed instead of eliding the middle.
pd.set_option('display.max_columns', None)

# Read the file, then discard the 'Unnamed: 0' column
# (presumably a row index written out when the CSV was saved -- confirm).
stats = pd.read_csv('./Data/nba_stats_mvp.csv')
stats = stats.drop(columns=['Unnamed: 0'])
stats

Unnamed: 0,fga,fg3a,fta,per,ts_pct,usg_pct,bpm,season,player,win_pct,votes_first,points_won,points_max,award_share,g,mp_per_g,pts_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,fg_pct,fg3_pct,ft_pct,ws,ws_per_48,Mvp?
0,18.6,0.2,6.5,25.1,0.572,28.4,8.0,1980-81,Julius Erving,0.756098,28.0,454.0,690.0,0.658,82,35.0,24.6,8.0,4.4,2.1,1.8,0.521,0.222,0.787,13.8,0.231,Yes
1,18.3,0.9,4.0,19.9,0.528,24.3,5.1,1980-81,Larry Bird,0.756098,20.0,423.0,690.0,0.613,82,39.5,21.2,10.9,5.5,2.0,0.8,0.478,0.270,0.863,10.8,0.160,No
2,18.2,0.0,6.9,25.5,0.616,26.3,5.3,1980-81,Kareem Abdul-Jabbar,0.658537,8.0,286.0,690.0,0.414,80,37.2,26.2,10.3,3.4,0.7,2.9,0.574,0.000,0.766,14.3,0.230,No
3,19.3,0.0,10.1,25.1,0.585,27.6,3.7,1980-81,Moses Malone,0.487805,8.0,180.0,690.0,0.261,80,40.6,27.8,14.8,1.8,1.0,1.9,0.522,0.333,0.757,13.7,0.202,No
4,21.1,0.4,7.6,22.9,0.555,32.3,1.6,1980-81,George Gervin,0.634146,1.0,83.0,690.0,0.120,82,33.7,27.1,5.1,3.2,1.1,0.7,0.492,0.257,0.826,10.5,0.182,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
632,18.0,1.2,5.3,25.0,0.570,29.1,3.3,2017-18,LaMarcus Aldridge,0.573171,0.0,6.0,1010.0,0.006,75,33.5,23.1,8.5,2.0,0.6,1.2,0.510,0.293,0.837,10.9,0.209,No
633,15.6,3.4,7.2,23.7,0.590,24.9,5.0,2017-18,Jimmy Butler,0.573171,0.0,5.0,1010.0,0.005,59,36.7,22.2,5.3,4.9,2.0,0.4,0.474,0.350,0.854,8.9,0.198,No
634,16.9,9.8,5.9,28.2,0.675,31.0,8.6,2017-18,Stephen Curry,0.707317,0.0,5.0,1010.0,0.005,51,32.0,26.4,5.1,6.1,1.6,0.2,0.495,0.423,0.921,9.1,0.267,No
635,16.8,3.4,7.4,22.9,0.573,33.4,2.6,2017-18,Joel Embiid,0.634146,0.0,4.0,1010.0,0.004,63,30.3,22.9,11.0,3.2,0.6,1.8,0.483,0.308,0.769,6.2,0.155,No


In [5]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Season-level hold-out split: each season is assigned wholesale to either the
# training list or the testing list. Both lists are kept in chronological order
# so gaps and repeats are easy to spot (the earlier version listed '2010-11'
# twice; `isin` ignored the repeat, but it hid the true season count).
training_seasons = ['1980-81', '1981-82', '1983-84', '1984-85', '1985-86',
                    '1986-87', '1987-88', '1989-90', '1990-91', '1991-92',
                    '1992-93', '1993-94', '1995-96', '1997-98', '1998-99',
                    '1999-00', '2000-01', '2001-02', '2002-03', '2003-04',
                    '2004-05', '2006-07', '2007-08', '2008-09', '2009-10',
                    '2010-11', '2011-12', '2013-14', '2016-17', '2017-18']

# Seasons held out for evaluation
testing_seasons = ['1982-83', '1988-89', '1994-95', '1996-97',
                   '2005-06', '2012-13', '2014-15', '2015-16']

# Guard the split: a repeated or shared season would silently distort the
# evaluation (a season in both lists leaks test rows into training).
assert len(set(training_seasons)) == len(training_seasons), "duplicate training season"
assert len(set(testing_seasons)) == len(testing_seasons), "duplicate testing season"
assert set(training_seasons).isdisjoint(testing_seasons), "season in both train and test"

# Rows belonging to the training seasons
training_data = stats[stats['season'].isin(training_seasons)]

# Columns fed to the model
features = ['ts_pct', 'bpm', 'mp_per_g', 'pts_per_g', 'trb_per_g', 'ast_per_g', 'stl_per_g', 'blk_per_g', 'ws', 'win_pct']

# Training inputs and target (the target is the 'award_share' column)
train_X = training_data[features]
train_y = training_data['award_share']

# Rows belonging to the held-out seasons
testing_data = stats[stats['season'].isin(testing_seasons)]

# Held-out inputs and target
val_X = testing_data[features]
val_y = testing_data['award_share']

# Fit a gradient boosting regressor. `random_state` is pinned so that
# Restart & Run All reproduces the same model; without it scikit-learn may
# break ties between equally good splits differently from run to run, which
# can shift the predictions and the metrics computed from them.
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(train_X, train_y)

# Predicted award share for every held-out row, in the same row order as val_X
predictions = gb_model.predict(val_X)


In [6]:
# Build the results table for the held-out seasons in one pass:
#   model inputs | player | season | award_share | Mvp? | prediction
#
# The identifying and ground-truth columns are taken from `stats` by index
# label (val_X keeps the row labels of `stats`), so the join is index-aligned.
# This replaces per-row Python lookups and reuses `features` instead of
# repeating the column list, so the table cannot drift from the model inputs.
label_cols = ['player', 'season', 'award_share', 'Mvp?']
df = (
    val_X[features]
    .join(stats.loc[val_X.index, label_cols])
    # `predictions` is a plain array in val_X's row order, so it is attached
    # positionally rather than by label.
    .assign(prediction=predictions)
)

# Start with nobody flagged as the predicted MVP
df['mvp_prediction'] = 'No'

# Within each season, locate the row label of the highest predicted value
season_group = df.groupby('season')
max_prediction_idx = season_group['prediction'].idxmax()

# Flag exactly that row per season as the predicted MVP
df.loc[max_prediction_idx, 'mvp_prediction'] = 'Yes'

# Display the entire dataframe (note: this lifts the row limit for all later cells too)
pd.set_option('display.max_rows', None)
df

Unnamed: 0,ts_pct,bpm,mp_per_g,pts_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,ws,win_pct,player,season,award_share,Mvp?,prediction,mvp_prediction
56,0.578,3.4,37.5,24.5,15.3,1.3,1.1,2.0,15.1,0.792683,Moses Malone,1982-83,0.96,Yes,0.460014,Yes
57,0.561,7.6,37.7,23.6,11.0,5.8,1.9,0.9,14.0,0.682927,Larry Bird,1982-83,0.485,No,0.242594,No
58,0.603,7.4,36.8,16.8,8.6,10.5,2.2,0.6,12.5,0.707317,Magic Johnson,1982-83,0.406,No,0.161415,No
59,0.602,5.7,35.7,22.5,5.8,3.9,1.5,0.3,13.2,0.621951,Sidney Moncrief,1982-83,0.301,No,0.062198,No
60,0.566,6.3,33.6,21.4,6.8,3.7,1.6,1.8,10.9,0.792683,Julius Erving,1982-83,0.149,No,0.16425,No
61,0.561,4.7,36.4,28.4,7.3,4.8,1.4,1.5,10.3,0.54878,Alex English,1982-83,0.056,No,0.069831,No
62,0.611,3.3,36.1,17.0,12.5,1.5,1.1,1.3,11.4,0.597561,Buck Williams,1982-83,0.053,No,0.009419,No
63,0.668,3.6,34.1,18.0,12.0,1.5,0.5,2.3,11.0,0.646341,Artis Gilmore,1982-83,0.044,No,0.098931,No
64,0.561,1.1,36.3,26.2,4.6,3.4,1.1,0.9,9.4,0.646341,George Gervin,1982-83,0.04,No,0.089481,No
65,0.619,3.8,32.3,21.8,7.5,2.5,0.8,2.2,10.9,0.707317,Kareem Abdul-Jabbar,1982-83,0.02,No,0.145963,No


In [7]:
## Recorded run: the predicted MVP matched the actual MVP in 5 of the 8 test seasons (62.5% accuracy)

In [8]:
# Score the held-out predictions against the true award shares:
# mean squared error, mean absolute error, and coefficient of determination.
mse = mean_squared_error(val_y, predictions)
mae = mean_absolute_error(val_y, predictions)
r2 = r2_score(val_y, predictions)

# Report the three metrics, one per line
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-Squared:", r2)

Mean Squared Error: 0.022427107731192178
Mean Absolute Error: 0.09763238182139537
R-Squared: 0.6740582481024933
