In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import time
import re
from ipykernel import kernelapp as app
import seaborn as sns

In [2]:
# Show every column whenever a DataFrame is rendered in this notebook.
pd.set_option('display.max_columns', None)

# Load the MVP-voting stats file, then discard the 'Unnamed: 0' column
# (presumably an index column written out by an earlier export -- confirm).
stats = pd.read_csv('./nba_stats_mvp.csv')
stats = stats.drop(columns=['Unnamed: 0'])
stats

Unnamed: 0,fga,fg3a,fta,per,ts_pct,usg_pct,bpm,season,player,win_pct,votes_first,points_won,points_max,award_share,g,mp_per_g,pts_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,fg_pct,fg3_pct,ft_pct,ws,ws_per_48,Mvp?
0,18.6,0.2,6.5,25.1,0.572,28.4,8.0,1980-81,Julius Erving,0.756098,28.0,454.0,690.0,0.658,82.0,35.0,24.6,8.0,4.4,2.1,1.8,0.521,0.222,0.787,13.8,0.231,Yes
1,18.3,0.9,4.0,19.9,0.528,24.3,5.1,1980-81,Larry Bird,0.756098,20.0,423.0,690.0,0.613,82.0,39.5,21.2,10.9,5.5,2.0,0.8,0.478,0.270,0.863,10.8,0.160,No
2,18.2,0.0,6.9,25.5,0.616,26.3,5.3,1980-81,Kareem Abdul-Jabbar,0.658537,8.0,286.0,690.0,0.414,80.0,37.2,26.2,10.3,3.4,0.7,2.9,0.574,0.000,0.766,14.3,0.230,No
3,19.3,0.0,10.1,25.1,0.585,27.6,3.7,1980-81,Moses Malone,0.487805,8.0,180.0,690.0,0.261,80.0,40.6,27.8,14.8,1.8,1.0,1.9,0.522,0.333,0.757,13.7,0.202,No
4,21.1,0.4,7.6,22.9,0.555,32.3,1.6,1980-81,George Gervin,0.634146,1.0,83.0,690.0,0.120,82.0,33.7,27.1,5.1,3.2,1.1,0.7,0.492,0.257,0.826,10.5,0.182,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
632,18.0,1.2,5.3,25.0,0.570,29.1,3.3,2017-18,LaMarcus Aldridge,0.573171,0.0,6.0,1010.0,0.006,75.0,33.5,23.1,8.5,2.0,0.6,1.2,0.510,0.293,0.837,10.9,0.209,No
633,15.6,3.4,7.2,23.7,0.590,24.9,5.0,2017-18,Jimmy Butler,0.573171,0.0,5.0,1010.0,0.005,59.0,36.7,22.2,5.3,4.9,2.0,0.4,0.474,0.350,0.854,8.9,0.198,No
634,16.9,9.8,5.9,28.2,0.675,31.0,8.6,2017-18,Stephen Curry,0.707317,0.0,5.0,1010.0,0.005,51.0,32.0,26.4,5.1,6.1,1.6,0.2,0.495,0.423,0.921,9.1,0.267,No
635,16.8,3.4,7.4,22.9,0.573,33.4,2.6,2017-18,Joel Embiid,0.634146,0.0,4.0,1010.0,0.004,63.0,30.3,22.9,11.0,3.2,0.6,1.8,0.483,0.308,0.769,6.2,0.155,No


In [3]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Seasons whose rows are used to fit the model.
training_seasons = [
    '1980-81', '1982-83', '1998-99', '1996-97', '1997-98',
    '1985-86', '2000-01', '1991-92', '2007-08',
    '1986-87', '2003-04', '2004-05', '1994-95', '2011-12',
    '1992-93', '2002-03', '2017-18', '2014-15',
    '2016-17',
]

# Held-out seasons: never seen during fitting, used only for evaluation.
testing_seasons = [
    '1981-82', '1984-85', '1990-91', '1988-89',
    '2001-02', '2006-07', '1995-96', '1993-94', '1987-88',
    '2013-14', '2012-13', '1999-00', '2009-10',
    '1983-84', '1989-90', '2010-11', '2008-09', '2005-06',
    '2015-16',
]

# Columns fed to the model as inputs; the regression target is 'award_share'.
features = ['ts_pct', 'bpm', 'mp_per_g', 'pts_per_g', 'trb_per_g', 'ast_per_g', 'stl_per_g', 'blk_per_g']

# Split rows by season membership so that a whole season lands on one side.
training_data = stats[stats['season'].isin(training_seasons)]
testing_data = stats[stats['season'].isin(testing_seasons)]

# Separate inputs from target for each side of the split.
train_X, train_y = training_data[features], training_data['award_share']
val_X, val_y = testing_data[features], testing_data['award_share']

# Fit a random forest on the training seasons; the fixed random_state makes
# the fitted forest repeatable across runs.
rf_model = RandomForestRegressor(random_state=0).fit(train_X, train_y)

# Predicted award share for every row of the held-out seasons.
predictions = rf_model.predict(val_X)

In [4]:
# Build a per-player results table for the held-out seasons: the model inputs,
# identifying columns, the true award share / MVP flag, and the model output.

# Columns copied from `stats` purely to identify and label each row.
id_columns = ['player', 'season', 'Mvp?']

# `val_X` keeps the row labels of `stats`, so an index-aligned join pulls the
# matching player/season/MVP values in one step (no per-row Python lookups).
# `val_y` is aligned by index as well; `predictions` is a plain array in the
# same row order as `val_X`, so it is attached positionally.
df = (
    val_X
    .join(stats[id_columns])
    .assign(award_share=val_y, prediction=predictions)
)

# Column order: model inputs first (taken from `val_X` rather than a second
# hard-coded list, so it cannot drift from the features the model was fed),
# then identifiers, ground truth and the prediction.
df = df[[*val_X.columns, 'player', 'season', 'award_share', 'Mvp?', 'prediction']]

# Everyone starts as a non-MVP ...
df['mvp_prediction'] = 'No'

# ... then, within each season, the row with the highest predicted award share
# becomes that season's predicted MVP (exactly one 'Yes' per season).
season_group = df.groupby('season')
max_prediction_idx = season_group['prediction'].idxmax()
df.loc[max_prediction_idx, 'mvp_prediction'] = 'Yes'

# Display the entire dataframe.
# NOTE: this lifts the row-display limit for every later cell in the notebook.
pd.set_option('display.max_rows', None)
df

Unnamed: 0,ts_pct,bpm,mp_per_g,pts_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,player,season,award_share,Mvp?,prediction,mvp_prediction
31,0.576,4.3,42.0,31.1,14.7,1.8,0.9,1.5,Moses Malone,1981-82,0.735,Yes,0.33596,No
32,0.557,7.5,38.0,22.9,10.9,5.8,1.9,0.9,Larry Bird,1981-82,0.588,No,0.25614,No
33,0.593,7.8,34.4,24.4,6.9,3.9,2.0,1.7,Julius Erving,1981-82,0.294,No,0.44847,Yes
34,0.571,3.8,31.7,19.9,10.8,1.8,0.9,2.4,Robert Parish,1981-82,0.19,No,0.05872,No
35,0.526,4.2,36.0,23.4,3.1,6.9,2.2,0.5,Gus Williams,1981-82,0.167,No,0.02802,No
36,0.562,1.7,35.7,32.3,5.0,2.4,1.0,0.6,George Gervin,1981-82,0.159,No,0.26666,No
37,0.601,5.5,37.3,19.8,6.7,4.8,1.7,0.3,Sidney Moncrief,1981-82,0.146,No,0.07113,No
38,0.59,8.3,38.3,18.6,9.6,9.5,2.7,0.4,Magic Johnson,1981-82,0.097,No,0.33172,No
39,0.559,4.1,37.2,19.6,12.7,3.4,1.2,1.3,Jack Sikma,1981-82,0.048,No,0.02823,No
40,0.608,4.3,35.2,23.9,8.7,3.0,0.8,2.7,Kareem Abdul-Jabbar,1981-82,0.045,No,0.16271,No


In [5]:
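## Original note: "36/38 seasons correct. Model is 95% accurate".
## TODO(review): this figure is not reproduced by the current run. The model is only evaluated on the
## 19 held-out testing seasons (not 38), and the accuracy printed further down (91.67%) is computed per
## player-row rather than per season, so neither number supports the 36/38 (95%) claim. Re-derive it
## from the results table before quoting it.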

In [6]:
# Regression error metrics for the predicted vs. actual award share on the
# held-out seasons.
mse = mean_squared_error(val_y, predictions)   # mean of squared errors
mae = mean_absolute_error(val_y, predictions)  # mean of absolute errors
r2 = r2_score(val_y, predictions)              # coefficient of determination

# Report the three metrics in a fixed order, one per line.
for metric_label, metric_value in (
    ("Mean Squared Error:", mse),
    ("Mean Absolute Error:", mae),
    ("R-Squared:", r2),
):
    print(metric_label, metric_value)

Mean Squared Error: 0.03905771107435898
Mean Absolute Error: 0.12900673076923075
R-Squared: 0.4005633360068164


In [7]:
#The mean squared error (MSE) of 0.0391 is the average of the squared differences between the model's predictions and the actual award_share values. Because the errors are squared, MSE is expressed in squared units of the target variable; its square root (RMSE, about 0.198) is in the target's own units and is the easier number to use when judging the magnitude of the error. The smaller the MSE, the better the model's predictions.

#The mean absolute error (MAE) of 0.1290 indicates that, on average, the model's predictions deviate by about 0.129 from the actual award_share values. MAE is expressed in the same units as the target variable, which makes it a more directly interpretable measure of the model's accuracy than MSE.

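#The R-Squared value of 0.4006 indicates the proportion of variance in the target variable that is explained by the model's predictions. An R-Squared value of 0.40 means that roughly 40% of the variance in award_share across the held-out seasons is accounted for by the model, leaving about 60% unexplained.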

In [8]:
# Row-level agreement between the predicted and the actual MVP label.
# Caveat: each season has exactly one predicted 'Yes' among many candidate
# rows, so most rows agree trivially ('No' vs 'No') and this figure overstates
# how often the MVP pick itself is right.
accuracy = (df['mvp_prediction'] == df['Mvp?']).mean()
print("Model Accuracy:", accuracy)

# Season-level accuracy: of the one player picked per season, how often is
# that player the actual MVP? This is the number that answers "how many
# seasons did the model get right".
predicted_mvps = df[df['mvp_prediction'] == 'Yes']
season_accuracy = (predicted_mvps['Mvp?'] == 'Yes').mean()
print("Season-level MVP accuracy:", season_accuracy)

Model Accuracy: 0.9166666666666666


In [9]:
# Fraction of rows whose predicted MVP label equals the actual label.
label_matches = df['mvp_prediction'].eq(df['Mvp?'])
accuracy = label_matches.sum() / len(df)

# Report it as a percentage with two decimals.
print('Accuracy score: {:.2f}%'.format(accuracy * 100))

Accuracy score: 91.67%
