In [1]:
import sklearn.linear_model as lm
import sklearn.ensemble as en
from sklearn.model_selection import cross_val_score
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn import preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Year values
vals = ['2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22']

# X represents the three major MVP predictors (WS/48, PER, VORP); y represents the MVP award share
X = []
y = []


for val in vals:
    # File paths
    x1_path = f'../data/{val}.csv'
    x2_path = f'../data/{val} Standings.csv'
    y_path = f'../data/{val} MVP.csv'

    # Dataframes
    df_x1 = pd.read_csv(x1_path, usecols=["Player", "Tm", "WS/48", "PER", "VORP"])
    df_x2 = pd.read_csv(x2_path, usecols=["Rk", "Tm", "Overall"])
    df_y = pd.read_csv(y_path, usecols=["Player", "Share"])

    # Get output
    output = pd.merge(df_x1, df_x2, on="Tm", how="inner")
    output = pd.merge(output, df_y, on="Player", how="inner")

    # Add new data to X and y for Multiple Linear Regression
    X.append(output.drop(columns=["Share", "Player", "Tm"]))
    y.append(output.drop(columns=["Player", "Tm", "Rk", "Overall", "WS/48", "PER", "VORP"]))
    '''
    # Win Share Plot
    fig, ax = plt.subplots()
    plt.scatter(output["WS/48"], output["Share"])
    plt.xlabel("Win Shares per 48 Minutes")
    plt.ylabel("MVP Award Share")
    plt.title(f'{val} Win Shares to MVP Award Share')

    for i in range(len(output["WS/48"])):
        ax.text(output["WS/48"][i], output["Share"][i], output["Player"][i], rotation=30)

    plt.show()

    # Player Efficiency Rating Plot
    fig, ax = plt.subplots()
    plt.scatter(output["PER"], output["Share"])
    plt.xlabel("Player Efficiency Rating", )
    plt.ylabel("MVP Award Share")
    plt.title(f'{val} Player Efficiency to MVP Award Share')

    for i in range(len(output["PER"])):
        ax.text(output["PER"][i], output["Share"][i], output["Player"][i], rotation=30)

    plt.show()

    # Value Over Replacement Player Plot
    fig, ax = plt.subplots()
    plt.scatter(output["VORP"], output["Share"])
    plt.xlabel("Value Over Replacement Player")
    plt.ylabel("MVP Award Share")
    plt.title(f'{val} Value Over Replacement to MVP Award Share')

    for i in range(len(output["VORP"])):
        ax.text(output["VORP"][i], output["Share"][i], output["Player"][i], rotation=30)

    plt.show()

    # Rank Plot
    fig, ax = plt.subplots()
    plt.scatter(output["Rk"], output["Share"])
    plt.xlabel("NBA Team Standing")
    plt.ylabel("MVP Award Share")
    plt.title(f'{val} NBA Team Standing to MVP Award Share')

    for i in range(len(output["Rk"])):
        ax.text(output["Rk"][i], output["Share"][i], output["Player"][i], rotation=30)

    plt.show()

    # Overall Record Plot
    fig, ax = plt.subplots()
    plt.scatter(output["Overall"], output["Share"])
    plt.xlabel("NBA Team Overall Record")
    plt.ylabel("MVP Award Share")
    plt.title(f'{val} NBA Team Record to MVP Award Share')

    for i in range(len(output["Overall"])):
        ax.text(output["Overall"][i], output["Share"][i], output["Player"][i], rotation=30)

    plt.show()
'''
# Dataframe concatenation for X and y variables
X = pd.concat(X)
y = pd.concat(y)
# print(X)
# print(y)

# LR Training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
# print(X_test)
model = lm.LinearRegression()
fit = model.fit(X_train, y_train)

# LR Predictions
predictions = model.predict(X_test)
print('Linear Regression: ')
print('Mean Squared Error : ', mean_squared_error(y_test, predictions))
print('Mean Absolute Error : ', mean_absolute_error(y_test, predictions))
print('R2 Score : ', r2_score(y_test, predictions))

print()

# Gradient Boost Training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)
# print(X_test)
model = en.GradientBoostingRegressor()
fit = model.fit(X_train, y_train.values.ravel())

# Gradient Boost Predictions
predictions = model.predict(X_test)
print('Gradient Boosting Regressor: ')
print('Mean Squared Error : ', mean_squared_error(y_test, predictions))
print('Mean Absolute Error : ', mean_absolute_error(y_test, predictions))
print('R2 Score : ', r2_score(y_test, predictions))

print()

# Random Forest Training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)
# print(X_test)
model = en.RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)
fit = model.fit(X_train, y_train.values.ravel())

# Random Forest Predictions
predictions = model.predict(X_test)
print('Random Forest Regressor: ')
print('Mean Squared Error : ', mean_squared_error(y_test, predictions))
print('Mean Absolute Error : ', mean_absolute_error(y_test, predictions))
print('R2 Score : ', r2_score(y_test, predictions))


Linear Regression: 
Mean Squared Error :  0.032091885185504654
Mean Absolute Error :  0.14364623906084578
R2 Score :  0.5770294500838935

Gradient Boosting Regressor: 
Mean Squared Error :  0.0344974143134883
Mean Absolute Error :  0.11242086922712251
R2 Score :  0.7335429679274668

Random Forest Regressor: 
Mean Squared Error :  0.00987231854035006
Mean Absolute Error :  0.055811149440836934
R2 Score :  0.8400931007881509


In [43]:
yr = input('Enter the NBA year in the form XXXX-XX.')

try:
    df_x1 = pd.read_csv(f'../data/{yr}.csv', usecols=["Player", "Tm", "WS/48", "PER", "VORP", "G"]).dropna()
    df_x2 = pd.read_csv(f'../data/{yr} Standings.csv', usecols=["Rk", "Tm", "Overall"])
    output = pd.merge(df_x1, df_x2, on="Tm", how="inner")
    predictions = model.predict(output.drop(columns=["Player", "G", "Tm"]))
    res = []
    worst = [99999999, '']
    brooks = ''
    for i in range(len(predictions)):
        if predictions[i] > 0.1 and output["G"][i] > 50:
            res.append([float(predictions[i]), output["Player"][i]])
        else:
            if output["G"][i] > 50 and float(predictions[i]) < worst[0]:
                worst = [float(predictions[i]), output["Player"][i]]
            if output["Player"][i] == "Dillon Brooks":
                brooks = "The brick has a predicted vote share of " + f'{predictions[i]:.3f}' + "."
    res.sort(reverse=True)
    for i in range(len(res)):
        player = res[i]
        if i == 0:
            print("The predicted MVP is " + player[1] + ", with a vote share of " + f'{player[0]:.3f}' + ".")
            print("\nOther notable predicted vote shares:")
        else:
            print(player[1] + " - " + f'{player[0]:.3f}')
    print()
    print("The lowest predicted player is " + worst[1] + ", with a predicted vote share of " + f'{worst[0]:.3f}' + ".")
    print()
    print(brooks)
except FileNotFoundError:
    print('Either the advanced player statistics or the standings file is missing.')
x = pd.merge(df_x1, df_x2, on="Tm", how="inner")



The predicted MVP is Nikola Jokić, with a vote share of 0.887.

Other notable predicted vote shares:
Joel Embiid - 0.779
Luka Dončić - 0.329
Giannis Antetokounmpo - 0.323
Jimmy Butler - 0.230

The lowest predicted player is Kristaps Porziņģis, with a predicted vote share of 0.006.

The brick has a predicted vote share of 0.018.
