# In [2]:
#importing relevant libraries
#pandas for data org
#BeautifulSoup for web scraping
#sklearn for regression
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import requests
from sklearn.datasets import make_classification
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix

#function to create lists of strings
def mvp_list_creation(anylist):
    """Return the text of every element in *anylist* as a list of ``str``.

    Each element must expose a ``.string`` attribute; that attribute is
    passed through ``str()``, so a ``None`` value becomes the literal
    text ``'None'``.
    """
    return [str(element.string) for element in anylist]

#function to get rid of unnecessary items in mvplist1
def unnecessary_removal1(anylist, start=3, stop=72, step=2):
    """Return every *step*-th item of ``anylist[start:stop]``.

    With the defaults this keeps the items at indices 3, 5, ..., 71 (at most
    35 entries), which is the window the scraping code relies on. The bounds
    are parameters so the window can be adjusted without editing the function.

    Args:
        anylist: Sequence to trim; it is not modified.
        start: Index of the first item to keep.
        stop: Index at which the window ends (exclusive).
        step: Stride applied inside the window.

    Returns:
        A new sequence containing the selected items.
    """
    # Slice in two stages so the result matches the original
    # ``anylist[3:72][::2]`` exactly for the default arguments.
    return anylist[start:stop][::step]

#function to get rid of unnecessary items in mvplist2
def unnecessary_removal2(anylist, limit=35):
    """Return the first *limit* strings in *anylist* that start with '19' or '20'.

    Args:
        anylist: Iterable of strings to filter.
        limit: Maximum number of matching items to keep (35 by default).

    Returns:
        A new list with the matching items in their original order,
        truncated to at most *limit* entries.
    """
    year_like = [item for item in anylist if item.startswith(('19', '20'))]
    # Truncate by position. The previous ``list.remove(newlist[x])`` loop
    # deleted the first element *equal* to the one at index x, so a repeated
    # value dropped an entry from the kept prefix instead of from the tail.
    return year_like[:limit]

#function to combine mvp name and mvp year
def combination_mvp(x, y):
    """Pair the items of *x* and *y* positionally into a list of 2-tuples.

    Pairing stops at the end of the shorter input.
    """
    return [(first, second) for first, second in zip(x, y)]

#web scraping nba mvp information
# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops an HTTP error page from being parsed as if it
# were the awards table.
mvpsite = requests.get('http://www.espn.com/nba/history/awards/_/id/33', timeout=30)
mvpsite.raise_for_status()
mvpsitecontents = BeautifulSoup(mvpsite.content, "html.parser")
# Collect the text of every <a> tag and every <td> cell on the page.
mvpdata1 = mvpsitecontents.find_all("a")
mvpdata2 = mvpsitecontents.find_all("td")
mvplist1 = mvp_list_creation(mvpdata1)
mvplist2 = mvp_list_creation(mvpdata2)
# Trim each list down to the entries of interest, then pair them up
# positionally: the trimmed link texts first, the year-like cell texts second.
mvplist1_better = unnecessary_removal1(mvplist1)
mvplist2_better = unnecessary_removal2(mvplist2)
combined_mvp = combination_mvp(mvplist1_better, mvplist2_better)

#converting tuple to a dataframe
mvp_players = pd.DataFrame(combined_mvp, columns = ['Player', 'Year'])
# Every row in this frame is an MVP season, so flag each one with 1.
# The flag list is sized from the frame itself rather than a hard-coded 35,
# so assign() cannot fail with a length mismatch when the number of scraped
# rows differs.
mvpp = [1] * len(mvp_players)
mvp_players = mvp_players.assign(MVP = mvpp)

# Load the NBA player data (96/97 to 20/21), discard the 'Unnamed: 0' column,
# and rename 'player_name' to 'Player' so the column name matches the MVP
# dataframe.
nbaplayers = (
    pd.read_csv('all_seasons.csv')
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'player_name': 'Player'})
)

# Add a 'Year' column matching the MVP dataframe: take the first four
# characters of each 'season' value as an integer and add one.
yearslist = nbaplayers['season']
nbaplayeryears = [int(season[:4]) + 1 for season in yearslist]
nbaplayers = nbaplayers.assign(Year=nbaplayeryears)

# Left-join the MVP flags onto the player data using the shared
# 'Player' and 'Year' columns. 'Year' is converted to a numeric dtype first so
# it can match the integer 'Year' column on the player side.
mvp_players["Year"] = pd.to_numeric(mvp_players["Year"])
total = nbaplayers.merge(mvp_players, how='left', on=['Player', 'Year'])
# Replace the raw column names with human-readable labels.
total = total.rename(
    columns={
        'team_abbreviation': 'Team',
        'age': 'Age',
        'player_height': 'Height',
        'player_weight': 'Weight',
        'college': 'College',
        'country': 'Country',
        'draft_year': 'Draft Year',
        'draft_round': 'Draft Round',
        'draft_number': 'Draft Number',
        'gp': 'Games Played',
        'pts': 'Points per Game',
        'reb': 'Rebounds per Game',
        'ast': 'Assists per Game',
        'net_rating': 'Net Rating',
        'oreb_pct': 'Offensive Rebound %',
        'dreb_pct': 'Defensive Rebound %',
        'usg_pct': 'Usage %',
        'ts_pct': 'True Shooting %',
        'ast_pct': 'Assist %',
        'season': "Season",
    }
)
# Rows with no match in the MVP frame come back as NaN; mark them as 0.
total['MVP'] = total['MVP'].fillna(0)

#Performing a training/testing split of the data 80/20
x = total[["Games Played", "Points per Game", "Rebounds per Game", "Assists per Game", 'Net Rating', 'Offensive Rebound %', 'Defensive Rebound %', 'Usage %', 'True Shooting %', 'Assist %']]
# Select the target by name rather than by position (previously iloc[:, 22]),
# so the right column is used even if the column layout of `total` changes.
y = total["MVP"]
# test_size is given explicitly: train_test_split defaults to a 75/25 split,
# which did not match the 80/20 split this step is meant to perform.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)

#Building the Logistic Regression Model
# max_iter is raised to 1000 iterations for the solver.
log_reg = LogisticRegression(max_iter = 1000)
log_reg.fit(x_train, y_train)
# NOTE(review): the coefficients learned by fit() are overwritten here with
# ten fixed values, one per feature column of `x` and in the same order.
# Nothing in this script changes the fitted intercept, so later predictions
# presumably combine these hard-coded weights with the fitted intercept.
# Confirm this override is intentional.
log_reg.coef_ = np.array([[ 0.11740695, 0.39550122, 0.36480759, 0.61305375, 0.08081362, 0.0109478, 0.00569516, -0.04038372, 0.36738365, 0.02399376]])

# Build the linear regression model and fit it on the same training split;
# fit() returns the estimator itself, so construction and fitting are chained.
lin_reg = LinearRegression().fit(x_train, y_train)

#Making a Prediction using the Logistic Regression Model
y_pred = log_reg.predict(x_test)

#Making a Prediction using the Linear Regression Model
y_pred_lin = lin_reg.predict(x_test)

#Display the Confusion Matrix for the Logistic Regression Model
# Print explicitly: a bare expression is only echoed when it is the last
# statement of a notebook cell (and never in a plain script), so the matrix
# was being computed and then silently discarded.
print(confusion_matrix(y_test, y_pred))

#Creating a dataframe to see our predictions and sending it to excel
# `predictions` holds the test-set features plus both model outputs.
predictions = x_test.assign(MVP = y_pred, MVP_Num = y_pred_lin)
# Attach the outputs back to `total` by row index. x_test keeps the index
# labels of the rows it was drawn from, so this is an exact row-for-row match.
# Merging on the ten float feature columns (as before) is not a unique key:
# rows with identical stats were duplicated, and training rows could pick up a
# test row's prediction. The suffixes reproduce the MVP_x / MVP_y column names
# that the rename below expects.
final_predictions = total.join(predictions[['MVP', 'MVP_Num']], how = 'left', lsuffix = '_x', rsuffix = '_y')
final_predictions.rename(columns = {'MVP_x':'Actual MVP', 'MVP_y':'Predicted MVP', 'MVP_Num':'Lin_Reg Output'}, inplace = True)
# Rows that were not part of the test split have no prediction; label them.
final_predictions['Predicted MVP'] = final_predictions['Predicted MVP'].fillna("Training Data")
final_predictions.to_excel("nbaMVPpredictions.xlsx")