In [1]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from pandas.plotting import scatter_matrix

from sklearn import tree, svm, metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# use the 'ggplot' style sheet for every matplotlib figure created in this notebook
plt.style.use('ggplot')
# IPython magic: draw matplotlib figures inline in the notebook output
%matplotlib inline  
# default figure size (width, height) handed to matplotlib for new figures
plt.rcParams['figure.figsize'] = (10, 6) 

# determine the file names of associated player data
# the data folder is looked up next to the notebook's folder: <current working directory>/../data
main_directory = os.path.normpath(os.getcwd() + os.sep + os.pardir)
data_directory = os.path.join(main_directory, 'data')
# os.listdir returns entries in arbitrary order, but the following cells slice this list by
# position and walk it with a season counter, so it is sorted to make the order deterministic
file_names = sorted(f for f in os.listdir(data_directory) if os.path.isfile(os.path.join(data_directory, f)))
print(file_names)

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (/Users/alecchae/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py)

In [None]:
# separates out data associated with total player stats and player stats per game
# both lists are sorted explicitly: the positional slice below and the counter-based award lookup
# in the following cells only make sense when the files come in a fixed order
# NOTE(review): alphabetical order equals season order only if the names start with the two-digit
# season years (the later cells parse characters 0-1 and 3-4 as the season) - confirm
player_stat_total_csv = sorted(file for file in file_names if 'stats.csv' in file)
player_stat_pergame_csv = sorted(file for file in file_names if 'game.csv' in file)
# drop the last two matches of the 'stats.csv' filter
# NOTE(review): presumably these are the two award tables, whose names also end in 'stats.csv'
# and sort after the digit-prefixed season files - confirm against the printed list
player_stat_total_csv = player_stat_total_csv[:-2]
print(player_stat_total_csv)
print(player_stat_pergame_csv)

In [None]:
# the first total-stats file acts as the template for the expected column headers
# (the combining cell compares every other file against it so that all files are consistent)
template_path = os.path.join(data_directory, player_stat_total_csv[0])
# only the column index of the template file is kept
headers = pd.read_csv(template_path).columns
print(headers)

In [None]:
# open all total player stat csv files and combine them into one large data frame,
# tagging each row with its season and with whether the player won an award that season
# NOTE(review): the award tables are read from the current working directory while the season
# files are read from data_directory - confirm that this is the intended location
MVP_player_data = pd.read_csv('MVP player stats.csv')
defensive_player_data = pd.read_csv('Defensive Player of the Year player stats.csv')
# row counter into the award tables; it is decremented before use, so the first file reads row 22
# NOTE(review): presumably the award tables list the most recent season first - confirm
i = 23

# the per-season frames are collected here and concatenated once after the loop
# (concatenating inside the loop copies the growing frame on every iteration)
season_frames = []
for filename in player_stat_total_csv:
    # move to the award-table row that belongs to this file
    i = i - 1
    directory = os.path.join(data_directory, filename)
    # extract data from one year
    df = pd.read_csv(directory)
    # the season code is built from characters 0-1 and 3-4 of the file name
    season = (filename[0:2] + filename[3:5])
    # add the year to the data frame
    df['Year'] = season
    # remove the asterisk from player names; regex=False makes '*' a literal character
    # instead of relying on the pandas-version-dependent default of the regex argument
    df['Player'] = df['Player'].str.replace('*', '', regex=False)
    # current MVP and Defensive player winners
    current_MVP = MVP_player_data['Unnamed: 2'][i]
    current_defense = defensive_player_data['Unnamed: 2'][i]
    # 1 for the award winner of this season, 0 for everybody else
    df['MVP Award'] = (df['Player'] == current_MVP).astype(int)
    df['Defensive Award'] = (df['Player'] == current_defense).astype(int)
    # only files with the expected number of columns (template + the 3 added above) are kept
    if len(df.columns) == (len(headers) + 3):
        season_frames.append(df)
    else:
        print('this file did not match the header ',filename)

# an empty frame is kept as the result when no file qualified, as before
player_stat_df = pd.concat(season_frames, ignore_index=True) if season_frames else pd.DataFrame()
display(player_stat_df.head())

In [None]:
# same header probe as for the total stats, now for the per-game files:
# the first per-game file supplies the expected column headers
template_path = os.path.join(data_directory, player_stat_pergame_csv[0])
headers = pd.read_csv(template_path).columns
print(headers)

In [None]:
# open all per-game player stat csv files and combine them into one large data frame,
# tagging each row with its season and with whether the player won an award that season
# (MVP_player_data and defensive_player_data are the award tables loaded in an earlier cell)
# row counter into the award tables; it is decremented before use, so the first file reads row 22
# NOTE(review): presumably the award tables list the most recent season first - confirm
i = 23

# the per-season frames are collected here and concatenated once after the loop
# (concatenating inside the loop copies the growing frame on every iteration)
season_frames = []
for filename in player_stat_pergame_csv:
    # move to the award-table row that belongs to this file
    i = i - 1
    directory = os.path.join(data_directory, filename)
    # extract data from one year (each file is read once; it used to be read twice)
    df = pd.read_csv(directory)
    # the season code is built from characters 0-1 and 3-4 of the file name
    season = (filename[0:2] + filename[3:5])
    # add the year to the data frame
    df['Year'] = season
    # remove the asterisk from player names; regex=False makes '*' a literal character
    # instead of relying on the pandas-version-dependent default of the regex argument
    df['Player'] = df['Player'].str.replace('*', '', regex=False)
    # current MVP and Defensive player winners
    current_MVP = MVP_player_data['Unnamed: 2'][i]
    current_defense = defensive_player_data['Unnamed: 2'][i]
    # 1 for the award winner of this season, 0 for everybody else
    df['MVP Award'] = (df['Player'] == current_MVP).astype(int)
    df['Defensive Award'] = (df['Player'] == current_defense).astype(int)
    # only files with the expected number of columns (template + the 3 added above) are kept
    if len(df.columns) == (len(headers) + 3):
        season_frames.append(df)
    else:
        print('this file did not match the header ',filename)

# an empty frame is kept as the result when no file qualified, as before
player_stat_game_df = pd.concat(season_frames, ignore_index=True) if season_frames else pd.DataFrame()
display(player_stat_game_df.head())

In [None]:
# write both combined (not yet cleaned) data frames to csv files, without the row index
for frame, csv_name in ((player_stat_game_df, 'player_stat_per_game.csv'),
                        (player_stat_df, 'player_stat.csv')):
    frame.to_csv(csv_name, index=False, header=True)

In [None]:
# clean total player data
# mapping from the abbreviated csv headers to descriptive column titles
name_map = {
    'Rk': 'Rank',
    'Pos': 'Position',
    'Tm': 'Team',
    'G': 'Games',
    'GS': 'Games Started',
    'MP': 'Total Minutes Played',
    'FG': 'Total Field Goals',
    'FGA': 'Total Field Goal Attempts',
    'FG%': 'Field Goal Percentage',
    '3P': 'Total 3 Point Field Goals',
    '3PA': 'Total 3 Point Field Goal Attempts',
    '3P%': '3 Point Field Goal Percentage',
    '2P': 'Total 2 Point Field Goals',
    '2PA': 'Total 2 Point Field Goal Attempts',
    '2P%': '2 Point Field Goal Percentage',
    'eFG%': 'Effective Field Goal Percentage',
    'FT': 'Total Free Throws',
    'FTA': 'Total Free Throw Attempts',
    'FT%': 'Free Throw Percentage',
    'ORB': 'Total Offensive Rebounds',
    'DRB': 'Total Defensive Rebounds',
    'TRB': 'Total Rebounds',
    'AST': 'Total Assists',
    'STL': 'Total Steals',
    'BLK': 'Total Blocks',
    'TOV': 'Total Turn Overs',
    'PF': 'Total Personal Fouls',
    'PTS': 'Total Points',
}
player_stat_df = player_stat_df.rename(columns=name_map)

# the 'Player-additional' column is not needed for the analysis
player_stat_df = player_stat_df.drop('Player-additional', axis=1)

In [None]:
# count the NaN values per column before cleaning
display(player_stat_df.isnull().sum())

# show every column so the location of the NaN values can be inspected
pd.set_option('display.max_columns', None)
display(player_stat_df.head(20))

# a percentage is undefined (NaN) when the player has 0 attempts
# every percentage column is recomputed as made / attempted, and undefined or infinite
# results are set to 0; each entry is (percentage column, made shots, attempts column)
percentage_sources = [
    ('Field Goal Percentage',
     player_stat_df['Total Field Goals'],
     'Total Field Goal Attempts'),
    ('3 Point Field Goal Percentage',
     player_stat_df['Total 3 Point Field Goals'],
     'Total 3 Point Field Goal Attempts'),
    ('2 Point Field Goal Percentage',
     player_stat_df['Total 2 Point Field Goals'],
     'Total 2 Point Field Goal Attempts'),
    ('Free Throw Percentage',
     player_stat_df['Total Free Throws'],
     'Total Free Throw Attempts'),
    # effective field goal percentage counts a made 3 pointer as 1.5 made field goals
    ('Effective Field Goal Percentage',
     player_stat_df['Total Field Goals'] + 0.5 *  player_stat_df['Total 3 Point Field Goals'],
     'Total Field Goal Attempts'),
]
for percentage_column, made, attempts_column in percentage_sources:
    result = made / player_stat_df[attempts_column]
    result = result.replace([np.inf, -np.inf, np.nan], 0)
    player_stat_df[percentage_column] = result

# count the NaN values per column again to confirm the percentages are filled
display(player_stat_df.isnull().sum())

In [None]:
# persist the cleaned total player stats (column titles included, row index left out)
cleaned_total_csv = 'cleaned_player_stat.csv'
player_stat_df.to_csv(cleaned_total_csv, header=True, index=False)

In [None]:
# clean player stats per game
# mapping from the abbreviated csv headers to descriptive per-game column titles
# it is bound to name_map2 only: the earlier chained assignment also overwrote name_map,
# the mapping of the total-stats frame, so re-running that cell would apply per-game titles
name_map2 = {'Rk':'Rank','Pos':'Position','Tm':'Team','G':'Games','GS':'Games Started','MP':'Minutes Played Per Game',
             'FG':'Field Goals Per Game','FGA':'Field Goal Attempts Per Game','FG%':'Field Goal Percentage','3P':'3 Point Field Goals Per Game',
             '3PA':'3 Point Field Goal Attempts Per Game', '3P%':'3 Point Field Goal Percentage','2P':'2 Point Field Goals Per Game', '2PA':'2 Point Field Goal Attempts Per Game',
             '2P%':'2 Point Field Goal Percentage', 'eFG%':'Effective Field Goal Percentage', 'FT':'Free Throws Per Game',
             'FTA':'Free Throw Attempts Per Game','FT%':'Free Throw Percentage','ORB':'Offensive Rebounds Per Game','DRB':'Defensive Rebounds Per Game',
             'TRB':'Total Rebounds Per Game','AST':'Assists Per Game','STL':'Steals Per Game','BLK':'Blocks Per Game','TOV':'Turn Overs Per Game',
             'PF':'Personal Fouls Per Game','PTS':'Points Per Game'}
player_stat_game_df = player_stat_game_df.rename(columns=name_map2)

# drop unnecessary column
player_stat_game_df = player_stat_game_df.drop(columns = ['Player-additional'])

In [None]:
# determine the number of NaN values in each column
# (display() is required: a bare expression is only shown when it is the last line of a cell)
display(player_stat_game_df.isnull().sum())

# visualize where the NaN values are
pd.set_option('display.max_columns', None)
display(player_stat_game_df.head(20))

# a percentage is NaN when the player has 0 attempts; only those entries are replaced with 0
# the percentages that are present are kept as they are instead of being recomputed from the
# per-game columns, so that only the NaN cases change, as intended
# NOTE(review): presumably the per-game averages in the csv files are rounded, which would make
# a ratio of them less precise than the percentage shipped in the file - confirm
percentage_columns = ['Field Goal Percentage', '3 Point Field Goal Percentage', '2 Point Field Goal Percentage',
                      'Free Throw Percentage', 'Effective Field Goal Percentage']
player_stat_game_df[percentage_columns] = player_stat_game_df[percentage_columns].fillna(0)

# save the cleaned per-game player stats to a new csv file
player_stat_game_df.to_csv('cleaned_player_stat_per_game.csv',index=False,header=True)

In [None]:
# express every per-game statistic relative to the average of its season
# columns that must not be averaged (identifiers, award flags, rank and age) are left out
excluded_columns = ['Player', 'Position', 'Team', 'MVP Award', 'Defensive Award', 'Rank', 'Age']
player_stat_game_df2 = player_stat_game_df.drop(columns = excluded_columns)
# one row per season holding the mean of every remaining statistic
average_stat_game_df = player_stat_game_df2.groupby('Year').mean()
# attach the season means to every player row; the mean columns get the suffix '_avg'
merged_data = player_stat_game_df.merge(average_stat_game_df, on='Year', suffixes=('', '_avg'))

relative_stats = [
    'Games', 'Games Started', 'Minutes Played Per Game',
    'Field Goals Per Game', 'Field Goal Attempts Per Game', 'Field Goal Percentage',
    '3 Point Field Goals Per Game', '3 Point Field Goal Attempts Per Game', '3 Point Field Goal Percentage',
    '2 Point Field Goals Per Game', '2 Point Field Goal Attempts Per Game', '2 Point Field Goal Percentage',
    'Effective Field Goal Percentage',
    'Free Throws Per Game', 'Free Throw Attempts Per Game', 'Free Throw Percentage',
    'Offensive Rebounds Per Game', 'Defensive Rebounds Per Game', 'Total Rebounds Per Game',
    'Assists Per Game', 'Steals Per Game', 'Blocks Per Game', 'Turn Overs Per Game',
    'Personal Fouls Per Game', 'Points Per Game',
]
for stat in relative_stats:
    # pop() hands back the season-mean column and removes it from the frame in one step
    season_average = merged_data.pop(f'{stat}_avg')
    merged_data[stat] = merged_data[stat].div(season_average)

player_average_game_df = merged_data
display(player_average_game_df)

In [None]:
# create a heat map to visualize the correlation between variables for both total player stats and player stats per game
def plot_correlation_heatmap(corr_matrix, tick_labels, title):
    """Draw one correlation matrix as a heat map on a fixed -1..1 colour scale.

    corr_matrix: square correlation matrix to show.
    tick_labels: axis labels, used for both axes in the order given.
    title: title of the figure.
    """
    plt.imshow(corr_matrix, vmin = -1, vmax = 1)
    plt.colorbar()
    plt.xticks(np.arange(len(tick_labels)), tick_labels, fontsize = 7, rotation = 'vertical')
    plt.yticks(np.arange(len(tick_labels)), tick_labels, fontsize = 7)
    plt.title(title)
    plt.show()

# remove non numerical data so correlation matrix can be obtained
player_stat_game_df2 = player_stat_game_df.drop(columns = ['Player', 'Position', 'Team'])
player_stat_df2 = player_stat_df.drop(columns = ['Player', 'Position', 'Team'])

# create the correlation matrices
# the two sources were swapped before, so each figure showed the other data set under its title
corr_matrix_total = player_stat_df2.corr(method = 'pearson')
corr_matrix_game = player_stat_game_df2.corr(method = 'pearson')

# plot the heat map for total player stats
# NOTE(review): the tick labels are a hand-written list matched to the matrix by position only
label = ['Rank', 'Age', 'Games', 'Games Started', 'Total Minutes Played', 'Total Field Goals', 'Total Field Goal Attempts', 'Field Goal Percentage',
        'Total 3 Point Field Goals', 'Total 3 Point Field Goal Attempts', '3 Point Field Goal Percentage', 'Total 2 Point Field Goals',
        'Total 2 Point Field Goal Attempts', '2 Point Field Goal Percentage', 'Effective Field Goal Percentage', 'Total Free Throws',
        'Total Free Throw Attempts', 'Free Throw Percentage', 'Total Offensive Rebounds', 'Total Defensive Rebounds', 'Total Rebounds',
        'Total Assists', 'Total Steals', 'Total Blocks', 'Total Turnovers', 'Total Personal Fouls', 'Total Points', 'Year', 'MVP Award', 'Defense Award']
plot_correlation_heatmap(corr_matrix_total, label, 'Correlation Matrix For Total Player Stats')

# plot the heat map for per game player stats
label2 = ['Rank', 'Age', 'Games', 'Games Started', 'Minutes Played Per Game', 'Field Goals Per Game', 'Field Goal Attempts Per Game', 'Field Goal Percentage',
        '3 Point Field Goals Per Game', '3 Point Field Goal Attempts Per Game', '3 Point Field Goal Percentage', '2 Point Field Goals Per Game',
        '2 Point Field Goal Attempts Per Game', '2 Point Field Goal Percentage', 'Effective Field Goal Percentage', 'Free Throws Per Game',
        'Free Throw Attempts Per Game', 'Free Throw Percentage', 'Offensive Rebounds Per Game', 'Defensive Rebounds Per Game', 'Rebounds Per Game',
        'Assists Per Game', 'Steals Per Game', 'Blocks Per Game', 'Turnovers Per Game', 'Personal Fouls Per Game', 'Points Per Game', 'Year', 'MVP Award', 'Defense Award']
plot_correlation_heatmap(corr_matrix_game, label2, 'Correlation Matrix For Average Player Stats Per Game')

In [None]:
# create scatter plot matrix for per-game player stats while highlighting MVP and Defensive award winners (currently disabled)
# create new dataframe so only helpful columns are included in the scatter plot matrix
# player_stat_game_df3 = player_stat_game_df[['Games', 'Minutes Played Per Game', 'Field Goal Percentage', '3 Point Field Goal Percentage',
#                                           '2 Point Field Goal Percentage', 'Free Throw Percentage', 'Total Rebounds Per Game', 'Assists Per Game',
#                                           'Steals Per Game', 'Blocks Per Game', 'Turn Overs Per Game', 'Personal Fouls Per Game', 'Points Per Game', 'MVP Award', 'Defensive Award']]
# plot the scatter matrix
# sns.pairplot(player_stat_game_df3, hue = 'MVP Award')
# sns.pairplot(player_stat_game_df3, hue = 'Defensive Award')
# plt.show()

In [None]:
# create new data frame with predictor data
# NOTE(review): only the identifier and award columns are dropped, so 'Year', 'Rank' and 'Age'
# stay in the predictors, and nothing is scaled before the rbf kernel - confirm this is intended
player_average_game_df2 = player_average_game_df.drop(columns = ['Player', 'Position', 'Team', 'MVP Award', 'Defensive Award'])
# create data filters for splitting up testing and training data
train_years = ['0001', '0102', '0203', '0304', '0405', '0506', '0607', '0708', '0809', '0910', '1011', '1112', '1213', '1314',
              '1415', '1516', '1617', '1718', '1819', '1920', '2021']
test_year = ['2122']
# Filter both x_train and y_train based on train_years
x_train = player_average_game_df2[player_average_game_df2['Year'].isin(train_years)].to_numpy()
y_train = player_average_game_df[player_average_game_df['Year'].isin(train_years)]['MVP Award'].to_numpy()
# Filter both x_test and y_test based on test_year
x_test_first = player_average_game_df2[player_average_game_df2['Year'].isin(test_year)].to_numpy()
y_test_first = player_average_game_df[player_average_game_df['Year'].isin(test_year)]['MVP Award'].to_numpy()

# initialize old confusion matrix and model accuracy
# the start value of 100 false positives is the bar the first accepted model has to beat
confusion_matrix_old = [[0, 100],[0, 0]]
model_accuracy = []
# start without a winner so that a failed search cannot silently reuse values left in the
# kernel by an earlier run of this cell
best_weight = None
best_c = None

# for loop used to determine the optimal class weights
# NOTE(review): the parameters are chosen by their result on the test season itself
for i in range(500, 5000, 100):
    for j in range(10, 100, 10):
        # Define class weights (i is the weight given to the MVP class)
        class_weights = {0: 1, 1: i}
        # create svm model (j is the regularisation parameter C)
        svm_model = svm.SVC(kernel = 'rbf', C = j, gamma = 'scale', class_weight = class_weights)
        # train the svm model with the training data
        svm_model.fit(x_train, y_train)
        # get model predictions
        y_prediction = svm_model.predict(x_test_first)
        # determine the confusion matrix; labels fixes its shape to 2x2 even when only one
        # class occurs, which keeps the [1][1] and [0][1] look-ups below valid
        confusion_matrix = metrics.confusion_matrix(y_test_first, y_prediction, labels = [0, 1])
        # keep the parameters that find the true MVP with the fewest false positives so far
        if confusion_matrix[1][1] == 1 and confusion_matrix[0][1] < confusion_matrix_old[0][1]:
            best_weight = i
            best_c = j
            confusion_matrix_old = confusion_matrix

# fail here with a clear message instead of with a NameError in the next cell
if best_weight is None:
    raise RuntimeError('no class weight / C combination found the true MVP with fewer than 100 false positives')

In [None]:
# display results from previous cell
print('Best Weight: ')
print(best_weight)
print('Best C Value: ')
print(best_c)

# develop svm model using the ideal parameters
ideal_class_weights = {0:1, 1:best_weight}

# develop the ideal svm model
svm_model_first = svm.SVC(kernel = 'rbf', C = best_c, gamma = 'scale', class_weight = ideal_class_weights)
# train the svm model with the training data
svm_model_first.fit(x_train, y_train)
# get model predictions
y_prediction = svm_model_first.predict(x_test_first)
# assess accuracy using the accuracy_score function
model_accuracy = metrics.accuracy_score(y_test_first, y_prediction)
# determine the confusion matrix with the confusion_matrix function
confusion_matrix = metrics.confusion_matrix(y_test_first, y_prediction)
print('Model Accuracy: ')
print(model_accuracy)
print('Confusion Matrix: ')
print(confusion_matrix)

# rows of the test season; only their index is needed to look the players up again
x_test1 = player_average_game_df2[player_average_game_df2['Year'].isin(test_year)]

# Create a DataFrame to store the predictions and corresponding players
predictions_df = player_average_game_df.loc[x_test1.index].copy()
predictions_df['First Prediction'] = y_prediction

# Filter the DataFrame to get the rows where the model predicted MVPs
# copy() turns the filtered rows into an independent frame; a later cell adds a column to it,
# which raises SettingWithCopyWarning when done on a slice of predictions_df
predicted_mvp_df = predictions_df[predictions_df['First Prediction'] == 1].copy()

# Print or display the predicted MVP players
display(predicted_mvp_df)

In [None]:
# this cell develops a second svm model which is used to predict the MVPs from the pool of previously predicted MVPs
# create new test data from the results of the previous run in an effort to decrease the number of predicted MVPs
# (the model is still trained on x_train / y_train; only the data it is evaluated on changes)
x_test_second = predicted_mvp_df.drop(columns = ['First Prediction', 'Player', 'Position', 'Team', 'MVP Award', 'Defensive Award']).to_numpy()
y_test_second = predicted_mvp_df['MVP Award'].to_numpy()

# initialize old confusion matrix and model accuracy
# the start value of 100 false positives is the bar the first accepted model has to beat
confusion_matrix_old = [[0, 100],[0, 0]]
model_accuracy = []
# start without a winner so that a failed search cannot silently reuse values left in the
# kernel by an earlier run of this cell
best_weight2 = None
best_c2 = None

# for loop used to determine the optimal class weights
for i in range(10, 100, 10):
    for j in range(10, 100, 10):
        # Define class weights (i is the weight given to the MVP class)
        class_weights = {0: 1, 1: i}
        # create svm model (j is the regularisation parameter C)
        svm_model = svm.SVC(kernel = 'rbf', C = j, gamma = 'scale', class_weight = class_weights)
        # train the svm model with the training data
        svm_model.fit(x_train, y_train)
        # get model predictions
        y_prediction = svm_model.predict(x_test_second)
        # determine the confusion matrix; labels fixes its shape to 2x2, otherwise a candidate
        # pool that holds only the true MVP yields a 1x1 matrix and the look-ups below fail
        confusion_matrix = metrics.confusion_matrix(y_test_second, y_prediction, labels = [0, 1])
        # keep the parameters that find the true MVP with the fewest false positives so far
        if confusion_matrix[1][1] == 1 and confusion_matrix[0][1] < confusion_matrix_old[0][1]:
            best_weight2 = i
            best_c2 = j
            confusion_matrix_old = confusion_matrix

# fail here with a clear message instead of with a NameError in the next cell
if best_weight2 is None:
    raise RuntimeError('no class weight / C combination found the true MVP among the first-stage candidates')

In [None]:
# display results from previous cell
print('Best Weight: ')
print(best_weight2)
print('Best C Value: ')
print(best_c2)

# develop svm model using the ideal parameters
ideal_class_weights = {0:1, 1:best_weight2}

# develop the ideal svm model
svm_model_second = svm.SVC(kernel = 'rbf', C = best_c2, gamma = 'scale', class_weight = ideal_class_weights)
# train the svm model with the training data
svm_model_second.fit(x_train, y_train)
# get model predictions
y_prediction = svm_model_second.predict(x_test_second)
# assess accuracy using the accuracy_score function
model_accuracy = metrics.accuracy_score(y_test_second, y_prediction)
# determine the confusion matrix with the confusion_matrix function
confusion_matrix = metrics.confusion_matrix(y_test_second, y_prediction)
print('Model Accuracy: ')
print(model_accuracy)
print('Confusion Matrix: ')
print(confusion_matrix)

# add final predicted winners to dataframe
# the column is added to an explicit copy: predicted_mvp_df was created by filtering another
# frame, and assigning into such a slice raises SettingWithCopyWarning
predicted_mvp_df = predicted_mvp_df.copy()
predicted_mvp_df['Second Prediction'] = y_prediction
display(predicted_mvp_df)

In [None]:
# run both trained models on every season and collect the players that pass the first model
# NOTE(review): all seasons but '2122' were part of the training data of both models
years = ['0001', '0102', '0203', '0304', '0405', '0506', '0607', '0708', '0809', '0910', '1011', '1112', '1213', '1314','1415',
         '1516', '1617', '1718', '1819', '1920', '2021', '2122']

# columns that are not predictors and have to be removed before calling the second model
non_feature_columns = ['First Prediction', 'Player', 'Position', 'Team', 'MVP Award', 'Defensive Award']

# the per-season results are collected here and concatenated once after the loop
# loop-local names are used so that predicted_mvp_df, predictions_df and x_test1 of the
# earlier cells are not overwritten
yearly_candidates = []
for year in years:
    # predictor rows of this season
    year_features = player_average_game_df2[player_average_game_df2['Year'].isin([year])]

    # get the prediction from the first model
    y_prediction_first = svm_model_first.predict(year_features.to_numpy())

    # Create a DataFrame to store the predictions and corresponding players
    year_predictions = player_average_game_df.loc[year_features.index].copy()
    year_predictions['First Prediction'] = y_prediction_first

    # keep the rows where the first model predicted an MVP (copied so a column can be added)
    year_candidates = year_predictions[year_predictions['First Prediction'] == 1].copy()

    # the second model cannot be called on an empty array, so a season without any
    # first-stage candidate is reported and skipped
    if year_candidates.empty:
        print('no MVP candidates predicted by the first model for season', year)
        continue

    # create the second set of test data based on results from the first model
    x_test_final_second = year_candidates.drop(columns = non_feature_columns).to_numpy()

    # get the prediction from the second model
    y_prediction_second = svm_model_second.predict(x_test_final_second)

    # add final predicted winners to dataframe
    year_candidates['Second Prediction'] = y_prediction_second
    yearly_candidates.append(year_candidates)

# fill out data frame with all the predicted winners
if yearly_candidates:
    complete_predicted_mvp_df = pd.concat(yearly_candidates, ignore_index = True)
else:
    # no candidates in any season: keep an empty frame that still has the expected columns
    complete_predicted_mvp_df = pd.DataFrame(columns = list(player_average_game_df.columns) + ['First Prediction', 'Second Prediction'])

display(complete_predicted_mvp_df)

In [None]:
# determine how well the two models are able to predict the winner of the MVP award
predicted_mvp_winners = complete_predicted_mvp_df[complete_predicted_mvp_df['Second Prediction'] == 1]
display(predicted_mvp_winners)
num_true_winners = len(years)
pred_true_winners = predicted_mvp_winners['MVP Award'].sum()
percent_picked_true_winners = (pred_true_winners/num_true_winners)*100
print('Percentage that model predicts the true winner: ')
print(percent_picked_true_winners)