# Knowledge discovery project
## NBA 2021/2022 dataset - https://www.kaggle.com/vivovinco/nba-player-stats
### Nina Masaryková and Marek Štrba
March 2022

#### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from numpy import mean
from numpy import std
import operator

#### Loading data

In [None]:
# The file is semicolon-delimited and decoded as ISO-8859-1; its first column becomes the index.
filename = "2021-2022_NBA_Player_Stats.csv"
df = pd.read_csv(
    filename,
    sep=';',
    encoding="ISO-8859-1",
    index_col=0,
)
df.head(9)

In [None]:
df.info()

In [None]:
df.describe()

#### Name fixing

In [None]:
# These two names are stored with "?" in place of some letters; rewrite them to
# the ASCII spellings used in the All-Star name list.
player_fixes = {
    "Nikola Joki?": 'Nikola Jokic',
    "Luka Don?i?": 'Luka Doncic',
}
for broken_name, fixed_name in player_fixes.items():
    df.loc[df["Player"] == broken_name, "Player"] = fixed_name

#### Adding ALLSTAR column

In [None]:
# One zero per row of the loaded frame, so the assignment to df['ALLSTAR'] keeps
# working if the CSV is refreshed and its row count is no longer 734.
all_star = [0] * len(df)

In [None]:
df['ALLSTAR'] = all_star

In [None]:
# Mark the listed players as All-Stars (ALLSTAR = 1); everyone else keeps the value already set.
is_all_star = df['Player'].isin([
    'Trae Young', 'DeMar DeRozan', 'Joel Embiid', 'Kevin Durant', 'Giannis Antetokounmpo',
    'LaMelo Ball', 'Darius Garland', 'James Harden', 'Zach LaVine', 'Fred VanVleet',
    'Jimmy Butler', 'Khris Middleton', 'Jayson Tatum', 'Jarrett Allen', 'Stephen Curry',
    'Ja Morant', 'Nikola Jokic', 'LeBron James', 'Andrew Wiggins', 'Devin Booker',
    'Luka Doncic', 'Donovan Mitchell', 'Dejounte Murray', 'Chris Paul', 'Draymond Green',
    'Rudy Gobert', 'Karl-Anthony Towns',
])
df.loc[is_all_star, "ALLSTAR"] = 1

In [None]:
# Display the rows of the listed players to check that every name matched.
df.loc[df['Player'].isin([
    'Trae Young', 'DeMar DeRozan', 'Joel Embiid', 'Kevin Durant', 'Giannis Antetokounmpo',
    'LaMelo Ball', 'Darius Garland', 'James Harden', 'Zach LaVine', 'Fred VanVleet',
    'Jimmy Butler', 'Khris Middleton', 'Jayson Tatum', 'Jarrett Allen', 'Stephen Curry',
    'Ja Morant', 'Nikola Jokic', 'LeBron James', 'Andrew Wiggins', 'Devin Booker',
    'Luka Doncic', 'Donovan Mitchell', 'Dejounte Murray', 'Chris Paul', 'Draymond Green',
    'Rudy Gobert', 'Karl-Anthony Towns',
])]

In [None]:
df[df['Player'].isin(['Klay Thompson'])]

#### Dealing with traded players

In [None]:
# Rows that repeat an earlier row in every column.
dups = df.duplicated()
print(df.loc[dups])

In [None]:
# Every repeat appearance of a player name after that player's first row.
repeated_name = df.duplicated(subset=['Player'])
tradedPlayers = df.loc[repeated_name]
print(tradedPlayers)

In [None]:
tradedPlayers[tradedPlayers['Player'] == 'Nickeil Alexander-Walker']

In [None]:
df[df['Player'] == 'Nickeil Alexander-Walker']

In [None]:
df[df['Tm'] == 'TOT']

In [None]:
# Keep only the first row found for each player, then show the surviving 'TOT' rows.
df = df.drop_duplicates(subset=['Player'], keep='first')
df.loc[df['Tm'] == 'TOT']

#### Removing players who played fewer than 10 games

In [None]:
df[df['G']<10]

In [None]:
# Drop, by index label, every row whose games-played count (G) is below 10.
df = df.drop(df[df['G'] < 10].index)

In [None]:
# Call the method: a bare `df.describe` only displays the bound-method repr.
df.describe()

#### Checking for missing values

In [None]:
# Count individual missing cells so the number matches the wording of the message
# (the earlier expression counted rows containing at least one missing value).
print("Dataset contains total of %d missing values" % int(df.isna().sum().sum()))

### EDA

#### Basic analysis

In [None]:
# Correlate numeric columns only: the frame also holds text columns (e.g. Player, Tm),
# which make DataFrame.corr raise on pandas >= 2.0.
df.select_dtypes(include='number').corr()

In [None]:
fig, ax = plt.subplots(figsize=(15,12))
# Numeric columns only: text columns (e.g. Player, Tm) make DataFrame.corr raise on pandas >= 2.0.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, ax=ax, annot=True, fmt=".1f")
plt.savefig('heatmap.png')

### Analysis by attribute

In [None]:
# Split the players by the ALLSTAR flag for the side-by-side plots.
allstar_flag = df['ALLSTAR']
not_AS = df.loc[allstar_flag == 0]
a_stars = df.loc[allstar_flag == 1]

In [None]:
def iqr(attribute, data=None):
    """Return the interquartile range (Q3 - Q1) of one column.

    Parameters
    ----------
    attribute : str
        Label of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level ``df``,
        which keeps existing one-argument calls working.
    """
    frame = df if data is None else data
    q1, q3 = frame[attribute].quantile([0.25, 0.75])
    return q3 - q1

def outliers(attribute, data=None):
    """Return the rows whose ``attribute`` lies outside the 1.5 * IQR fences.

    Parameters
    ----------
    attribute : str
        Label of the column to test.
    data : pandas.DataFrame, optional
        Frame to filter. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Rows with a value above Q3 + 1.5 * IQR or below Q1 - 1.5 * IQR.
    """
    frame = df if data is None else data
    column = frame[attribute]
    # Compute both quartiles in one pass instead of re-querying them per bound.
    q1, q3 = column.quantile([0.25, 0.75])
    fence = 1.5 * (q3 - q1)
    return frame[(column > q3 + fence) | (column < q1 - fence)]

In [None]:
def set_box_color(bp, color):
    """Paint the boxes, whiskers, caps and medians of a boxplot in one colour.

    ``bp`` is indexed by the keys 'boxes', 'whiskers', 'caps' and 'medians';
    the notebook passes the value returned by ``plt.boxplot``.
    """
    for part in ('boxes', 'whiskers', 'caps', 'medians'):
        plt.setp(bp[part], color=color)

#### Age

In [None]:
# Name the column via x=: newer seaborn releases no longer accept it positionally.
sns.boxplot(x='Age', data=df)

In [None]:
print("Number of outliers for age is: ",len(outliers('Age')))

In [None]:
df['Age'].describe()

In [None]:
# Take the last bin edge from the data: plt.hist ignores values beyond the final
# edge, so a fixed upper edge of 40 would silently drop any older player.
bins = np.arange(0, df['Age'].max() + 2, 2)

plt.hist(not_AS['Age'], bins, alpha=0.5, label='Not AS')
plt.hist(a_stars['Age'], bins, alpha=0.5, label='AS')
plt.legend(loc='upper right')
plt.xlabel('Age')
plt.show()

In [None]:
plt.figure()

ticks = ["Age"]

bpna = plt.boxplot(not_AS['Age'], positions=[0],widths=0.6)
bpas = plt.boxplot(a_stars['Age'], positions=[1],widths=0.6)

set_box_color(bpna, '#2C7BB6')
set_box_color(bpas, '#e6ac0e') 

plt.plot([], c='#2C7BB6', label='Not All Stars')
plt.plot([], c='#e6ac0e', label='All Stars')

plt.legend()

plt.xticks(range(0, len(ticks) * 2, 2), ticks)
plt.xlim(-2, 6)
plt.ylim(0, 45)

In [None]:
plt.xticks(np.arange(18, 42, 1))
plt.yticks(np.arange(0, 100, 10))
df['Age'].hist(bins=22)

In [None]:
stats.shapiro(df.Age)

#### Games

In [None]:
# Name the column via x=: newer seaborn releases no longer accept it positionally.
sns.boxplot(x='G', data=df)

In [None]:
print("Number of outliers for games played is: ",len(outliers('G')))

In [None]:
df['G'].describe()

In [None]:
# Take the last bin edge from the data: plt.hist ignores values beyond the final
# edge, so a fixed upper edge of 64 would silently drop players with more games.
bins = np.arange(0, df['G'].max() + 2, 2)

plt.hist(not_AS['G'], bins, alpha=0.5, label='Not AS')
plt.hist(a_stars['G'], bins, alpha=0.5, label='AS')
plt.legend(loc='upper right')
plt.xlabel('Games Played')
plt.savefig('G-hist.png')
plt.show()

In [None]:
plt.figure()

ticks = ["Games Played"]

bpna = plt.boxplot(not_AS['G'], positions=[0],widths=0.6)
bpas = plt.boxplot(a_stars['G'], positions=[1],widths=0.6)

set_box_color(bpna, '#2C7BB6')
set_box_color(bpas, '#e6ac0e') 

plt.plot([], c='#2C7BB6', label='Not All Stars')
plt.plot([], c='#e6ac0e', label='All Stars')

plt.legend()

plt.xticks(range(0, len(ticks) * 2, 2), ticks)
plt.xlim(-2, 6)
plt.ylim(0, 70)
plt.savefig('G-box.png')

In [None]:
plt.xticks(np.arange(0, 60, 2))
plt.yticks(np.arange(0, 100, 10))
df['G'].hist(bins=60, figsize=(10, 6))

In [None]:
stats.shapiro(df.G)

#### Minutes played

In [None]:
# Name the column via x=: newer seaborn releases no longer accept it positionally.
sns.boxplot(x='MP', data=df)

In [None]:
print("Number of outliers for minutes played is: ",len(outliers('MP')))

In [None]:
df['MP'].describe()

In [None]:
bins = np.arange(0, 60, 2)

plt.hist(not_AS['MP'], bins, alpha=0.5, label='Not AS')
plt.hist(a_stars['MP'], bins, alpha=0.5, label='AS')
plt.legend(loc='upper right')
plt.xlabel('Minutes Played')
plt.savefig('MP-hist.png')
plt.show()


In [None]:
plt.figure()

ticks = ["Minutes Played"]

bpna = plt.boxplot(not_AS['MP'], positions=[0],widths=0.6)
bpas = plt.boxplot(a_stars['MP'], positions=[1],widths=0.6)

set_box_color(bpna, '#2C7BB6')
set_box_color(bpas, '#e6ac0e') 

plt.plot([], c='#2C7BB6', label='Not All Stars')
plt.plot([], c='#e6ac0e', label='All Stars')

plt.legend()

plt.xticks(range(0, len(ticks) * 2, 2), ticks)
plt.xlim(-2, 6)
plt.ylim(0, 50)
plt.savefig('MP-box.png')

In [None]:
plt.xticks(np.arange(0, 40, 2))
plt.yticks(np.arange(0, 40, 10))
df['MP'].hist(bins=60, figsize=(10, 6))

In [None]:
stats.shapiro(df.MP)

#### Points per game

In [None]:
# Name the column via x=: newer seaborn releases no longer accept it positionally.
sns.boxplot(x='PTS', data=df)

In [None]:
print("Number of outliers for points scored is: ",len(outliers('PTS')))

In [None]:
df['PTS'].describe()

In [None]:
bins = np.arange(0, 40, 2)

plt.hist(not_AS['PTS'], bins, alpha=0.5, label='Not AS')
plt.hist(a_stars['PTS'], bins, alpha=0.5, label='AS')
plt.legend(loc='upper right')
plt.xlabel('Points per game')
plt.savefig('PTS-hist.png')
plt.show()

In [None]:
plt.figure()

ticks = ["Points per game"]

bpna = plt.boxplot(not_AS['PTS'], positions=[0],widths=0.6)
bpas = plt.boxplot(a_stars['PTS'], positions=[1],widths=0.6)

set_box_color(bpna, '#2C7BB6')
set_box_color(bpas, '#e6ac0e') 

plt.plot([], c='#2C7BB6', label='Not All Stars')
plt.plot([], c='#e6ac0e', label='All Stars')

plt.legend()

plt.xticks(range(0, len(ticks) * 2, 2), ticks)
plt.xlim(-2, 6)
plt.ylim(0, 40)
plt.savefig('PTS-box.png')

In [None]:
plt.xticks(np.arange(0, 35, 2))
plt.yticks(np.arange(0, 40, 10))
df['PTS'].hist(bins=60, figsize=(10, 6))

In [None]:
stats.shapiro(df.PTS)

#### Shooting efficiency eFG%

In [None]:
# Name the column via x=: newer seaborn releases no longer accept it positionally.
sns.boxplot(x='eFG%', data=df)

In [None]:
print("Number of outliers for shooting efficiency is: ",len(outliers('eFG%')))

In [None]:
df['eFG%'].describe()

In [None]:
bins = np.arange(0.0, 1.0, 0.05)

plt.hist(not_AS['eFG%'], bins, alpha=0.5, label='Not AS')
plt.hist(a_stars['eFG%'], bins, alpha=0.5, label='AS')
plt.legend(loc='upper right')
plt.xlabel('Shooting efficiency')
plt.savefig('Efg-hist.png')
plt.show()

In [None]:
plt.figure()

ticks = ["Shooting efficiency"]

bpna = plt.boxplot(not_AS['eFG%'], positions=[0],widths=0.6)
bpas = plt.boxplot(a_stars['eFG%'], positions=[1],widths=0.6)

set_box_color(bpna, '#2C7BB6')
set_box_color(bpas, '#e6ac0e') 

plt.plot([], c='#2C7BB6', label='Not All Stars')
plt.plot([], c='#e6ac0e', label='All Stars')

plt.legend()

plt.xticks(range(0, len(ticks) * 2, 2), ticks)
plt.xlim(-2, 6)
plt.ylim(0., 1.5)
plt.savefig('Efg-box.png')

## Predicting PTS - points per game

### Further analysis of the attributes and their distributions and correlations

In [None]:
stats.probplot(df['PTS'], dist="norm", plot=pylab)
plt.title('PTS - Probability plot')
plt.savefig('PTS-qq.png')
pylab.show()

In [None]:
pts_shapiro = stats.shapiro(df.PTS)
tov_shapiro = stats.shapiro(df.TOV)
stl_shapiro = stats.shapiro(df.STL)
pf_shapiro = stats.shapiro(df.PF)
mp_shapiro = stats.shapiro(df.MP)
ft_shapiro = stats.shapiro(df.FT)
ast_shapiro = stats.shapiro(df.AST)

print("Shapiro PTS: Statistics: ", pts_shapiro.statistic , " P-values", format( pts_shapiro.pvalue, '.28f'))
print("Shapiro TOV: Statistics: ", tov_shapiro.statistic , " P-values", format( tov_shapiro.pvalue, '.28f'))
print("Shapiro STL: Statistics: ", stl_shapiro.statistic , " P-values", format( stl_shapiro.pvalue, '.28f'))
print("Shapiro PF: Statistics: ", pf_shapiro.statistic , " P-values", format( pf_shapiro.pvalue, '.28f'))
print("Shapiro MP: Statistics: ", mp_shapiro.statistic , " P-values", format( mp_shapiro.pvalue, '.28f'))
print("Shapiro FT: Statistics: ", ft_shapiro.statistic , " P-values", format( ft_shapiro.pvalue, '.28f'))
print("Shapiro AST: Statistics: ", ast_shapiro.statistic , " P-values", format( ast_shapiro.pvalue, '.28f'))


In [None]:
def powerpuffgirls(data, method):
    """Fit a standardising PowerTransformer of the given ``method`` on a copy of
    ``data`` and return the transformed values."""
    transformer = PowerTransformer(method=method, standardize=True)
    return transformer.fit_transform(data.copy())

def best_transform(column,data):
    """Return the version of ``data`` whose ``column`` looks most normal.

    Candidates are the untouched frame, a copy with ``column`` Yeo-Johnson
    transformed and, when the column has no values <= 0, a copy with it
    Box-Cox transformed. The candidate with the highest Shapiro-Wilk p-value
    for ``column`` is returned.

    Parameters
    ----------
    column : str
        Label of the column to transform and test.
    data : pandas.DataFrame
        Source frame; it is never modified (transforms are applied to copies).

    Returns
    -------
    pandas.DataFrame
        ``data`` itself or one of the transformed copies.
    """
    # (p-value, frame) pairs; the untransformed data comes first so it wins ties.
    candidates = [(stats.shapiro(data[column])[1], data)]

    data_y = data.copy()
    data_y[column] = powerpuffgirls(data_y[[column]], "yeo-johnson")
    candidates.append((stats.shapiro(data_y[column])[1], data_y))

    # Box-Cox is only attempted when the column has no zero or negative values.
    if not (data[column] <= 0).any():
        data_b = data.copy()
        data_b[column] = powerpuffgirls(data_b[[column]], "box-cox")
        candidates.append((stats.shapiro(data_b[column])[1], data_b))

    # Rank by p-value. Sorting the labelled entries by their key instead would
    # always select the Yeo-Johnson copy, whatever the test results were.
    best_pvalue, best_frame = max(candidates, key=lambda candidate: candidate[0])
    return best_frame


In [None]:
col_list = ['PTS', 'AST', 'TOV', 'FT', 'MP', 'STL','PF']

for col in col_list:
  print('------------')
  print(col)
  transformed = best_transform(col, df)
  t_shapiro = stats.shapiro(transformed[col])
  print("Shapiro ", col, " : Statistics: ", t_shapiro.statistic , " P-values", format( t_shapiro.pvalue, '.28f'))
  

  


#### Personal fouls

In [None]:
sns.regplot(x="PTS", y="PF", data=df)
plt.title('PF scatter distribution')
plt.savefig('PF-scat.png')
print("Pearson correlation: %.3f" % df['PTS'].corr(df['PF']))

In [None]:
stats.probplot(df['PF'], dist="norm", plot=pylab)
pylab.show()

#### Turnovers

In [None]:
sns.regplot(x="PTS", y="TOV", data=df)
print("Pearson correlation: %.3f" % df['PTS'].corr(df['TOV']))
plt.savefig('PTS-TOV-corr.png')

In [None]:
stats.probplot(df['TOV'], dist="norm", plot=pylab)
plt.title('TOV - Probability plot')
plt.savefig('TOV-qq.png')
pylab.show()

#### Minutes played

In [None]:
sns.regplot(x="PTS", y="MP", data=df)
print("Pearson correlation: %.3f" % df['PTS'].corr(df['MP']))
plt.savefig('PTS-MP-corr.png')

In [None]:
stats.probplot(df['MP'], dist="norm", plot=pylab)
plt.title('MP - Probability plot')
plt.savefig('MP-qq.png')
pylab.show()

#### Free throws made

In [None]:
sns.regplot(x="PTS", y="FT", data=df)
print("Pearson correlation: %.3f" % df['PTS'].corr(df['FT']))
plt.savefig('PTS-FT-corr.png')

In [None]:
stats.probplot(df['FT'], dist="norm", plot=pylab)
plt.title('FT - Probability plot')
plt.savefig('FT-qq.png')
pylab.show()

#### Steals

In [None]:
sns.regplot(x="PTS", y="STL", data=df)
plt.title('STL scatter distribution')
plt.savefig('STL-scat.png')
print("Pearson correlation: %.3f" % df['PTS'].corr(df['STL']))

In [None]:
stats.probplot(df['STL'], dist="norm", plot=pylab)
pylab.show()

#### Assists

In [None]:
sns.regplot(x="PTS", y="AST", data=df)
print("Pearson correlation: %.3f" % df['PTS'].corr(df['AST']))
plt.savefig('PTS-AST-corr.png')

In [None]:
stats.probplot(df['AST'], dist="norm", plot=pylab)
plt.title('AST - Probability plot')
plt.savefig('AST-qq.png')
pylab.show()

#### Choosing the predictors to be used

In [None]:
data = df[['PTS', 'AST', 'TOV', 'FT', 'MP']]
data.head(9)

In [None]:
axes = pd.plotting.scatter_matrix(data, alpha=0.2)
plt.tight_layout()
plt.savefig('scatter_matrix.png')

### Prediction made while using all attributes

In [None]:
def resid_df(y_test, predict):
  """Tabulate actual values, predictions and residuals side by side.

  Parameters
  ----------
  y_test : array-like
      True target values. When a pandas Series is given, its index is reused
      for the result.
  predict : array-like
      Predicted values, in the same order as ``y_test``.

  Returns
  -------
  pandas.DataFrame
      Columns 'Actual', 'Predicted', 'Residual (ABS)' (absolute error) and
      'Reses' (signed error, actual minus predicted).
  """
  # Compare by position: after a shuffled split the Series labels are not 0..n-1,
  # so label lookups such as y_test[i] would miss or pick the wrong row.
  actual = np.asarray(y_test, dtype=float).ravel()
  predicted = np.asarray(predict, dtype=float).ravel()
  residuals = actual - predicted
  row_index = y_test.index if isinstance(y_test, pd.Series) else None
  return pd.DataFrame(
      data={
          'Actual': actual,
          'Predicted': predicted,
          'Residual (ABS)': np.abs(residuals),
          'Reses': residuals,
      },
      index=row_index,
  )

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data[['AST', 'TOV', 'FT', 'MP']], data['PTS'], test_size=0.2, random_state=42)

In [None]:
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

In [None]:
regr.coef_

In [None]:
regr.intercept_

In [None]:
predictedPoints = regr.predict(X_test)

In [None]:
# This is the plain linear model, so store its error under a linear name instead
# of the polynomial one; the MSE is computed once and reused.
lin_reg_mse = mean_squared_error(y_test, predictedPoints)
lin_reg_rmse = np.sqrt(lin_reg_mse)
print('RMSE:',lin_reg_rmse)
print('MSE:',lin_reg_mse)

In [None]:
# distplot is deprecated; with hist=False it only drew a KDE, which kdeplot does directly.
ax1 = sns.kdeplot(x=y_test, color="r", label="Actual Value")
sns.kdeplot(x=predictedPoints, color="b", label="Fitted Values", ax=ax1)
ax1.legend()  # without a legend the two curve labels are never shown
plt.savefig('lin_points.png')

In [None]:
np.amin(y_test)

In [None]:
np.amin(predictedPoints)

In [None]:
(predictedPoints<0).sum()

In [None]:
df['PTS'].describe()

Cross-validation

In [None]:
X, y = data[['TOV', 'FT', 'MP', 'AST']], data['PTS']
regrCS = linear_model.LinearRegression()
scores = cross_val_score(regrCS, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores = [x * -1 for x in scores]

print(scores)
print('RMSE: %.3f (%.3f)' % (mean(scores), std(scores)))

#### Polynomial

In [None]:
X, y = data[['AST', 'TOV', 'FT', 'MP']], data['PTS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
poly_reg = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_reg.fit_transform(X_train)
pol_reg = linear_model.LinearRegression()
pol_reg.fit(X_poly, y_train)

In [None]:
pol_reg.coef_

In [None]:
pol_reg.intercept_

In [None]:
# Reuse the transformer fitted on X_train; fit_transform here would refit it on the test split.
poly_all_pred = pol_reg.predict(poly_reg.transform(X_test))

In [None]:
poly_reg_rmse = np.sqrt(mean_squared_error(y_test, poly_all_pred))
print('RMSE:',poly_reg_rmse)
print('MSE:',mean_squared_error(y_test, poly_all_pred) )

In [None]:
# distplot is deprecated; with hist=False it only drew a KDE, which kdeplot does directly.
ax1 = sns.kdeplot(x=y_test, color="r", label="Actual Value")
sns.kdeplot(x=poly_all_pred, color="b", label="Fitted Values", ax=ax1)
ax1.legend()  # without a legend the two curve labels are never shown
plt.savefig('poly_points.png')


Cross-validation

In [None]:
X, y = data[['TOV', 'FT', 'MP', 'AST']], data['PTS']
poly_reg = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_reg.fit_transform(X)
pol_reg_cs = linear_model.LinearRegression()
scores = cross_val_score(pol_reg_cs, X_poly, y, cv=5, scoring='neg_root_mean_squared_error')
scores = [x * -1 for x in scores]

print(scores)
print('RMSE: %.3f (%.3f)' % (mean(scores), std(scores)))

### Prediction after removing AST

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data[['TOV', 'FT', 'MP']], data['PTS'], test_size=0.2, random_state=42)

In [None]:
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

In [None]:
regr.coef_

In [None]:
regr.intercept_

In [None]:
predictedPoints = regr.predict(X_test)

In [None]:
# This is the plain linear model, so store its error under a linear name instead
# of the polynomial one; the MSE is computed once and reused.
lin_reg_mse = mean_squared_error(y_test, predictedPoints)
lin_reg_rmse = np.sqrt(lin_reg_mse)
print('RMSE:',lin_reg_rmse)
print('MSE:',lin_reg_mse)

In [None]:
# distplot is deprecated; with hist=False it only drew a KDE, which kdeplot does directly.
ax1 = sns.kdeplot(x=y_test, color="r", label="Actual Value")
sns.kdeplot(x=predictedPoints, color="b", label="Fitted Values", ax=ax1)
ax1.legend()  # without a legend the two curve labels are never shown
plt.savefig('lin_points_noassist.png')

In [None]:
np.amin(y_test)

In [None]:
np.amin(predictedPoints)

In [None]:
(predictedPoints<0).sum()

In [None]:
df['PTS'].describe()

Cross-validation

In [None]:
X, y = data[['TOV', 'FT', 'MP']], data['PTS']
regrCS = linear_model.LinearRegression()
scores = cross_val_score(regrCS, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores = [x * -1 for x in scores]

print(scores)
print('RMSE: %.3f (%.3f)' % (mean(scores), std(scores)))

#### Polynomial

In [None]:
X, y = data[['TOV', 'FT', 'MP']], data['PTS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
poly_reg = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_reg.fit_transform(X_train)
pol_reg = linear_model.LinearRegression()
pol_reg.fit(X_poly, y_train)

In [None]:
pol_reg.coef_

In [None]:
pol_reg.intercept_

In [None]:
# Reuse the transformer fitted on X_train; fit_transform here would refit it on the test split.
poly_all_pred = pol_reg.predict(poly_reg.transform(X_test))

In [None]:
poly_reg_rmse = np.sqrt(mean_squared_error(y_test, poly_all_pred))
print('RMSE:',poly_reg_rmse)
print('MSE:',mean_squared_error(y_test, poly_all_pred) )

In [None]:
# distplot is deprecated; with hist=False it only drew a KDE, which kdeplot does directly.
ax1 = sns.kdeplot(x=y_test, color="r", label="Actual Value")
sns.kdeplot(x=poly_all_pred, color="b", label="Fitted Values", ax=ax1)
ax1.legend()  # without a legend the two curve labels are never shown
plt.savefig('poly_points_noassist.png')


Cross-validation

In [None]:
X, y = data[['TOV', 'FT', 'MP']], data['PTS']
poly_reg = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_reg.fit_transform(X)
pol_reg_cs = linear_model.LinearRegression()
scores = cross_val_score(pol_reg_cs, X_poly, y, cv=5, scoring='neg_root_mean_squared_error')
scores = [x * -1 for x in scores]

print(scores)
print('RMSE: %.3f (%.3f)' % (mean(scores), std(scores)))

#### Testing legends

In [None]:
X, y = data[['TOV', 'FT', 'MP']], data['PTS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
poly_reg = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_reg.fit_transform(X_train)
pol_reg = linear_model.LinearRegression()
pol_reg.fit(X_poly, y_train)

In [None]:
# Five hand-entered stat lines scored with the polynomial model fitted in the previous cell.
legends = {'TOV':[3.1, 3.3, 3.1, 3.3, 2.6],'FT':[8.8, 4.6, 8.7, 7.6, 7.7], 'MP':[40.4, 34.2, 41.0, 42.5, 39.4], 'PTS':[35.0, 30.1, 35.4, 31.4, 32.1]}
legends_df = pd.DataFrame(data=legends)
X_leg = legends_df[['TOV', 'FT', 'MP']]
Y_leg = legends_df['PTS']
# Reuse the transformer fitted on the training split; fit_transform would refit it on these rows.
legends_pred = pol_reg.predict(poly_reg.transform(X_leg))

leg_mse = mean_squared_error(Y_leg, legends_pred)
leg_rmse = np.sqrt(leg_mse)
print('RMSE:',leg_rmse)
print('MSE:',leg_mse)

print('True:', Y_leg)
print('Predictions:', legends_pred)

In [None]:
legends_df.describe()

### Single attribute predictions

#### MP

In [None]:
X = data[['MP']]
Y = data['PTS']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

In [None]:
plt.scatter(X_train, y_train, color = "red")
plt.plot(X_train, regr.predict(X_train), color = "green")
plt.title("Points Based on Minutes Played (Linear - Train)")
plt.xlabel("Minutes Played")
plt.ylabel("Points")
plt.show()

In [None]:
predictedPoints = regr.predict(X_test)

In [None]:
plt.scatter(X_test, y_test, color = "red")
plt.plot(X_test, regr.predict(X_test), color = "green")
plt.title("Points Based on Minutes Played (Linear - Test)")
plt.xlabel("Minutes Played")
plt.ylabel("Points")
plt.savefig('minutes_lin_points.png')
plt.show()

In [None]:
lin_min_rmse = np.sqrt(mean_squared_error(y_test, predictedPoints))
print('RMSE:',lin_min_rmse)
print('MSE:',mean_squared_error(y_test, predictedPoints) )

In [None]:
# distplot is deprecated; with hist=False it only drew a KDE, which kdeplot does directly.
ax1 = sns.kdeplot(x=y_test, color="r", label="Actual Value")
sns.kdeplot(x=predictedPoints, color="b", label="Fitted Values", ax=ax1)
ax1.legend()  # without a legend the two curve labels are never shown

In [None]:
poly_reg = PolynomialFeatures(degree=3)
X_poly = poly_reg.fit_transform(X_train)
pol_reg = linear_model.LinearRegression()
pol_reg.fit(X_poly, y_train)

In [None]:
# Reuse the transformer fitted on X_train; fit_transform here would refit it on the test split.
poly_min_pred = pol_reg.predict(poly_reg.transform(X_test))

In [None]:
poly_min_rmse = np.sqrt(mean_squared_error(y_test, poly_min_pred))
print('RMSE:',poly_min_rmse)
print('MSE:',mean_squared_error(y_test, poly_min_pred) )

In [None]:
X_test.head()

In [None]:
plt.figure(figsize=(10,6))
plt.scatter(X_test, y_test)
plt.scatter(X_test, poly_min_pred, c='red')
plt.title("Points Based on Minutes Played (Polynomial - Test)")
plt.xlabel("Minutes Played")
plt.ylabel("Points")
plt.savefig('minutes_poly_points.png')
plt.show()

In [None]:
# distplot is deprecated; with hist=False it only drew a KDE, which kdeplot does directly.
ax1 = sns.kdeplot(x=y_test, color="r", label="Actual Value")
sns.kdeplot(x=poly_min_pred, color="b", label="Fitted Values", ax=ax1)
ax1.legend()  # without a legend the two curve labels are never shown
plt.savefig('minutes_poly_points_acc.png')

#### AST

In [None]:
X = data[['AST']]
Y = data['PTS']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

In [None]:
plt.scatter(X_train, y_train, color = "red")
plt.plot(X_train, regr.predict(X_train), color = "green")
plt.title("Points Based on Assists (Linear - Train)")
plt.xlabel("Assists")
plt.ylabel("Points")
plt.show()

In [None]:
predictedPoints = regr.predict(X_test)

In [None]:
plt.scatter(X_test, y_test, color = "red")
plt.plot(X_test, regr.predict(X_test), color = "green")
plt.title("Points Based on Assists (Linear - Test)")
plt.xlabel("Assists")
plt.ylabel("Points")
plt.savefig('ast_lin_points.png')
plt.show()

In [None]:
lin_ast_rmse = np.sqrt(mean_squared_error(y_test, predictedPoints))
print('RMSE:',lin_ast_rmse)
print('MSE:',mean_squared_error(y_test, predictedPoints) )

In [None]:
# distplot is deprecated; with hist=False it only drew a KDE, which kdeplot does directly.
ax1 = sns.kdeplot(x=y_test, color="r", label="Actual Value")
sns.kdeplot(x=predictedPoints, color="b", label="Fitted Values", ax=ax1)
ax1.legend()  # without a legend the two curve labels are never shown

In [None]:
poly_reg = PolynomialFeatures(degree=3)
X_poly = poly_reg.fit_transform(X_train)
pol_reg = linear_model.LinearRegression()
pol_reg.fit(X_poly, y_train)

In [None]:
# Reuse the transformer fitted on X_train; fit_transform here would refit it on the test split.
poly_ast_pred = pol_reg.predict(poly_reg.transform(X_test))

In [None]:
poly_ast_rmse = np.sqrt(mean_squared_error(y_test, poly_ast_pred))
print('RMSE:',poly_ast_rmse)
print('MSE:',mean_squared_error(y_test, poly_ast_pred) )

In [None]:
plt.figure(figsize=(10,6))
plt.scatter(X_test, y_test)
plt.scatter(X_test, poly_ast_pred, c='red')
plt.show()

In [None]:
# distplot is deprecated; with hist=False it only drew a KDE, which kdeplot does directly.
ax1 = sns.kdeplot(x=y_test, color="r", label="Actual Value")
sns.kdeplot(x=poly_ast_pred, color="b", label="Fitted Values", ax=ax1)
ax1.legend()  # without a legend the two curve labels are never shown

#### TOV

In [None]:
X = data[['TOV']]
Y = data['PTS']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

In [None]:
plt.scatter(X_train, y_train, color = "red")
plt.plot(X_train, regr.predict(X_train), color = "green")
plt.title("Points Based on Turnovers (Linear - Train)")
plt.xlabel("Turnovers")
plt.ylabel("Points")
plt.show()

In [None]:
predictedPoints = regr.predict(X_test)

In [None]:
plt.scatter(X_test, y_test, color = "red")
plt.plot(X_test, regr.predict(X_test), color = "green")
plt.title("Points Based on Turnovers (Linear - Test)")
plt.xlabel("Turnovers")
plt.ylabel("Points")
plt.show()

In [None]:
# Turnover model: use its own name so the assists result in lin_ast_rmse is not overwritten.
lin_tov_mse = mean_squared_error(y_test, predictedPoints)
lin_tov_rmse = np.sqrt(lin_tov_mse)
print('RMSE:',lin_tov_rmse)
print('MSE:',lin_tov_mse)

In [None]:
# distplot is deprecated; with hist=False it only drew a KDE, which kdeplot does directly.
ax1 = sns.kdeplot(x=y_test, color="r", label="Actual Value")
sns.kdeplot(x=predictedPoints, color="b", label="Fitted Values", ax=ax1)
ax1.legend()  # without a legend the two curve labels are never shown

In [None]:
poly_reg = PolynomialFeatures(degree=6)
X_poly = poly_reg.fit_transform(X_train)
pol_reg = linear_model.LinearRegression()
pol_reg.fit(X_poly, y_train)

In [None]:
# Reuse the transformer fitted on X_train; fit_transform here would refit it on the test split.
poly_tov_pred = pol_reg.predict(poly_reg.transform(X_test))

In [None]:
poly_tov_rmse = np.sqrt(mean_squared_error(y_test, poly_tov_pred))
print('RMSE:',poly_tov_rmse)
print('MSE:',mean_squared_error(y_test, poly_tov_pred) )

In [None]:
plt.figure(figsize=(10,6))
plt.scatter(X_test, y_test)
plt.scatter(X_test, poly_tov_pred, c='red')
plt.title("Points Based on Turnovers (Polynomial - Test)")
plt.xlabel("Turnovers")
plt.ylabel("Points")
plt.savefig('tov_poly_points.png')
plt.show()

In [None]:
# distplot is deprecated; with hist=False it only drew a KDE, which kdeplot does directly.
ax1 = sns.kdeplot(x=y_test, color="r", label="Actual Value")
sns.kdeplot(x=poly_tov_pred, color="b", label="Fitted Values", ax=ax1)
ax1.legend()  # without a legend the two curve labels are never shown

#### FT

In [None]:
X = data[['FT']]
Y = data['PTS']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

In [None]:
plt.scatter(X_train, y_train, color = "red")
plt.plot(X_train, regr.predict(X_train), color = "green")
plt.title("Points Based on Free throws (Linear - Train)")
plt.xlabel("Free throws")
plt.ylabel("Points")
plt.show()

In [None]:
predictedPoints = regr.predict(X_test)

In [None]:
plt.scatter(X_test, y_test, color = "red")
plt.plot(X_test, regr.predict(X_test), color = "green")
plt.title("Points Based on Free throws (Linear - Test)")
plt.xlabel("Free throws")
plt.ylabel("Points")
plt.show()

In [None]:
# Free-throw model: use its own name so the assists result in lin_ast_rmse is not overwritten.
lin_ft_mse = mean_squared_error(y_test, predictedPoints)
lin_ft_rmse = np.sqrt(lin_ft_mse)
print('RMSE:',lin_ft_rmse)
print('MSE:',lin_ft_mse)

In [None]:
# distplot is deprecated; with hist=False it only drew a KDE, which kdeplot does directly.
ax1 = sns.kdeplot(x=y_test, color="r", label="Actual Value")
sns.kdeplot(x=predictedPoints, color="b", label="Fitted Values", ax=ax1)
ax1.legend()  # without a legend the two curve labels are never shown

In [None]:
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X_train)
pol_reg = linear_model.LinearRegression()
pol_reg.fit(X_poly, y_train)

In [None]:
# Reuse the transformer fitted on X_train; fit_transform here would refit it on the test split.
poly_ft_pred = pol_reg.predict(poly_reg.transform(X_test))

In [None]:
poly_ft_rmse = np.sqrt(mean_squared_error(y_test, poly_ft_pred))
print('RMSE:',poly_ft_rmse)
print('MSE:',mean_squared_error(y_test, poly_ft_pred) )

In [None]:
plt.figure(figsize=(10,6))
plt.scatter(X_test, y_test)
plt.scatter(X_test, poly_ft_pred, c='red')
plt.title("Points Based on Free throws (Polynomial - Test)")
plt.xlabel("Free throws")
plt.ylabel("Points")
plt.savefig('ft_poly_points.png')
plt.show()

In [None]:
# distplot is deprecated; with hist=False it only drew a KDE, which kdeplot does directly.
ax1 = sns.kdeplot(x=y_test, color="r", label="Actual Value")
sns.kdeplot(x=poly_ft_pred, color="b", label="Fitted Values", ax=ax1)
ax1.legend()  # without a legend the two curve labels are never shown

## ALL STAR

#### Logistic

In [None]:
def class_prediction(predictions_prob, predictions, y_test):
  """Build a table comparing predicted class probabilities with the true class.

  ``predictions_prob`` is iterated row by row; element 0 of each row goes to
  column 'N_ALL' and element 1 to column 'ALL'. ``predictions`` and ``y_test``
  become the 'Predicted class' and 'Actual class' columns.
  """
  pairs = [(row[0], row[1]) for row in predictions_prob]
  table = {
      'N_ALL': [pair[0] for pair in pairs],
      'ALL': [pair[1] for pair in pairs],
      'Predicted class': predictions,
      'Actual class': y_test,
  }
  return pd.DataFrame(data=table)

In [None]:
def print_confusion_matrix(predictions, y_test, label, model):
  """Draw, save and show the confusion-matrix heatmap for one classifier run.

  Parameters
  ----------
  predictions : array-like
      Predicted class labels.
  y_test : array-like
      True class labels.
  label : str
      Feature description; used in the plot title and in the file name.
  model : str
      Model name; used in the file name.

  The figure is written to 'confusion_<label>_<model>.png' in the working directory.
  """
  cf_matrix = metrics.confusion_matrix(y_test, predictions)
  # fmt='d' prints whole counts; the default '.2g' renders counts of 100 or more as e.g. '1e+02'.
  ax = sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues')
  title = 'ALL star based on ' + label + ' Confusion Matrix\n\n'
  file_name = 'confusion_'+ label+'_'+ model+'.png'
  ax.set_title(title)
  ax.set_xlabel('\nPredicted Values')
  ax.set_ylabel('Actual Values ')

  # Tick labels must follow the row/column order of the matrix; this assumes the
  # classes are encoded 0 (not All-Star) and 1 (All-Star) and both are present.
  ax.xaxis.set_ticklabels(['Not All Star','All Star'])
  ax.yaxis.set_ticklabels(['Not All Star','All Star'])
  plt.savefig(file_name)
  # Display the confusion-matrix figure.
  plt.show()

In [None]:
target_names = ['N_ALL', 'ALL']

Points

In [None]:
X = df[['PTS']]
Y = df['ALLSTAR']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
logisticRegrPTS = LogisticRegression()
logisticRegrPTS.fit(X_train, y_train)

In [None]:
predictions_prob = logisticRegrPTS.predict_proba(X_test)

In [None]:
predictions = logisticRegrPTS.predict(X_test)

In [None]:
print(classification_report(y_test, predictions, target_names=target_names))

In [None]:
class_df = class_prediction(predictions_prob, predictions, y_test)
pd.options.display.float_format = '{:,.3f}'.format
pd.set_option('display.max_rows', None)
class_df

In [None]:
pd.set_option('display.max_rows', None)

In [None]:
print_confusion_matrix(predictions, y_test, 'Points', 'logistic')

In [None]:
y_pred_proba = logisticRegrPTS.predict_proba(X_test)[::,1]
fprPTS, tprPTS, _ = metrics.roc_curve(y_test,  y_pred_proba)
aucPTS = metrics.roc_auc_score(y_test, y_pred_proba)

#create ROC curve
plt.plot(fprPTS,tprPTS,label="AUC="+str(aucPTS))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.show()

Cross-validation

In [None]:
X = df[['PTS']]
Y = df['ALLSTAR']

cross_logisticRegrPTS = LogisticRegression()
scores = cross_val_score(cross_logisticRegrPTS, X, Y)
print(scores)
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Games

In [None]:
X = df[['G']]
Y = df['ALLSTAR']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
logisticRegrGS = LogisticRegression()
logisticRegrGS.fit(X_train, y_train)

In [None]:
predictions_prob = logisticRegrGS.predict_proba(X_test)

In [None]:
predictions = logisticRegrGS.predict(X_test)

In [None]:
print(classification_report(y_test, predictions, target_names=target_names))

In [None]:
class_df = class_prediction(predictions_prob, predictions, y_test)
pd.options.display.float_format = '{:,.3f}'.format
pd.set_option('display.max_rows', None)
class_df

In [None]:
pd.set_option('display.max_rows', None)

In [None]:
print_confusion_matrix(predictions, y_test, 'Games_Played', 'logistic')

In [None]:
y_pred_proba = logisticRegrGS.predict_proba(X_test)[::,1]
fprGS, tprGS, _ = metrics.roc_curve(y_test,  y_pred_proba)
aucGS = metrics.roc_auc_score(y_test, y_pred_proba)

#create ROC curve
plt.plot(fprGS,tprGS,label="AUC="+str(aucGS))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.show()

Minutes

In [None]:
X = df[['MP']]
Y = df['ALLSTAR']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
logisticRegrMP = LogisticRegression()
logisticRegrMP.fit(X_train, y_train)

In [None]:
predictions_prob = logisticRegrMP.predict_proba(X_test)

In [None]:
predictions = logisticRegrMP.predict(X_test)

In [None]:
print(classification_report(y_test, predictions, target_names=target_names))

In [None]:
class_df = class_prediction(predictions_prob, predictions, y_test)
pd.options.display.float_format = '{:,.3f}'.format
pd.set_option('display.max_rows', None)
class_df

In [None]:
pd.set_option('display.max_rows', None)

In [None]:
print_confusion_matrix(predictions, y_test, 'Minutes_Played', 'logistic')

In [None]:
y_pred_proba = logisticRegrMP.predict_proba(X_test)[::,1]
fprMP, tprMP, _ = metrics.roc_curve(y_test,  y_pred_proba)
aucMP = metrics.roc_auc_score(y_test, y_pred_proba)

#create ROC curve
plt.plot(fprMP,tprMP,label="AUC="+str(aucMP))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.show()

All

In [None]:
X = df[['PTS', 'G', 'MP']]
Y = df['ALLSTAR']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
logisticRegrALL = LogisticRegression()
logisticRegrALL.fit(X_train, y_train)

In [None]:
predictions_prob = logisticRegrALL.predict_proba(X_test)

In [None]:
predictions = logisticRegrALL.predict(X_test)

In [None]:
print(classification_report(y_test, predictions, target_names=target_names))

In [None]:
class_df = class_prediction(predictions_prob, predictions, y_test)
pd.options.display.float_format = '{:,.3f}'.format
pd.set_option('display.max_rows', None)
class_df

In [None]:
pd.set_option('display.max_rows', None)

In [None]:
print_confusion_matrix(predictions, y_test, 'All', 'logistic')

In [None]:
# Use the probability column for the second class (ALLSTAR == 1) as the score,
# then compute the ROC curve points and the area under the curve.
y_pred_proba = logisticRegrALL.predict_proba(X_test)[:, 1]
fprALL, tprALL, _ = metrics.roc_curve(y_test, y_pred_proba)
aucALL = metrics.roc_auc_score(y_test, y_pred_proba)

# ROC curve with the AUC value in the legend
plt.plot(fprALL, tprALL, label="AUC=" + str(aucALL))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

Cross-validation

In [None]:

# Cross-validate on the full data set (no hold-out split) with the three features.
X, Y = df[['PTS', 'G', 'MP']], df['ALLSTAR']

# cross_val_score is called with its default folds and scoring.
cross_logisticRegrALL = LogisticRegression()
scores = cross_val_score(cross_logisticRegrALL, X, Y)
# Per-fold scores, then their mean and standard deviation.
print(scores)
print('Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))

PTS + MP

In [None]:
# Features: the PTS and MP columns; target: the ALLSTAR flag.
X, Y = df[['PTS', 'MP']], df['ALLSTAR']
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# Fit a default-configured logistic regression on the PTS + MP training split.
# The bare name on the last line keeps the fitted estimator as the cell output.
logisticRegrPMP = LogisticRegression().fit(X_train, y_train)
logisticRegrPMP

In [None]:
# Class-membership probabilities for the test rows (one column per class).
predictions_prob = logisticRegrPMP.predict_proba(X_test)

In [None]:
# Hard class labels predicted for the test rows.
predictions = logisticRegrPMP.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the test split, labelled via target_names.
print(classification_report(y_test, predictions, target_names=target_names))

In [None]:
# class_prediction is defined elsewhere in the notebook; it receives the
# probabilities, the predicted labels and the true labels (presumably returning
# a DataFrame — confirm against its definition).
class_df = class_prediction(predictions_prob, predictions, y_test)
# Show every row, with floats rendered to three decimals.
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', '{:,.3f}'.format)
class_df

In [None]:
# Repeats the option already set in the preceding cell (None = no row limit).
pd.set_option('display.max_rows', None)

In [None]:
# print_confusion_matrix is defined elsewhere in the notebook; the two string
# arguments look like a feature-set label and a model tag — TODO confirm.
print_confusion_matrix(predictions, y_test, 'Points_Minutes', 'logistic')

In [None]:
# Use the probability column for the second class (ALLSTAR == 1) as the score,
# then compute the ROC curve points and the area under the curve.
y_pred_proba = logisticRegrPMP.predict_proba(X_test)[:, 1]
fprPMP, tprPMP, _ = metrics.roc_curve(y_test, y_pred_proba)
aucPMP = metrics.roc_auc_score(y_test, y_pred_proba)

# ROC curve with the AUC value in the legend
plt.plot(fprPMP, tprPMP, label="AUC=" + str(aucPMP))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

All logistic curves

In [None]:
# Overlay the ROC curves of the five logistic-regression models in one figure.
plt.plot(fprPMP, tprPMP, label="AUC_PMP=" + str(aucPMP))
plt.plot(fprPTS, tprPTS, label="AUC_PTS=" + str(aucPTS))
plt.plot(fprGS, tprGS, label="AUC_Games=" + str(aucGS))
plt.plot(fprMP, tprMP, label="AUC_MP=" + str(aucMP))
plt.plot(fprALL, tprALL, label="AUC_ALL=" + str(aucALL))
plt.title('ROC curves logistic regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
# Save to disk before showing the figure.
plt.savefig('ROC_logistic.png')
plt.show()

#### SVM

In [None]:
# SVC kernel names iterated over by all_kernels().
kernels = ['linear', 'sigmoid', 'rbf']
# Display names passed to classification_report as target_names
# (first entry labels ALLSTAR == 0, second labels ALLSTAR == 1).
target_names = ['N_ALL', 'ALL']

In [None]:
def all_kernels(X_train, y_train, X_test, y_test, kernel_names=None):
    """Fit one SVC per kernel and print its classification report on the test split.

    Note the argument order (X_train, y_train, X_test, y_test): it differs from
    the order of the tuple returned by train_test_split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``SVC.fit``.
    X_test, y_test
        Held-out features to predict on and the true labels to compare against.
    kernel_names : iterable of str, optional
        SVC kernel names to try. When omitted, the notebook-level ``kernels``
        list is used, which is the original behaviour.

    Returns
    -------
    None
        Results are only printed; the fitted classifiers are discarded.
    """
    if kernel_names is None:
        kernel_names = kernels
    for krnl in kernel_names:
        svcclassifier = SVC(kernel=krnl)
        svcclassifier.fit(X_train, y_train)
        predictions = svcclassifier.predict(X_test)
        print('Kernel: ', krnl)
        # target_names is the notebook-level list of class display names.
        print(classification_report(y_test, predictions, target_names=target_names))

Points

In [None]:
# Single feature: the PTS column; target: the ALLSTAR flag.
X, Y = df[['PTS']], df['ALLSTAR']
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# Print a classification report for each kernel in `kernels` on the current split.
all_kernels(X_train, y_train, X_test, y_test)

Cross-validation

In [None]:
# Cross-validate on the full data set (no hold-out split) with the PTS feature.
X, Y = df[['PTS']], df['ALLSTAR']

# One estimator per kernel, each scored with cross_val_score's defaults.
cross_SVM_RBF_PTS = SVC(kernel='rbf')
cross_SVM_LIN_PTS = SVC(kernel='linear')
RBF_scores = cross_val_score(cross_SVM_RBF_PTS, X, Y)
lin_scores = cross_val_score(cross_SVM_LIN_PTS, X, Y)

# Per-fold scores followed by mean (std), one section per kernel.
print('--------------', 'RBF', RBF_scores,
      'Accuracy: %.3f (%.3f)' % (RBF_scores.mean(), RBF_scores.std()),
      sep='\n')
print('--------------', 'Linear', lin_scores,
      'Accuracy: %.3f (%.3f)' % (lin_scores.mean(), lin_scores.std()),
      sep='\n')

Best kernel: RBF

In [None]:
# RBF-kernel SVM with probability estimates enabled so predict_proba is available.
# probability=True calibrates on internally shuffled data, so the seed is pinned
# (same value as the train/test split) to keep the probabilities, and the ROC/AUC
# derived from them, repeatable across runs.
svcclassifierPTS = SVC(kernel='rbf', probability=True, random_state=42)
svcclassifierPTS.fit(X_train, y_train)

In [None]:
# Class-membership probabilities for the test rows (one column per class).
predictions_prob = svcclassifierPTS.predict_proba(X_test)

In [None]:
# Hard class labels for the test rows. Per scikit-learn's SVC docs these can
# occasionally disagree with the argmax of predict_proba.
predictions = svcclassifierPTS.predict(X_test)

In [None]:
# class_prediction is defined elsewhere in the notebook; it receives the
# probabilities, the predicted labels and the true labels (presumably returning
# a DataFrame — confirm against its definition).
class_df = class_prediction(predictions_prob, predictions, y_test)
# Show every row, with floats rendered to three decimals.
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', '{:,.3f}'.format)
class_df

In [None]:
# Repeats the option already set in the preceding cell (None = no row limit).
pd.set_option('display.max_rows', None)

In [None]:
# print_confusion_matrix is defined elsewhere in the notebook; the two string
# arguments look like a feature-set label and a model tag — TODO confirm.
print_confusion_matrix(predictions, y_test, 'Points', 'svm')

In [None]:
# Column 1 of predictions_prob (the ALLSTAR == 1 class) is the ranking score
# for both the ROC curve points and the area under the curve.
fprPTS_SVM, tprPTS_SVM, _ = metrics.roc_curve(y_test, predictions_prob[:, 1])
aucPTS_SVM = metrics.roc_auc_score(y_test, predictions_prob[:, 1])

# ROC curve with the AUC value in the legend
plt.plot(fprPTS_SVM, tprPTS_SVM, label="AUC=" + str(aucPTS_SVM))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

Games

In [None]:
# Single feature: the G column; target: the ALLSTAR flag.
X, Y = df[['G']], df['ALLSTAR']
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# Print a classification report for each kernel in `kernels` on the current split.
all_kernels(X_train, y_train, X_test, y_test)

Best kernel: RBF

In [None]:
# RBF-kernel SVM with probability estimates enabled so predict_proba is available.
# probability=True calibrates on internally shuffled data, so the seed is pinned
# (same value as the train/test split) to keep the probabilities, and the ROC/AUC
# derived from them, repeatable across runs.
svcclassifierGS = SVC(kernel='rbf', probability=True, random_state=42)
svcclassifierGS.fit(X_train, y_train)

In [None]:
# Class-membership probabilities for the test rows (one column per class).
predictions_prob = svcclassifierGS.predict_proba(X_test)

In [None]:
# Hard class labels for the test rows. Per scikit-learn's SVC docs these can
# occasionally disagree with the argmax of predict_proba.
predictions = svcclassifierGS.predict(X_test)

In [None]:
# class_prediction is defined elsewhere in the notebook; it receives the
# probabilities, the predicted labels and the true labels (presumably returning
# a DataFrame — confirm against its definition).
class_df = class_prediction(predictions_prob, predictions, y_test)
# Show every row, with floats rendered to three decimals.
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', '{:,.3f}'.format)
class_df

In [None]:
# Repeats the option already set in the preceding cell (None = no row limit).
pd.set_option('display.max_rows', None)

In [None]:
# print_confusion_matrix is defined elsewhere in the notebook; the two string
# arguments look like a feature-set label and a model tag — TODO confirm.
print_confusion_matrix(predictions, y_test, 'Games_Played', 'svm')

In [None]:
# Column 1 of predictions_prob (the ALLSTAR == 1 class) is the ranking score
# for both the ROC curve points and the area under the curve.
fprGS_SVM, tprGS_SVM, _ = metrics.roc_curve(y_test, predictions_prob[:, 1])
aucGS_SVM = metrics.roc_auc_score(y_test, predictions_prob[:, 1])

# ROC curve with the AUC value in the legend
plt.plot(fprGS_SVM, tprGS_SVM, label="AUC=" + str(aucGS_SVM))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

Minutes

In [None]:
# Single feature: the MP column; target: the ALLSTAR flag.
X, Y = df[['MP']], df['ALLSTAR']
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# Print a classification report for each kernel in `kernels` on the current split.
all_kernels(X_train, y_train, X_test, y_test)

Best kernel: RBF

In [None]:
# RBF-kernel SVM with probability estimates enabled so predict_proba is available.
# probability=True calibrates on internally shuffled data, so the seed is pinned
# (same value as the train/test split) to keep the probabilities, and the ROC/AUC
# derived from them, repeatable across runs.
svcclassifierMP = SVC(kernel='rbf', probability=True, random_state=42)
svcclassifierMP.fit(X_train, y_train)

In [None]:
# Class-membership probabilities for the test rows (one column per class).
predictions_prob = svcclassifierMP.predict_proba(X_test)

In [None]:
# Hard class labels for the test rows. Per scikit-learn's SVC docs these can
# occasionally disagree with the argmax of predict_proba.
predictions = svcclassifierMP.predict(X_test)

In [None]:
# class_prediction is defined elsewhere in the notebook; it receives the
# probabilities, the predicted labels and the true labels (presumably returning
# a DataFrame — confirm against its definition).
class_df = class_prediction(predictions_prob, predictions, y_test)
# Show every row, with floats rendered to three decimals.
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', '{:,.3f}'.format)
class_df

In [None]:
# Repeats the option already set in the preceding cell (None = no row limit).
pd.set_option('display.max_rows', None)

In [None]:
# print_confusion_matrix is defined elsewhere in the notebook; the two string
# arguments look like a feature-set label and a model tag — TODO confirm.
print_confusion_matrix(predictions, y_test, 'Minutes_Played', 'svm')

In [None]:
# Column 1 of predictions_prob (the ALLSTAR == 1 class) is the ranking score
# for both the ROC curve points and the area under the curve.
fprMP_SVM, tprMP_SVM, _ = metrics.roc_curve(y_test, predictions_prob[:, 1])
aucMP_SVM = metrics.roc_auc_score(y_test, predictions_prob[:, 1])

# ROC curve with the AUC value in the legend
plt.plot(fprMP_SVM, tprMP_SVM, label="AUC=" + str(aucMP_SVM))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

ALL

In [None]:
# Features: the PTS, G and MP columns; target: the ALLSTAR flag.
X, Y = df[['PTS', 'G', 'MP']], df['ALLSTAR']
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# Print a classification report for each kernel in `kernels` on the current split.
all_kernels(X_train, y_train, X_test, y_test)

Crossvalidation

In [None]:
# Cross-validate on the full data set (no hold-out split) with the three features.
X, Y = df[['PTS', 'G', 'MP']], df['ALLSTAR']

# One estimator per kernel, each scored with cross_val_score's defaults.
cross_SVM_RBF_ALL = SVC(kernel='rbf')
cross_SVM_LIN_ALL = SVC(kernel='linear')
RBF_scores = cross_val_score(cross_SVM_RBF_ALL, X, Y)
lin_scores = cross_val_score(cross_SVM_LIN_ALL, X, Y)

# Per-fold scores followed by mean (std), one section per kernel.
print('--------------', 'RBF', RBF_scores,
      'Accuracy: %.3f (%.3f)' % (RBF_scores.mean(), RBF_scores.std()),
      sep='\n')
print('--------------', 'Linear', lin_scores,
      'Accuracy: %.3f (%.3f)' % (lin_scores.mean(), lin_scores.std()),
      sep='\n')

Best kernel: linear

In [None]:
# Linear-kernel SVM with probability estimates enabled so predict_proba is available.
# probability=True calibrates on internally shuffled data, so the seed is pinned
# (same value as the train/test split) to keep the probabilities, and the ROC/AUC
# derived from them, repeatable across runs.
svcclassifierALL = SVC(kernel='linear', probability=True, random_state=42)
svcclassifierALL.fit(X_train, y_train)

In [None]:
# Class-membership probabilities for the test rows (one column per class).
predictions_prob = svcclassifierALL.predict_proba(X_test)

In [None]:
# Hard class labels for the test rows. Per scikit-learn's SVC docs these can
# occasionally disagree with the argmax of predict_proba.
predictions = svcclassifierALL.predict(X_test)

In [None]:
# class_prediction is defined elsewhere in the notebook; it receives the
# probabilities, the predicted labels and the true labels (presumably returning
# a DataFrame — confirm against its definition).
class_df = class_prediction(predictions_prob, predictions, y_test)
# Show every row, with floats rendered to three decimals.
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', '{:,.3f}'.format)
class_df

In [None]:
# Repeats the option already set in the preceding cell (None = no row limit).
pd.set_option('display.max_rows', None)

In [None]:
# print_confusion_matrix is defined elsewhere in the notebook; the two string
# arguments look like a feature-set label and a model tag — TODO confirm.
print_confusion_matrix(predictions, y_test, 'All', 'svm')

In [None]:
# Column 1 of predictions_prob (the ALLSTAR == 1 class) is the ranking score
# for both the ROC curve points and the area under the curve.
fprALL_SVM, tprALL_SVM, _ = metrics.roc_curve(y_test, predictions_prob[:, 1])
aucALL_SVM = metrics.roc_auc_score(y_test, predictions_prob[:, 1])

# ROC curve with the AUC value in the legend
plt.plot(fprALL_SVM, tprALL_SVM, label="AUC=" + str(aucALL_SVM))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

PTS + Minutes

In [None]:
# Features: the PTS and MP columns; target: the ALLSTAR flag.
X, Y = df[['PTS', 'MP']], df['ALLSTAR']
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# Print a classification report for each kernel in `kernels` on the current split.
all_kernels(X_train, y_train, X_test, y_test)

Crossvalidation

In [None]:
# Cross-validate on the full data set (no hold-out split) with PTS and MP.
X, Y = df[['PTS', 'MP']], df['ALLSTAR']

# One estimator per kernel, each scored with cross_val_score's defaults.
cross_SVM_RBF_PMP = SVC(kernel='rbf')
cross_SVM_LIN_PMP = SVC(kernel='linear')
RBF_scores = cross_val_score(cross_SVM_RBF_PMP, X, Y)
lin_scores = cross_val_score(cross_SVM_LIN_PMP, X, Y)

# Per-fold scores followed by mean (std), one section per kernel.
print('--------------', 'RBF', RBF_scores,
      'Accuracy: %.3f (%.3f)' % (RBF_scores.mean(), RBF_scores.std()),
      sep='\n')
print('--------------', 'Linear', lin_scores,
      'Accuracy: %.3f (%.3f)' % (lin_scores.mean(), lin_scores.std()),
      sep='\n')

Best kernel: RBF

In [None]:
# RBF-kernel SVM with probability estimates enabled so predict_proba is available.
# probability=True calibrates on internally shuffled data, so the seed is pinned
# (same value as the train/test split) to keep the probabilities, and the ROC/AUC
# derived from them, repeatable across runs.
svcclassifierPMP = SVC(kernel='rbf', probability=True, random_state=42)
svcclassifierPMP.fit(X_train, y_train)

In [None]:
# Class-membership probabilities for the test rows (one column per class).
predictions_prob = svcclassifierPMP.predict_proba(X_test)

In [None]:
# Hard class labels for the test rows. Per scikit-learn's SVC docs these can
# occasionally disagree with the argmax of predict_proba.
predictions = svcclassifierPMP.predict(X_test)

In [None]:
# class_prediction is defined elsewhere in the notebook; it receives the
# probabilities, the predicted labels and the true labels (presumably returning
# a DataFrame — confirm against its definition).
class_df = class_prediction(predictions_prob, predictions, y_test)
# Show every row, with floats rendered to three decimals.
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', '{:,.3f}'.format)
class_df

In [None]:
# Repeats the option already set in the preceding cell (None = no row limit).
pd.set_option('display.max_rows', None)

In [None]:
# print_confusion_matrix is defined elsewhere in the notebook; the two string
# arguments look like a feature-set label and a model tag — TODO confirm.
print_confusion_matrix(predictions, y_test, 'PMP', 'svm')

In [None]:
# Column 1 of predictions_prob (the ALLSTAR == 1 class) is the ranking score
# for both the ROC curve points and the area under the curve.
fprPMP_SVM, tprPMP_SVM, _ = metrics.roc_curve(y_test, predictions_prob[:, 1])
aucPMP_SVM = metrics.roc_auc_score(y_test, predictions_prob[:, 1])

# ROC curve with the AUC value in the legend
plt.plot(fprPMP_SVM, tprPMP_SVM, label="AUC=" + str(aucPMP_SVM))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

All SVM curves

In [None]:
# Overlay the ROC curves of the five SVM models in one figure and save it.
plt.plot(fprPTS_SVM, tprPTS_SVM, label="AUC_PTS_SVM=" + str(aucPTS_SVM))
plt.plot(fprGS_SVM, tprGS_SVM, label="AUC_Games_SVM=" + str(aucGS_SVM))
plt.plot(fprMP_SVM, tprMP_SVM, label="AUC_MP_SVM=" + str(aucMP_SVM))
plt.plot(fprALL_SVM, tprALL_SVM, label="AUC_ALL_SVM=" + str(aucALL_SVM))
# Legend label fixed: a bare "AUC=" did not say which model this curve belongs to;
# it now follows the naming of the other four curves.
plt.plot(fprPMP_SVM, tprPMP_SVM, label="AUC_PMP_SVM=" + str(aucPMP_SVM))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC curves SVM')
plt.legend(loc=4)
# Save to disk before showing the figure.
plt.savefig('ROC_svm.png')
plt.show()

All curves

In [None]:
# One large figure comparing every model: logistic regression first, then SVM.
plt.figure(figsize=(15, 10))

# Logistic-regression ROC curves
plt.plot(fprPMP, tprPMP, label="AUC_PMP=" + str(aucPMP))
plt.plot(fprPTS, tprPTS, label="AUC_PTS=" + str(aucPTS))
plt.plot(fprGS, tprGS, label="AUC_Games=" + str(aucGS))
plt.plot(fprMP, tprMP, label="AUC_MP=" + str(aucMP))
plt.plot(fprALL, tprALL, label="AUC_ALL=" + str(aucALL))

# SVM ROC curves
plt.plot(fprPTS_SVM, tprPTS_SVM, label="AUC_PTS_SVM=" + str(aucPTS_SVM))
plt.plot(fprGS_SVM, tprGS_SVM, label="AUC_Games_SVM=" + str(aucGS_SVM))
plt.plot(fprMP_SVM, tprMP_SVM, label="AUC_MP_SVM=" + str(aucMP_SVM))
plt.plot(fprALL_SVM, tprALL_SVM, label="AUC_ALL_SVM=" + str(aucALL_SVM))
plt.plot(fprPMP_SVM, tprPMP_SVM, label="AUC_PMP_SVM=" + str(aucPMP_SVM))

plt.title('ROC curves ALL')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
# Save to disk before showing the figure.
plt.savefig('ROC_all.png')
plt.show()