In [38]:
import pandas as pd
import numpy as np
import sklearn
import math
import matplotlib.pyplot as plt

In [39]:
df = pd.read_csv('model_data.csv')

## Pre-Processing

In [40]:
df.columns

Index(['Unnamed: 0', 'Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER',
       'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', '3P%', '2P%',
       'FT%', 'TmNetRtg', 'Next Rtg', 'Next WS', 'Veteran Value', 'VV Class',
       'Starters'],
      dtype='object')

In [41]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [42]:
target = df['Veteran Value']
target_class = df['VV Class']

In [43]:
df.drop(['Unnamed: 0','Age','Year','Player','Pos','Tm','Next WS','WS','Next Rtg','TmNetRtg','Veteran Value', 'VV Class'], axis=1, inplace=True)

In [44]:
df.columns

Index(['G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'AST%', 'STL%',
       'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS/48', 'OBPM', 'DBPM', '3P%',
       '2P%', 'FT%', 'Starters'],
      dtype='object')

In [45]:
#try scaling values
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df)

In [46]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(scaled_data, target.values, random_state=42)
xctrain, xctest, yctrain, yctest = train_test_split(scaled_data, target_class.values, random_state=42)

### Linear Regression

In [76]:
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lm = LinearRegression()
lm.fit(xtrain,ytrain)
ypred = lm.predict(xtest)
print(np.sqrt(mean_squared_error(ytest, ypred)))
print(r2_score(ytest,ypred))

32181.12418068284
0.02987714878159875


In [48]:
print(pd.Series(lm.coef_, index=df.columns))

G           -4083.951267
MP         -28598.442588
PER        -16335.335248
TS%          3434.489327
3PAr         3561.355608
FTr          1596.611683
ORB%        -4741.857690
DRB%         3884.664849
AST%         6155.749169
STL%        -1836.319296
BLK%         3653.151486
TOV%        -1185.857592
USG%         5883.394587
OWS         58179.220333
DWS         20492.684247
WS/48      -53402.003357
OBPM         3912.053469
DBPM         8941.789961
3P%        -11173.839339
2P%         -7132.271970
FT%        -16565.581224
Starters     3916.600399
dtype: float64


### Logistic Regression Classifier

In [70]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(tol=1e-4, max_iter=1000, random_state=0).fit(xctrain, yctrain)
clf.predict(xctest)
clf.predict_proba(xctest)
clf.score(xctest,yctest)

0.7542857142857143

### KNN Classifier

In [60]:
print(xctrain.shape, xctest.shape, yctrain.shape, yctest.shape)

(522, 22) (175, 22) (522,) (175,)


In [63]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

n_neighbors=10
neigh = KNeighborsClassifier(n_neighbors)
neigh.fit(xctrain, yctrain)

KNeighborsClassifier(n_neighbors=10)

In [66]:
print('Training AUC', metrics.roc_auc_score(yctrain,neigh.predict_proba(xctrain)[:,1]))
print('Validation AUC', metrics.roc_auc_score(yctest,neigh.predict_proba(xctest)[:,1]))

Training AUC 0.7432685115931189
Validation AUC 0.494957264957265


In [72]:
neigh.predict_proba(xctest)
neigh.score(xctest,yctest)

0.7428571428571429

### Ridge Regression

In [78]:
from sklearn.linear_model import Ridge

lm = Ridge(alpha=0.1, tol=1e-4, max_iter=1000, random_state=42)
lm.fit(xtrain, ytrain)
ypred = lm.predict(xtest)
print(np.sqrt(mean_squared_error(ytest, ypred)))
print(r2_score(ytest,ypred))

32203.47579928909
0.02852906994067561
