In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline

In [2]:
nba = pd.read_csv('all_seasons.csv')
nba = nba[['player_name', 'age', 'player_height', 'player_weight', 'pts', 'reb']]
nba = nba.rename(columns={'player_name':'Name', 'age':'Age', 'player_height' :'Height', 'player_weight':'Weight', 'pts': 'Avg Points', 'reb':'Avg Rebounds'})

In [3]:
nba['target'] = 0
nba['target'][(nba['Age'] <= 25) & (nba['Height'] >= 180) & (nba['Weight'] <= 90) & (nba['Avg Points'] >= 6) & (nba['Avg Rebounds'] >= 3)] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba['target'][(nba['Age'] <= 25) & (nba['Height'] >= 180) & (nba['Weight'] <= 90) & (nba['Avg Points'] >= 6) & (nba['Avg Rebounds'] >= 3)] = 1


In [4]:
nba[nba['target'] == 1]

Unnamed: 0,Name,Age,Height,Weight,Avg Points,Avg Rebounds,target
6,Eddie Jones,25.0,198.12,86.182480,17.2,4.1,1
147,Allen Iverson,22.0,182.88,74.842680,23.5,4.1,1
357,Kerry Kittles,23.0,195.58,81.192968,16.4,3.9,1
545,Kerry Kittles,24.0,195.58,81.192968,17.2,4.7,1
754,Allen Iverson,23.0,182.88,74.842680,22.0,3.7,1
...,...,...,...,...,...,...,...
10991,Terry Rozier,25.0,185.42,86.182480,17.6,4.4,1
11006,Trae Young,21.0,185.42,81.646560,29.4,4.3,1
11018,Kevin Huerter,21.0,200.66,86.182480,12.3,4.1,1
11044,Lonzo Ball,22.0,198.12,86.182480,12.3,6.2,1


In [46]:
scale = StandardScaler()
# scale.fit_transform(nba[['Height', 'Weight', 'Avg Points', 'Avg Rebounds']])

In [47]:
nba_scale = pd.DataFrame(scale.fit_transform(nba[['Age', 'Height', 'Weight', 'Avg Points', 'Avg Rebounds']]),
                     columns=['Age', 'Height', 'Weight', 'Avg Points', 'Avg Rebounds'])
nba_scale

Unnamed: 0,Age,Height,Weight,Avg Points,Avg Rebounds
0,2.033006,-0.292998,-0.067402,-0.408829,5.025470
1,0.191372,1.641595,1.375352,-0.981681,-0.825572
2,2.723619,0.536113,-0.428090,-1.234410,-1.025951
3,-0.729445,0.259743,0.004736,-0.745801,-0.504967
4,1.572598,0.536113,0.653975,-0.964832,-0.464892
...,...,...,...,...,...
11140,0.191372,0.812484,0.653975,0.164023,0.737377
11141,-0.959649,-0.569369,-0.247746,-1.167015,-1.306480
11142,0.191372,1.365225,1.375352,-0.341434,0.617150
11143,-0.038832,0.812484,0.329355,-1.015378,-0.224438


In [48]:
xtr, xts, ytr, yts = train_test_split(nba_scale, nba['target'], train_size=0.82)
print(len(xtr), len(ytr))
print(len(xts), len(yts))

9138 9138
2007 2007


## Hyper Parameter Tuning

In [49]:
# paramater for LogisticRegression

penalty = ['l1', 'l2', 'elasticnet', 'none']
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
max_iter = [1, 10, 100, 1000, 10000]

param1 = {
    'penalty': penalty, 'solver': solver, 'max_iter': max_iter
}
param1

{'penalty': ['l1', 'l2', 'elasticnet', 'none'],
 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
 'max_iter': [1, 10, 100, 1000, 10000]}

In [50]:
# paramater for Decision Tree

criterion = ['gini', 'entropy']
splitter = ['best', 'random']

param2 = {
    'criterion': criterion, 'splitter': splitter
}
param2

{'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}

In [51]:
# parameter for KNN

n_neighbors = [1,2,3,4,5,6,7,8,9,10]
weights = ['uniform', 'distance']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']

param3 = {
    'n_neighbors': n_neighbors, 'weights': weights, 'algorithm': algorithm
}

param3

{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 'weights': ['uniform', 'distance'],
 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

## Modelling

In [52]:
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = KNeighborsClassifier()

In [53]:
modelgs1 = GridSearchCV(
    estimator = model1, 
    param_grid = param1,
    cv = 5)

modelgs2 = GridSearchCV(
    estimator = model2, 
    param_grid = param2,
    cv = 5)

modelgs3 = GridSearchCV(
    estimator = model3, 
    param_grid = param3,
    cv = 5)

In [54]:
modelgs1.fit(xtr, ytr)
modelgs2.fit(xtr, ytr)
modelgs3.fit(xtr, ytr)

ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
ST

GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [55]:
print(modelgs1.best_params_)
print(modelgs2.best_params_)
print(modelgs3.best_params_)

{'max_iter': 10, 'penalty': 'none', 'solver': 'lbfgs'}
{'criterion': 'gini', 'splitter': 'best'}
{'algorithm': 'auto', 'n_neighbors': 7, 'weights': 'distance'}


In [56]:
# modelfix1 = LogisticRegression(max_iter= 10, penalty= 'l2', solver= 'liblinear')
# modelfix2 = DecisionTreeClassifier(criterion= 'gini', splitter= 'best')
# modelfix3 = KNeighborsClassifier(algorithm= 'auto', n_neighbors= 9, weights= 'distance')

model_1_bestparam = modelgs1.best_params_
model_2_bestparam = modelgs2.best_params_
model_3_bestparam = modelgs3.best_params_
modelfix1 = LogisticRegression(**model_1_bestparam)
modelfix2 = DecisionTreeClassifier(**model_2_bestparam)
modelfix3 = KNeighborsClassifier(**model_3_bestparam)

In [57]:
modelfix1.fit(xtr, ytr)
modelfix2.fit(xtr, ytr)
modelfix3.fit(xtr, ytr)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='distance')

## Evaluatian Metrics

#### Balanced Accuracy

In [58]:
print('Balanced Accuracy Logistic Regression: ' ,balanced_accuracy_score(yts, modelfix1.predict(xts)))
print('Balanced Accuracy Decision Tree: ',balanced_accuracy_score(yts, modelfix2.predict(xts)))
print('Balanced Accuracy KNN: ', balanced_accuracy_score(yts, modelfix3.predict(xts)))

Balanced Accuracy Logistic Regression:  0.6270501303969864
Balanced Accuracy Decision Tree:  1.0
Balanced Accuracy KNN:  0.8135250651984931


#### Precision

In [59]:
print('Precision Logistic Regression: ' ,precision_score(yts, modelfix1.predict(xts)))
print('Precision Accuracy Decision Tree: ',precision_score(yts, modelfix2.predict(xts)))
print('Precision Accuracy KNN: ', precision_score(yts, modelfix3.predict(xts)))

Precision Logistic Regression:  0.6
Precision Accuracy Decision Tree:  1.0
Precision Accuracy KNN:  0.88


#### Recall

In [60]:
print('Recall Logistic Regression: ' ,recall_score(yts, modelfix1.predict(xts)))
print('Recall Accuracy Decision Tree: ',recall_score(yts, modelfix2.predict(xts)))
print('Recall Accuracy KNN: ', recall_score(yts, modelfix3.predict(xts)))

Recall Logistic Regression:  0.2571428571428571
Recall Accuracy Decision Tree:  1.0
Recall Accuracy KNN:  0.6285714285714286


#### F1 Score

In [61]:
print('F1 Logistic Regression: ' ,f1_score(yts, modelfix1.predict(xts)))
print('F1 Accuracy Decision Tree: ',f1_score(yts, modelfix2.predict(xts)))
print('F1 Accuracy KNN: ', f1_score(yts, modelfix3.predict(xts)))

F1 Logistic Regression:  0.36
F1 Accuracy Decision Tree:  1.0
F1 Accuracy KNN:  0.7333333333333334


#### ROCAUC

In [62]:
print('ROCAUC Logistic Regression: ' ,roc_auc_score(yts, modelfix1.predict(xts)))
print('ROCAUC Accuracy Decision Tree: ',roc_auc_score(yts, modelfix2.predict(xts)))
print('ROCAUC Accuracy KNN: ', roc_auc_score(yts, modelfix3.predict(xts)))

ROCAUC Logistic Regression:  0.6270501303969864
ROCAUC Accuracy Decision Tree:  1.0
ROCAUC Accuracy KNN:  0.8135250651984932


## Model terbaik adalah Decision Tree

In [63]:
indo = pd.read_excel('basket_ina.xlsx')
indo

Unnamed: 0,Name,Club,Country,Age,Height,Weight,Avg Points,Avg Rebounds
0,Andakara Prastawa Dyaksa,Pelita Jaya Bakrie,Indonesia,24,190,90,7,6
1,Reggie Mononimbar,Pelita Jaya Bakrie,Indonesia,21,185,86,6,3
2,Hardianus Lakudu,Satria Muda Pertamina Jakarta,Indonesia,23,178,83,10,3
3,Kevin Yonas Sitorus,Satria Muda Pertamina Jakarta,Indonesia,26,185,75,11,4
4,Arki Dikania Wisnu,Satria Muda Pertamina Jakarta,Indonesia,20,183,80,5,2
5,Laurentius Steven Oei,Satria Muda Pertamina Jakarta,Indonesia,21,191,85,4,10
6,Mei Joni,Stapac,Indonesia,25,188,90,7,5
7,Vincent Rivaldi Kosasih,Stapac,Indonesia,23,179,87,1,2
8,Hardian Wicaksono,Pacific Caesar Surabaya,Indonesia,21,177,80,9,8
9,Brandon Jawato,Louvre Surabaya,Indonesia,24,182,85,6,5


In [64]:
# model_indo = make_pipeline(
#             StandardScaler(),
#             DecisionTreeClassifier(**model_2_bestparam))
# model_indo.fit(xtr, ytr)

In [69]:
indo_scale = pd.DataFrame(scale.fit_transform(indo[['Age', 'Height', 'Weight', 'Avg Points', 'Avg Rebounds']]),
                                             columns=['Age', 'Height', 'Weight', 'Avg Points', 'Avg Rebounds'])
indo_scale

Unnamed: 0,Age,Height,Weight,Avg Points,Avg Rebounds
0,0.635999,1.329114,1.316322,0.142857,0.483494
1,-0.953998,0.257248,0.4239,-0.214286,-0.725241
2,0.106,-1.243365,-0.245416,1.214286,-0.725241
3,1.695997,0.257248,-2.030259,1.571429,-0.322329
4,-1.483997,-0.171499,-0.914732,-0.571429,-1.128152
5,-0.953998,1.543487,0.200795,-0.928571,2.09514
6,1.165998,0.900368,1.316322,0.142857,0.080582
7,0.106,-1.028992,0.647006,-2.0,-1.128152
8,-0.953998,-1.457738,-0.914732,0.857143,1.289317
9,0.635999,-0.385872,0.200795,-0.214286,0.080582


In [70]:
indo['target'] = modelfix2.predict(indo_scale[['Age', 'Height', 'Weight', 'Avg Points', 'Avg Rebounds']])
indo['target'] = indo['target'].apply(lambda x: 'Diterima' if x == 1 else 'Tidak Diterima')
indo

Unnamed: 0,Name,Club,Country,Age,Height,Weight,Avg Points,Avg Rebounds,target
0,Andakara Prastawa Dyaksa,Pelita Jaya Bakrie,Indonesia,24,190,90,7,6,Tidak Diterima
1,Reggie Mononimbar,Pelita Jaya Bakrie,Indonesia,21,185,86,6,3,Tidak Diterima
2,Hardianus Lakudu,Satria Muda Pertamina Jakarta,Indonesia,23,178,83,10,3,Tidak Diterima
3,Kevin Yonas Sitorus,Satria Muda Pertamina Jakarta,Indonesia,26,185,75,11,4,Tidak Diterima
4,Arki Dikania Wisnu,Satria Muda Pertamina Jakarta,Indonesia,20,183,80,5,2,Tidak Diterima
5,Laurentius Steven Oei,Satria Muda Pertamina Jakarta,Indonesia,21,191,85,4,10,Tidak Diterima
6,Mei Joni,Stapac,Indonesia,25,188,90,7,5,Tidak Diterima
7,Vincent Rivaldi Kosasih,Stapac,Indonesia,23,179,87,1,2,Tidak Diterima
8,Hardian Wicaksono,Pacific Caesar Surabaya,Indonesia,21,177,80,9,8,Diterima
9,Brandon Jawato,Louvre Surabaya,Indonesia,24,182,85,6,5,Tidak Diterima
