In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [20]:
train_df = pd.read_csv('basketball_train.csv')
test_df = pd.read_csv('basketball_test.csv')
train_df

Unnamed: 0,Player,Pos,3P,TRB,BLK
0,Denzel Valentine,SG,1.3,2.6,0.1
1,Kyle Korver,SG,2.4,2.8,0.3
2,Troy Daniels,SG,2.1,1.5,0.1
3,Tim Hardaway,SG,1.9,2.8,0.2
4,Dewayne Dedmon,C,0.0,6.5,0.8
...,...,...,...,...,...
75,Victor Oladipo,SG,1.9,4.3,0.3
76,Willie Cauley-Stein,C,0.0,4.5,0.6
77,Brook Lopez,C,1.8,5.4,1.7
78,Josh Richardson,SG,1.4,3.2,0.7


In [21]:
train_df['target'] = train_df['Pos'].apply(lambda x: '1' if x == 'SG' else '2' )

In [22]:
train_df

Unnamed: 0,Player,Pos,3P,TRB,BLK,target
0,Denzel Valentine,SG,1.3,2.6,0.1,1
1,Kyle Korver,SG,2.4,2.8,0.3,1
2,Troy Daniels,SG,2.1,1.5,0.1,1
3,Tim Hardaway,SG,1.9,2.8,0.2,1
4,Dewayne Dedmon,C,0.0,6.5,0.8,2
...,...,...,...,...,...,...
75,Victor Oladipo,SG,1.9,4.3,0.3,1
76,Willie Cauley-Stein,C,0.0,4.5,0.6,2
77,Brook Lopez,C,1.8,5.4,1.7,2
78,Josh Richardson,SG,1.4,3.2,0.7,1


In [23]:
train_df['target']=train_df['target'].astype('int')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  80 non-null     object 
 1   Pos     80 non-null     object 
 2   3P      80 non-null     float64
 3   TRB     80 non-null     float64
 4   BLK     80 non-null     float64
 5   target  80 non-null     int64  
dtypes: float64(3), int64(1), object(2)
memory usage: 3.9+ KB


In [24]:
test_df['target'] = test_df['Pos'].apply(lambda x: '1' if x == 'SG' else '2' )
test_df

Unnamed: 0,Player,Pos,3P,TRB,BLK,target
0,JaVale McGee,C,0.0,3.2,0.9,2
1,Manu Ginobili,SG,1.3,2.3,0.2,1
2,Nene Hilario,C,0.0,4.2,0.6,2
3,Evan Fournier,SG,1.9,3.1,0.1,1
4,Georgios Papagiannis,C,0.0,3.9,0.8,2
5,Anthony Davis,C,0.5,11.8,2.2,2
6,Tarik Black,C,0.0,5.1,0.7,2
7,Jamal Crawford,SG,1.4,1.6,0.2,1
8,Jordan Clarkson,SG,1.4,3.0,0.1,1
9,Marcin Gortat,C,0.0,10.4,0.7,2


In [25]:
test_df['target']=test_df['target'].astype('int')
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  20 non-null     object 
 1   Pos     20 non-null     object 
 2   3P      20 non-null     float64
 3   TRB     20 non-null     float64
 4   BLK     20 non-null     float64
 5   target  20 non-null     int64  
dtypes: float64(3), int64(1), object(2)
memory usage: 1.1+ KB


In [26]:
from sklearn.model_selection import train_test_split

In [27]:
X_train = train_df.loc[:,'3P':'BLK']
X_test = test_df.loc[:,'3P':'BLK']
y_train = train_df['target']
y_test = test_df['target']

In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [29]:
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression()
knn5 = KNeighborsClassifier(n_neighbors=5)
knn10 = KNeighborsClassifier(n_neighbors=10)
svc_linear= SVC(kernel='linear')
svc_rbf = SVC(kernel='rbf')

In [30]:
clf_list = [dt_clf, rf_clf, lr_clf, knn5, knn10, svc_linear,svc_rbf]

In [31]:
from sklearn.model_selection import cross_val_score

In [32]:
def callKFold_all(clf_list):
    for clf in clf_list:
        scores = cross_val_score(clf, X_train, y_train, cv=5)
        for iter_count, accuracy in enumerate(scores):
            print("교차 검증 {0} 정확도: {1:.4f}".format(iter_count, accuracy))

        print("평균 정확도: {0:.4f}".format(np.mean(scores)))

In [33]:
final_result = callKFold_all(clf_list)
final_result

교차 검증 0 정확도: 0.9375
교차 검증 1 정확도: 0.9375
교차 검증 2 정확도: 1.0000
교차 검증 3 정확도: 0.8750
교차 검증 4 정확도: 0.9375
평균 정확도: 0.9375
교차 검증 0 정확도: 0.9375
교차 검증 1 정확도: 0.9375
교차 검증 2 정확도: 1.0000
교차 검증 3 정확도: 0.8750
교차 검증 4 정확도: 1.0000
평균 정확도: 0.9500
교차 검증 0 정확도: 0.9375
교차 검증 1 정확도: 0.8750
교차 검증 2 정확도: 0.9375
교차 검증 3 정확도: 0.8750
교차 검증 4 정확도: 1.0000
평균 정확도: 0.9250
교차 검증 0 정확도: 0.8750
교차 검증 1 정확도: 0.8750
교차 검증 2 정확도: 0.8125
교차 검증 3 정확도: 0.8750
교차 검증 4 정확도: 0.8750
평균 정확도: 0.8625
교차 검증 0 정확도: 0.9375
교차 검증 1 정확도: 0.8125
교차 검증 2 정확도: 0.8750
교차 검증 3 정확도: 0.8125
교차 검증 4 정확도: 0.9375
평균 정확도: 0.8750
교차 검증 0 정확도: 0.9375
교차 검증 1 정확도: 0.8750
교차 검증 2 정확도: 1.0000
교차 검증 3 정확도: 0.9375
교차 검증 4 정확도: 1.0000
평균 정확도: 0.9500
교차 검증 0 정확도: 0.9375
교차 검증 1 정확도: 0.8750
교차 검증 2 정확도: 0.9375
교차 검증 3 정확도: 0.8750
교차 검증 4 정확도: 0.9375
평균 정확도: 0.9125


In [34]:
from sklearn.model_selection import GridSearchCV

## dt_clf

In [35]:
parameters = {'max_depth':[2,3,5,10], 'min_samples_split':[2,3,5], 'min_samples_leaf':[1,5,8]}

In [36]:
grid_dclf = GridSearchCV(dt_clf, param_grid=parameters, scoring='accuracy', cv=5)
grid_dclf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=11),
             param_grid={'max_depth': [2, 3, 5, 10],
                         'min_samples_leaf': [1, 5, 8],
                         'min_samples_split': [2, 3, 5]},
             scoring='accuracy')

In [37]:
best_dclf = grid_dclf.best_estimator_
best_dclf

DecisionTreeClassifier(max_depth=3, random_state=11)

In [38]:
best_params = grid_dclf.best_params_
best_params

{'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}

In [39]:
best_score = grid_dclf.best_score_
best_score

0.9375

In [40]:
pred = best_dclf.predict(X_test)
pred

array([2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2])

In [41]:
accuracy = accuracy_score(y_test, pred)
accuracy

1.0

## rf_clf

In [42]:
rf_parameters = {'n_estimators':[5, 10, 15] ,'max_depth':[2,3,5,10], 'min_samples_split':[2,3,5], 'min_samples_leaf':[1,5,8]}

In [43]:
grid_rfclf = GridSearchCV(rf_clf, param_grid=rf_parameters, scoring='accuracy', cv=5)
grid_rfclf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=11),
             param_grid={'max_depth': [2, 3, 5, 10],
                         'min_samples_leaf': [1, 5, 8],
                         'min_samples_split': [2, 3, 5],
                         'n_estimators': [5, 10, 15]},
             scoring='accuracy')

In [44]:
best_rfclf = grid_rfclf.best_estimator_
best_rfclf

RandomForestClassifier(max_depth=2, n_estimators=10, random_state=11)

In [45]:
best_params_rfclf = grid_rfclf.best_params_
best_params_rfclf

{'max_depth': 2,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 10}

In [46]:
best_score_rfclf = grid_rfclf.best_score_
best_score_rfclf

0.95

In [47]:
pred_rfclf = best_rfclf.predict(X_test)
pred_rfclf

array([2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2])

In [48]:
accuracy_rfclf = accuracy_score(y_test, pred_rfclf)
accuracy_rfclf

1.0

### 분류의 결과와 정확도를 출력

In [49]:
dpredictions = best_rfclf.predict(X_test[['3P', 'TRB', 'BLK']])
accuracy = accuracy_score(y_test, dpredictions)
print('테스트 세트에서의 RF 정확도 : {0:.4f}'.format(accuracy))

테스트 세트에서의 RF 정확도 : 1.0000


In [50]:
from sklearn.metrics import classification_report

In [51]:
print(classification_report(y_test, dpredictions))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

