In [87]:
import pandas as pd
import numpy as np 

from sklearn import preprocessing

### train, test data 

In [88]:
# Preprocess the training data: load the training CSV into a DataFrame
# and display it for a first look.
df = pd.read_csv('basket_train.csv')
df

Unnamed: 0,Player,Pos,3P,TRB,BLK
0,Denzel Valentine,SG,1.3,2.6,0.1
1,Kyle Korver,SG,2.4,2.8,0.3
2,Troy Daniels,SG,2.1,1.5,0.1
3,Tim Hardaway,SG,1.9,2.8,0.2
4,Dewayne Dedmon,C,0.0,6.5,0.8
...,...,...,...,...,...
75,Victor Oladipo,SG,1.9,4.3,0.3
76,Willie Cauley-Stein,C,0.0,4.5,0.6
77,Brook Lopez,C,1.8,5.4,1.7
78,Josh Richardson,SG,1.4,3.2,0.7


In [89]:
# Summarise column dtypes and non-null counts (checks for missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  80 non-null     object 
 1   Pos     80 non-null     object 
 2   3P      80 non-null     float64
 3   TRB     80 non-null     float64
 4   BLK     80 non-null     float64
dtypes: float64(3), object(2)
memory usage: 3.2+ KB


In [90]:
# Player holds the player's name, which is not a model feature, so remove it.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run after the column has already been dropped;
# the in-place version raised KeyError on a second run.
df = df.drop(columns='Player', errors='ignore')

In [91]:
# Display the frame to confirm the Player column is gone.
df

Unnamed: 0,Pos,3P,TRB,BLK
0,SG,1.3,2.6,0.1
1,SG,2.4,2.8,0.3
2,SG,2.1,1.5,0.1
3,SG,1.9,2.8,0.2
4,C,0.0,6.5,0.8
...,...,...,...,...
75,SG,1.9,4.3,0.3
76,C,0.0,4.5,0.6
77,C,1.8,5.4,1.7
78,SG,1.4,3.2,0.7


In [92]:
from sklearn.preprocessing import LabelEncoder

In [93]:
# Fit the label encoder on the two position labels used in this dataset.
# The encoded frame displayed further down shows the mapping C -> 0, SG -> 1.
data = pd.Series(['SG', 'C'])
encoder = LabelEncoder().fit(data)
encoder

LabelEncoder()

In [94]:
# Replace the position strings in Pos with the encoder's integer codes.
# NOTE(review): this overwrites Pos in place, so re-running the cell feeds
# the integer codes back into the encoder — presumably that fails on unseen
# labels; confirm, and re-run from the CSV load cell if it does.
df['Pos'] = encoder.transform(df['Pos'])

In [95]:
# Display the frame to inspect the encoded Pos column.
df

Unnamed: 0,Pos,3P,TRB,BLK
0,1,1.3,2.6,0.1
1,1,2.4,2.8,0.3
2,1,2.1,1.5,0.1
3,1,1.9,2.8,0.2
4,0,0.0,6.5,0.8
...,...,...,...,...
75,1,1.9,4.3,0.3
76,0,0.0,4.5,0.6
77,0,1.8,5.4,1.7
78,1,1.4,3.2,0.7


In [96]:
# Split the training frame into target and features by column NAME.
# Positional slicing (iloc[:, 0] / iloc[:, 1:]) silently used whatever column
# happened to be first, e.g. Player if the drop cell had not been run. With
# name-based selection a leftover non-numeric column makes model fitting fail
# loudly instead of quietly training on the wrong target.
y_df = df['Pos']
X_df = df.drop(columns='Pos')

In [97]:
# Preprocess the test data: load the test CSV.
df_test = pd.read_csv('basketball_test.csv')

# Encode Pos with the encoder that was already fitted for the training data.
# Fitting a fresh LabelEncoder on the test labels can assign different integer
# codes (for example when a class is missing from the test file), which would
# make y_test incomparable with the model's predictions. transform() on the
# shared encoder keeps one mapping for both splits.
df_test['Pos'] = encoder.transform(df_test['Pos'])

In [98]:
# Remove the player-name column from the test frame as well.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable; the in-place version raised KeyError on a second run.
df_test = df_test.drop(columns='Player', errors='ignore')

In [99]:
# Split the test frame into target and features by column NAME, so the split
# does not depend on Pos happening to be the first column at this point.
y_test = df_test['Pos']
X_test = df_test.drop(columns='Pos')

### 머신러닝 

In [100]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier


# Seven candidate classifiers to compare:
# a random forest (seeded for repeatable results), logistic regression,
# two SVMs (RBF and linear kernels) and three k-NN models (k = 5, 10, 30).
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression()
svm_svc = SVC(kernel='rbf')
svm_svc2 = SVC(kernel = 'linear') 
knn1 = KNeighborsClassifier(n_neighbors = 5)
knn2 = KNeighborsClassifier(n_neighbors = 10)
knn3 = KNeighborsClassifier(n_neighbors = 30)
# Order matters: later cells refer to the models by their index in this list.
model = [rf_clf, lr_clf, svm_svc,svm_svc2, knn1, knn2, knn3]

In [101]:
# Mean 5-fold cross-validation score for each candidate model

from sklearn.model_selection import cross_val_score

# One entry per estimator, in the same order as the `model` list.
score_list = [
    np.mean(cross_val_score(estimator, X_df, y_df, cv=5))
    for estimator in model
]
score_list

[0.95, 0.925, 0.9125, 0.95, 0.8625, 0.875, 0.8375]

In [102]:
# Index of the highest mean CV score. In the scores shown above, index 0
# (rf_clf) and index 3 (svm_svc2, the linear SVM) tie at 0.95; np.argmax
# reports the first maximum, so the displayed result is 0.
np.argmax(score_list) #-> rf_clf / svm(linear)

0

### GridSearchCV

In [103]:
from sklearn.model_selection import GridSearchCV

In [104]:
# Tune the RandomForestClassifier with GridSearchCV
# (4 x 3 x 3 = 36 parameter combinations, 5-fold CV, scored by accuracy).

parameters = {
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8],
}

grid_rf = GridSearchCV(
    estimator=rf_clf,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
grid_rf.fit(X_df, y_df)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=11),
             param_grid={'max_depth': [2, 3, 5, 10],
                         'min_samples_leaf': [1, 5, 8],
                         'min_samples_split': [2, 3, 5]},
             scoring='accuracy')

In [105]:
# Best hyperparameter combination found by the random-forest grid search
grid_rf.best_params_

{'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}

In [106]:
# Keep the best estimator found by the search; it is used below to predict
# on the test set.
best_rf = grid_rf.best_estimator_
best_rf

RandomForestClassifier(max_depth=2, random_state=11)

In [107]:
# Best cross-validated score reported by the search (scoring='accuracy').
best_score = grid_rf.best_score_
best_score

0.95

In [108]:
# Predict the encoded positions of the test set with the tuned random forest.
pred = best_rf.predict(X_test)
pred

array([0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])

In [109]:
# Test-set accuracy of the tuned random forest.
accuracy = accuracy_score(y_test, pred)
accuracy

1.0

In [110]:
# Tune an SVM with GridSearchCV

def svc_param_selection(X, y, nfolds):
    """Grid-search SVC hyperparameters and return the fitted search object.

    Parameters
    ----------
    X : array-like
        Training features passed to ``GridSearchCV.fit``.
    y : array-like
        Training labels; flattened to 1-D before fitting, so a Series,
        a single-column frame, an ndarray or a list are all accepted.
    nfolds : int
        Number of cross-validation folds.

    Returns
    -------
    GridSearchCV
        The fitted search object. The best parameter combination is also
        printed as a side effect.
    """
    # gamma only matters for the RBF kernel, so the linear rows are evaluated
    # redundantly; the single combined grid is kept so that best_params_ keeps
    # exposing the same keys ('C', 'gamma', 'kernel') as before.
    svm_parameters = [
        {
            'kernel': ['rbf', 'linear'],
            'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
            'C': [0.01, 0.1, 1, 10, 100, 1000],
        }
    ]

    # Use the function's own arguments. The previous version fitted on the
    # notebook globals X_train / y_train (not defined anywhere in this
    # notebook) and hard-coded cv=5, silently ignoring X, y and nfolds.
    clf = GridSearchCV(SVC(), svm_parameters, cv=nfolds)
    clf.fit(X, np.ravel(y))
    print(clf.best_params_)

    return clf

In [111]:
# Run the SVM grid search on the training data with 5 folds; the function
# prints the best parameters and returns the fitted search object.
clf = svc_param_selection(X_df, y_df, 5)

{'C': 1, 'gamma': 1e-05, 'kernel': 'linear'}


In [112]:
# Predict the encoded positions of the test set with the tuned SVM search.
pred2=clf.predict(X_test)

In [113]:
# Test-set accuracy of the tuned SVM.
accuracy2 = accuracy_score(y_test, pred2)
accuracy2

1.0

### Classification metrics derived from the confusion matrix (precision, recall, f1-score)

In [115]:
from sklearn.metrics import classification_report

In [116]:
# Per-class precision / recall / F1 for the tuned random forest predictions.
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      1.00      1.00         9

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20



In [117]:
# Per-class precision / recall / F1 for the tuned SVM predictions
# (pred2 was produced by clf.predict(X_test)).
print(classification_report(y_test, pred2))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      1.00      1.00         9

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20



In [118]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [119]:
from sklearn.metrics import f1_score, roc_auc_score

In [121]:
# Accuracy of the random forest predictions on the test set.
accuracy_score(y_test,pred)

1.0

In [122]:
# Precision of the random forest predictions.
# NOTE(review): assumes sklearn's default positive label 1, i.e. SG under the
# encoding shown earlier — confirm that SG is the class of interest.
precision_score(y_test,pred)

1.0

In [123]:
# Recall of the random forest predictions (same positive-label assumption as
# for precision: label 1 — TODO confirm).
recall_score(y_test,pred)

1.0

In [124]:
# F1 score (harmonic mean of precision and recall) of the random forest
# predictions.
f1_score(y_test,pred)

1.0

In [125]:
# ROC AUC is a ranking metric, so score it with the predicted probability of
# the positive class (label 1, column 1 of predict_proba) rather than with the
# thresholded 0/1 predictions; hard labels collapse the ROC curve to a single
# operating point and understate or overstate the model's ranking quality.
roc_auc_score(y_test, best_rf.predict_proba(X_test)[:, 1])

1.0