# Loading The Dataset

In [74]:
import numpy as np
import pandas as pd

df = pd.read_csv('/content/sample_data/kr-vs-kp.data')

In [75]:
df['won'].value_counts()

won      1668
nowin    1527
Name: won, dtype: int64

In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3195 entries, 0 to 3194
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   f       3195 non-null   object
 1   f.1     3195 non-null   object
 2   f.2     3195 non-null   object
 3   f.3     3195 non-null   object
 4   f.4     3195 non-null   object
 5   f.5     3195 non-null   object
 6   f.6     3195 non-null   object
 7   f.7     3195 non-null   object
 8   f.8     3195 non-null   object
 9   f.9     3195 non-null   object
 10  f.10    3195 non-null   object
 11  f.11    3195 non-null   object
 12  l       3195 non-null   object
 13  f.12    3195 non-null   object
 14  n       3195 non-null   object
 15  f.13    3195 non-null   object
 16  f.14    3195 non-null   object
 17  t       3195 non-null   object
 18  f.15    3195 non-null   object
 19  f.16    3195 non-null   object
 20  f.17    3195 non-null   object
 21  f.18    3195 non-null   object
 22  f.19    3195 non-null   

# Dataset Pre-processing

In [77]:
for column in df:
  pd.get_dummies(df[column], drop_first=True)
  df = pd.concat([pd.get_dummies(df[column], prefix=column, drop_first=True), df], axis=1)
  df.drop([column], axis=1, inplace=True)

In [78]:
X = df.iloc[:, 1:38]
y = df.iloc[:, 0]

# Training Classifiers

In [79]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV 

param_grid = {'n_neighbors': [1, 2, 3, 4, 5]} 

knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv=5) 
knn_cv.fit(X, y)

print(knn_cv.best_params_)
print(knn_cv.best_score_)

{'n_neighbors': 5}
0.7805946791862285


In [80]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV 

param_grid = {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']} 

lr = LogisticRegression()
lr_cv = GridSearchCV(lr, param_grid, cv=5) 
lr_cv.fit(X, y)

print(lr_cv.best_params_)
print(lr_cv.best_score_)

{'solver': 'liblinear'}
0.9327073552425664


In [81]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV 

param_grid = {'n_estimators': [1, 2, 3, 4, 5], 
              'max_depth': [2, 4, 6, 8, 10]}

rfc = RandomForestClassifier()
rfc_cv = GridSearchCV(rfc, param_grid, cv=5) 
rfc_cv.fit(X, y)

print(rfc_cv.best_params_)
print(rfc_cv.best_score_)

{'max_depth': 10, 'n_estimators': 4}
0.9051643192488262


In [82]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV 

param_grid = {
    'C': [1, 10, 100, 1000],
    'gamma': [0.0001, 0.001, 0.01, 0.1],
    'kernel': ['rbf', 'linear', 'sigmoid']} 

svc = svm.SVC()
svc_cv = GridSearchCV(svc, param_grid, cv=5) 
svc_cv.fit(X, y)

print(svc_cv.best_params_)
print(svc_cv.best_score_)

{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}
0.9762128325508608


# Running Best Classifier

In [83]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [84]:
from sklearn import svm

svc = svm.SVC(C=128, gamma=0.03125, kernel='rbf')
svc.fit(X_train, y_train)
y_hat = svc.predict(X_test)

In [85]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_hat)

0.9947862356621481