In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

In [2]:
X_test = np.loadtxt("data/data01/X_test.csv", delimiter=",", dtype=float)
X_train = np.loadtxt("data/data01/X_train.csv", delimiter=",", dtype=float)
y_test = np.loadtxt("data/data01/y_test.csv", delimiter=",", dtype=float)
Y_train = np.loadtxt("data/data01/y_train.csv", delimiter=",", dtype=float)

In [3]:
x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size = 0.2, random_state = 42)
print(np.shape(x_train), np.shape(x_val), np.shape(y_train), np.shape(y_val))

(26979, 16) (6745, 16) (26979,) (6745,)


In [4]:
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
scaler.fit(x_val)
x_val = scaler.transform(x_val)
scaler.fit(X_test)
X_test = scaler.transform(X_test)

# KNN

In [5]:
warnings.filterwarnings('ignore') 
k_range = range(1, 100, 10)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, x_val, y_val, cv=5, scoring='accuracy')
    k_scores.append(scores.mean())
print('Score by k :', k_scores[:5])

for i in range(len(k_scores)):
    if k_scores[i] == max(k_scores):
        best_k = i
        print('Index :', i,'\nScore :', max(k_scores), '\nBest k: ', best_k)
warnings.filterwarnings('default')

Score by k : [0.8029651593773165, 0.8345441067457375, 0.8367679762787249, 0.8391401037805781, 0.8391401037805781]
Index : 3 
Score : 0.8391401037805781 
Best k:  3
Index : 4 
Score : 0.8391401037805781 
Best k:  4


In [6]:
warnings.filterwarnings('ignore') 
knn_model_train = KNeighborsClassifier(n_neighbors=best_k).fit(x_train, y_train) 
y_pred_val = knn_model_train.predict(x_val)
y_pred_test = knn_model_train.predict(X_test)
warnings.filterwarnings('default')

In [7]:
print("KNN 0-1 loss on validation:",metrics.zero_one_loss(y_val, y_pred_val))
print("KNN 0-1 loss on validation:",metrics.zero_one_loss(y_test, y_pred_test))

KNN 0-1 loss on validation: 0.17361008154188284
KNN 0-1 loss on validation: 0.12790446730451044


# Random Forest

In [8]:
forest_model_train_50 = RandomForestClassifier(max_depth = 50, random_state=42)
forest_model_train_100 = RandomForestClassifier(max_depth = 100, random_state=42)
forest_model_train_200 = RandomForestClassifier(max_depth = 200, random_state=42)
forest_model_train_50.fit(x_train, y_train) 
forest_model_train_100.fit(x_train, y_train) 
forest_model_train_200.fit(x_train, y_train) 
y_pred_val_50 = forest_model_train_50.predict(x_val) 
y_pred_val_100 = forest_model_train_100.predict(x_val) 
y_pred_val_200 = forest_model_train_200.predict(x_val) 
y_pred_test_50 = forest_model_train_50.predict(X_test) 
y_pred_test_100 = forest_model_train_100.predict(X_test) 
y_pred_test_200 = forest_model_train_200.predict(X_test) 

In [9]:
print(forest_model_train_50.get_params())
print(forest_model_train_100.get_params())
print(forest_model_train_200.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 50, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 100, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 200, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 

In [10]:
print("Forest: 50 trees on val   :",metrics.zero_one_loss(y_val, y_pred_val_50))
print("Forest: 100 trees on val  :",metrics.zero_one_loss(y_val, y_pred_val_100))
print("Forest: 200 trees on val  :",metrics.zero_one_loss(y_val, y_pred_val_200))
print("Forest: 50 trees on train :",metrics.zero_one_loss(y_test, y_pred_test_50))
print("Forest: 100 trees on train:",metrics.zero_one_loss(y_test, y_pred_test_100))
print("Forest: 200 trees on train:",metrics.zero_one_loss(y_test, y_pred_test_200))

Forest: 50 trees on val   : 0.15374351371386208
Forest: 100 trees on val  : 0.15374351371386208
Forest: 200 trees on val  : 0.15374351371386208
Forest: 50 trees on train : 0.11056758506582265
Forest: 100 trees on train: 0.11056758506582265
Forest: 200 trees on train: 0.11056758506582265
