# Test of K-Nearest Neighbors classifier

In [10]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

In [11]:
# Load the dataset
dataset = pd.read_csv('datasets/new_train.csv', index_col=0)
X = dataset.drop('label',axis=1).values
y = dataset['label'].values

In [12]:
# Standardize features
sc = StandardScaler()
X = sc.fit_transform(X)

In [13]:
# Split the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [14]:
# Fit Random Forest Classification to the Training set
classifier = knn(n_neighbors=3)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [15]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
accuracy_score(y_test, y_pred)

0.7532907857997607

# Apply Grid Search to K-Nearest Neighbors

In [8]:
# Apply Grid Search to find the best model and the best parameters
parameters = {'n_neighbors': [2,3,4,5],'weights': ['distance'],}

grid_search = GridSearchCV(estimator=classifier,param_grid=parameters,scoring='accuracy',cv=4,n_jobs=-1)

grid_search = grid_search.fit(X_train, y_train)

print(grid_search.best_score_,grid_search.best_params_)

0.835477757829643 {'n_neighbors': 4, 'weights': 'distance'}


0.8815086411588178
{'max_depth': 8, 'max_features': 8, 'min_samples_leaf': 10, 'n_estimators': 10}
0.8880415113760961
{'max_depth': 8, 'max_features': 8, 'min_samples_leaf': 5, 'n_estimators': 50}
0.9199086779561565
{'max_depth': 10, 'max_features': 8, 'min_samples_leaf': 3, 'n_estimators': 30}
0.9405052175145385
{'max_depth': 20, 'max_features': 8, 'min_samples_leaf': 1, 'n_estimators': 40}
0.940654818586634
{'max_depth': 20, 'max_features': 8, 'min_samples_leaf': 1, 'n_estimators': 50}
0.9410038960425456
{'max_depth': 15, 'max_features': 8, 'min_samples_leaf': 1, 'n_estimators': 70}
0.9410038960425456
{'max_depth': 15, 'max_features': 8, 'min_samples_leaf': 1, 'n_estimators': 70}

We will save the trained model in a .txt file so that we don't have to rerun it each time we want to use its results.

In [9]:
import pickle

with open('D:/Utilisateurs/Bastien/Documents/Cours/CentraleSupelec/Electifs/Machine Learning/Evaluations/Assignment 2/mail-classification/knn_gridsearch.txt','wb') as fichier:
    pickler=pickle.Pickler(fichier)
    pickler.dump(grid_search)

# Prediction on Test set

In [10]:
# # Load the dataset
# dataset_train = pd.read_csv('datasets/new_train.csv', index_col=0)
# dataset_test = pd.read_csv('datasets/new_test.csv', index_col=0)
# X_train = dataset_train.drop('label',axis=1).values
# y_train = dataset_train['label'].values
# X_test = dataset_test.values

In [11]:
# # Standardize features
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

In [12]:
# # Fit Random Forest Classification to the Training set
# classifier = RandomForestClassifier(max_depth=15, max_features=8, min_samples_leaf=1, n_estimators=70, criterion='entropy', random_state=0)
# classifier.fit(X_train, y_train)

In [13]:
# # Predict the Test set results
# y_pred = classifier.predict(X_test)

In [14]:
# # Save results to submission file
# y_pred = pd.DataFrame(y_pred, columns=['label'])
# y_pred.to_csv("datasets/random_forest_submission.csv", index=True, index_label='Id')