## Import package

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

## Load dataset: Train

In [2]:
train = pd.read_csv("../../../../data/train/train.csv")
test = pd.read_csv("../../../../data/test/test.csv")

In [3]:
print(train.shape)
print(test.shape)

(9557, 143)
(23856, 142)


In [4]:
train.head(5)

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
0,ID_279628684,190000.0,0,3,0,1,1,0,,0,...,100,1849,1,100,0,1.0,0.0,100.0,1849,4
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,144,4489,1,144,0,1.0,64.0,144.0,4489,4
2,ID_68de51c94,,0,8,0,1,1,0,,0,...,121,8464,1,0,0,0.25,64.0,121.0,8464,4
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,81,289,16,121,4,1.777778,1.0,121.0,289,4
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,121,1369,16,121,4,1.777778,1.0,121.0,1369,4


In [5]:
test.head(5)

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,age,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq
0,ID_2f6873615,,0,5,0,1,1,0,,1,...,4,0,16,9,0,1,2.25,0.25,272.25,16
1,ID_1c78846d2,,0,5,0,1,1,0,,1,...,41,256,1681,9,0,1,2.25,0.25,272.25,1681
2,ID_e5442cf6a,,0,5,0,1,1,0,,1,...,41,289,1681,9,0,1,2.25,0.25,272.25,1681
3,ID_a8db26a79,,0,14,0,1,1,1,1.0,0,...,59,256,3481,1,256,0,1.0,0.0,256.0,3481
4,ID_a62966799,175000.0,0,4,0,1,1,1,1.0,0,...,18,121,324,1,0,1,0.25,64.0,,324


In [6]:
f = open("../../resources/continuousFeatures", "r")
continuous_columns = f.read().split(",")
f.close()

f = open("../../resources/categoricalFeatures", "r")
categorical_columns = f.read().split(",")
f.close()

print("Number of numerical columns: {0}".format(len(continuous_columns)))
print("Number of categorical columns: {0}".format(len(categorical_columns)))

Number of numerical columns: 38
Number of categorical columns: 102


In [7]:
y_train = train["Target"]
print(y_train.shape)

(9557,)


In [8]:
X_train = train[categorical_columns]
X_test = test[categorical_columns]
print(X_train.shape)
print(X_test.shape)

(9557, 102)
(23856, 102)


## Classification: KNeighborsClassifier

In [9]:
nearest_neighbors = KNeighborsClassifier()

In [10]:
model = nearest_neighbors.fit(X_train, y_train)

In [20]:
prediction_train = model.predict(X_train)
prediction_test = model.predict(X_test)

In [21]:
f1_score(y_train, prediction_train, average=None)  

array([ 0.77424023,  0.76852459,  0.70423847,  0.90993219])

In [22]:
accuracy_score(y_train, prediction_train)

0.85445223396463321

In [23]:
unique_elements, counts_elements = np.unique(prediction_train, return_counts=True)
print(unique_elements)
print(counts_elements)

[1 2 3 4]
[ 627 1453  938 6539]


In [24]:
unique_elements, counts_elements = np.unique(prediction_test, return_counts=True)
print(unique_elements)
print(counts_elements)

[1 2 3 4]
[ 1396  3239  1945 17276]


In [25]:
print(type(prediction_test))
print(type(test["Id"]))

d = {"Id": test["Id"], "Target": prediction_test}
submission = pd.DataFrame(d)
submission.head()

submission.to_csv("../../../../submission/sklearn/kNearestNeighbors/01.csv", index=False)

<type 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


## Hyper-parameters tuning

In [26]:
# param_grid = [{"n_neighbors": [5, 10, 15, 20]}]

param_grid = [{"n_neighbors": [5, 10, 15, 20]},]

In [27]:
grid = GridSearchCV(nearest_neighbors, cv=3, param_grid=param_grid, verbose=1)

In [28]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  2.4min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_neighbors': [5, 10, 15, 20]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [29]:
grid.best_params_

{'n_neighbors': 20}

In [30]:
grid.best_index_

3

In [31]:
grid.best_score_

0.59275923406926856

In [32]:
grid.cv_results_



{'mean_fit_time': array([ 0.09880233,  0.11487357,  0.09499566,  0.08235804]),
 'mean_score_time': array([ 3.44763128,  4.51167536,  4.79359802,  3.65270472]),
 'mean_test_score': array([ 0.53531443,  0.56900701,  0.58543476,  0.59275923]),
 'mean_train_score': array([ 0.86308623,  0.80946022,  0.76948852,  0.73783651]),
 'param_n_neighbors': masked_array(data = [5 10 15 20],
              mask = [False False False False],
        fill_value = ?),
 'params': [{'n_neighbors': 5},
  {'n_neighbors': 10},
  {'n_neighbors': 15},
  {'n_neighbors': 20}],
 'rank_test_score': array([4, 3, 2, 1], dtype=int32),
 'split0_test_score': array([ 0.63445246,  0.62974584,  0.62974584,  0.63068717]),
 'split0_train_score': array([ 0.87394035,  0.81522763,  0.76985871,  0.74285714]),
 'split1_test_score': array([ 0.53672316,  0.58631513,  0.6007533 ,  0.61268048]),
 'split1_train_score': array([ 0.86187412,  0.81211741,  0.77067964,  0.73379375]),
 'split2_test_score': array([ 0.43467337,  0.49089196,  0.

In [33]:
prediction_train = grid.predict(X_train)

In [34]:
unique_elements, counts_elements = np.unique(prediction_train, return_counts=True)
print(unique_elements)
print(counts_elements)

[1 2 3 4]
[ 257  942  346 8012]


In [35]:
unique_elements, counts_elements = np.unique(grid.predict(X_test), return_counts=True)
print(unique_elements)
print(counts_elements)

[1 2 3 4]
[  497  1993   572 20794]


In [37]:
prediction_test = grid.predict(X_test)

In [38]:
print(type(prediction_test))
print(type(test["Id"]))

d = {"Id": test["Id"], "Target": prediction_test}
submission = pd.DataFrame(d)
submission.head()

submission.to_csv("../../../../submission/sklearn/kNearestNeighbors/02.csv", index=False)

<type 'numpy.ndarray'>
<class 'pandas.core.series.Series'>
