## Import package

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

## Load dataset: Train

In [2]:
train = pd.read_csv("../../../../data/train/train.csv")
test = pd.read_csv("../../../../data/test/test.csv")

In [3]:
print(train.shape)
print(test.shape)

(9557, 143)
(23856, 142)


In [4]:
train.head(5)

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
0,ID_279628684,190000.0,0,3,0,1,1,0,,0,...,100,1849,1,100,0,1.0,0.0,100.0,1849,4
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,144,4489,1,144,0,1.0,64.0,144.0,4489,4
2,ID_68de51c94,,0,8,0,1,1,0,,0,...,121,8464,1,0,0,0.25,64.0,121.0,8464,4
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,81,289,16,121,4,1.777778,1.0,121.0,289,4
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,121,1369,16,121,4,1.777778,1.0,121.0,1369,4


In [5]:
test.head(5)

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,age,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq
0,ID_2f6873615,,0,5,0,1,1,0,,1,...,4,0,16,9,0,1,2.25,0.25,272.25,16
1,ID_1c78846d2,,0,5,0,1,1,0,,1,...,41,256,1681,9,0,1,2.25,0.25,272.25,1681
2,ID_e5442cf6a,,0,5,0,1,1,0,,1,...,41,289,1681,9,0,1,2.25,0.25,272.25,1681
3,ID_a8db26a79,,0,14,0,1,1,1,1.0,0,...,59,256,3481,1,256,0,1.0,0.0,256.0,3481
4,ID_a62966799,175000.0,0,4,0,1,1,1,1.0,0,...,18,121,324,1,0,1,0.25,64.0,,324


In [6]:
f = open("../../resources/continuousFeatures", "r")
continuous_columns = f.read().split(",")
f.close()

f = open("../../resources/categoricalFeatures", "r")
categorical_columns = f.read().split(",")
f.close()

print("Number of numerical columns: {0}".format(len(continuous_columns)))
print("Number of categorical columns: {0}".format(len(categorical_columns)))

Number of numerical columns: 38
Number of categorical columns: 102


In [7]:
y_train = train["Target"]
print(y_train.shape)

(9557,)


In [8]:
X_train = train[categorical_columns]
X_test = test[categorical_columns]
print(X_train.shape)
print(X_test.shape)

(9557, 102)
(23856, 102)


## Classification: KNeighborsClassifier

In [9]:
nearest_neighbors = KNeighborsClassifier()

In [10]:
model = nearest_neighbors.fit(X_train, y_train)

In [20]:
prediction_train = model.predict(X_train)
prediction_test = model.predict(X_test)

In [21]:
f1_score(y_train, prediction_train, average=None)

array([ 0.77424023,  0.76852459,  0.70423847,  0.90993219])

In [22]:
accuracy_score(y_train, prediction_train)

0.85445223396463321

In [23]:
unique_elements, counts_elements = np.unique(prediction_train, return_counts=True)
print(unique_elements)
print(counts_elements)

[1 2 3 4]
[ 627 1453  938 6539]


In [24]:
unique_elements, counts_elements = np.unique(prediction_test, return_counts=True)
print(unique_elements)
print(counts_elements)

[1 2 3 4]
[ 1396  3239  1945 17276]


In [25]:
print(type(prediction_test))
print(type(test["Id"]))

d = {"Id": test["Id"], "Target": prediction_test}
submission = pd.DataFrame(d)
submission.head()

submission.to_csv("../../../../submission/sklearn/kNearestNeighbors/01.csv", index=False)

<type 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


## Hyper-parameters tuning

In [39]:
# param_grid = [{"n_neighbors": [5, 10, 15, 20]}]

param_grid = [{"n_neighbors": [3, 5, 7, 9],
             "weights": ["uniform", "distance"],
             "algorithm": ["ball_tree", "kd_tree", "brute"],
             "leaf_size": [10, 20, 30],
             "p": [1, 2]
             }]

In [40]:
grid = GridSearchCV(nearest_neighbors, cv=3, param_grid=param_grid, verbose=1)

In [41]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed: 56.2min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance'], 'leaf_size': [10, 20, 30], 'algorithm': ['ball_tree', 'kd_tree', 'brute'], 'p': [1, 2]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [42]:
grid.best_params_

{'algorithm': 'ball_tree',
 'leaf_size': 20,
 'n_neighbors': 9,
 'p': 2,
 'weights': 'uniform'}

In [43]:
grid.best_index_

30

In [44]:
grid.best_score_

0.57486658993407969

In [45]:
grid.cv_results_

{'mean_fit_time': array([ 0.08875338,  0.08577339,  0.10611677,  0.09775933,  0.10907729,
         0.13958526,  0.10730131,  0.09312924,  0.12416569,  0.09745391,
         0.07978439,  0.12233067,  0.09574413,  0.11512065,  0.11436335,
         0.0996627 ,  0.08746831,  0.08421032,  0.10267091,  0.09105237,
         0.12552905,  0.10098569,  0.10506066,  0.09472863,  0.11198306,
         0.10858639,  0.08229272,  0.11188563,  0.09853601,  0.10738866,
         0.10597261,  0.09893274,  0.09339801,  0.08620334,  0.09791096,
         0.10310229,  0.09418392,  0.11057766,  0.08380032,  0.07695897,
         0.08960764,  0.08943502,  0.10672299,  0.0819037 ,  0.08261601,
         0.08882904,  0.10873501,  0.09175237,  0.09552964,  0.10066422,
         0.11202399,  0.12128496,  0.09210936,  0.11010567,  0.115254  ,
         0.09397801,  0.07268572,  0.07606705,  0.07266633,  0.08402133,
         0.07401864,  0.07332261,  0.0702606 ,  0.08653736,  0.08127666,
         0.08575638,  0.07175104, 

In [46]:
prediction_train = grid.predict(X_train)

In [47]:
unique_elements, counts_elements = np.unique(prediction_train, return_counts=True)
print(unique_elements)
print(counts_elements)

[1 2 3 4]
[ 510 1190  641 7216]


In [48]:
unique_elements, counts_elements = np.unique(grid.predict(X_test), return_counts=True)
print(unique_elements)
print(counts_elements)

[1 2 3 4]
[  989  2476  1174 19217]


In [49]:
prediction_test = grid.predict(X_test)

In [50]:
print(type(prediction_test))
print(type(test["Id"]))

d = {"Id": test["Id"], "Target": prediction_test}
submission = pd.DataFrame(d)
submission.head()

submission.to_csv("../../../../submission/sklearn/kNearestNeighbors/02.csv", index=False)

<type 'numpy.ndarray'>
<class 'pandas.core.series.Series'>
