In [85]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [86]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [87]:
# Directory holding the competition CSVs. Kept in one place so the notebook can
# be pointed at another machine's copy by editing a single line.
DATA_DIR = "C:/Users/Görkem/DrivenData"

# Feature table and label table are read from separate files; the later
# train_test_split call pairs their rows by position.
train_data = pd.read_csv(f"{DATA_DIR}/train_values.csv", delimiter=",")
train_label = pd.read_csv(f"{DATA_DIR}/train_labels.csv", delimiter=",")

In [77]:
# Optional speed-up: work on a 30% random subsample of the training rows.
# A fixed seed makes the subsample (and therefore every downstream score)
# reproducible across kernel restarts.
# NOTE: this cell overwrites train_data/train_label, so running it twice
# samples 30% of the already-reduced frame.
train_data = train_data.sample(frac=.30, random_state=42)
indices = train_data.index
# `indices` holds index *labels*, so select with .loc; .iloc would only pick the
# right rows while train_label still has its untouched 0..n-1 index.
train_label = train_label.loc[indices]

In [88]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data,
    train_label,
    test_size=0.2,
    random_state=42,
)

In [89]:
# Pull the target column out of the full, training and validation label frames.
y_fulltrain_labels, y_train_labels, y_valid_labels = (
    label_frame['damage_grade']
    for label_frame in (train_label, y_train, y_valid)
)

In [91]:
class Encoder(BaseEstimator, TransformerMixin):
    """One-hot encode the non-numeric columns of a DataFrame via ``pd.get_dummies``.

    ``pd.get_dummies`` only creates columns for the categories present in the
    frame it is given, so encoding two frames independently can yield different
    column sets. To keep train and validation/test frames aligned, ``fit``
    records the dummy columns of the fitting frame and ``transform`` reindexes
    its output to exactly those columns (missing ones are filled with 0, unseen
    ones are discarded).

    Calling ``transform`` on an instance that was never fitted keeps the
    previous behaviour: a plain ``pd.get_dummies`` of the input.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Remember the encoded column layout of ``X``; returns ``self``."""
        self.columns_ = pd.get_dummies(X).columns
        return self

    def transform(self, X):
        """Return a new, one-hot encoded frame; ``X`` itself is not modified."""
        encoded = pd.get_dummies(X)
        fitted_columns = getattr(self, "columns_", None)
        if fitted_columns is not None:
            # Align to the layout learned in fit so downstream steps always see
            # the same features in the same order.
            encoded = encoded.reindex(columns=fitted_columns, fill_value=0)
        return encoded

In [92]:
class ColumnDrop(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed set of columns.

    Parameters
    ----------
    items : label or list of labels
        Column name(s) handed to ``DataFrame.drop`` in ``transform``.
    """

    def __init__(self, items):
        self.items = items

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the columns listed in ``self.items``."""
        return X.copy().drop(columns=self.items)

In [93]:
# Columns removed from the encoded frames before modelling.
# The suffixed names (e.g. 'roof_type_n') are presumably one-hot columns created
# by the encoding step, since the drop runs after it; the selection rationale
# is not shown in this notebook.
drop_features = [
    'building_id',
    'count_floors_pre_eq',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'land_surface_condition_n',
    'land_surface_condition_t',
    'foundation_type_u',
    'roof_type_n',
    'roof_type_q',
    'ground_floor_type_f',
    'ground_floor_type_m',
    'other_floor_type_s',
    'other_floor_type_x',
    'position_o',
    'position_s',
    'position_t',
    'plan_configuration_c',
    'plan_configuration_f',
    'plan_configuration_m',
    'plan_configuration_n',
    'plan_configuration_o',
    'plan_configuration_s',
]

In [94]:
# Preprocessing and model components used by the cells below.
column_drop = ColumnDrop(items=drop_features)
encoder = Encoder()
sca = StandardScaler()
# Baseline classifier: 14 nearest neighbours, using all available cores.
clf = KNeighborsClassifier(n_neighbors=14, n_jobs=-1)


In [95]:
# One-hot encode both splits. The training frame defines the feature layout;
# the validation frame is then aligned to it, because get_dummies only emits
# columns for categories that actually occur in the frame it is given.
X_train = encoder.fit_transform(X_train)
X_valid = encoder.transform(X_valid).reindex(columns=X_train.columns, fill_value=0)

# Drop the unwanted columns only after alignment, so every name in
# drop_features exists in both frames.
# NOTE: this cell overwrites X_train/X_valid; re-running it without re-running
# the split will fail because the dropped columns are already gone.
X_train = column_drop.transform(X_train)
X_valid = column_drop.transform(X_valid)

In [96]:
# Standardise the features, then classify with the baseline KNN model.
pipe = Pipeline(
    steps=[
        ("Scaler", sca),
        ("knn", clf),
    ]
)

In [97]:
# Fit the scaler and the KNN classifier on the encoded training split.
pipe.fit(X_train, y_train_labels)

Pipeline(memory=None,
         steps=[('Scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=-1, n_neighbors=14, p=2,
                                      weights='uniform'))],
         verbose=False)

In [98]:
# Sanity check: (rows, columns) of the validation frame after encoding and column dropping.
X_valid.shape

(52121, 43)

In [99]:
# Score the fitted pipeline on the held-out split with micro-averaged F1.
y_pred = pipe.predict(X_valid)
score = f1_score(y_true=y_valid_labels, y_pred=y_pred, average="micro")

In [100]:
# Validation micro-F1 of the fitted pipeline.
print(score)

0.6607509449166363


In [84]:
# Sweep the number of neighbours (even values 6..48) and record the validation
# micro-F1 for each setting. `pipelines[i]` and `scores[i]` correspond to
# `nb_list[i]`.
nb_list = list(range(6, 50, 2))
pipelines = []
scores = []
for nb in nb_list:
    # Loop-local names are used on purpose: rebinding the notebook-level
    # `clf`, `pipe`, `y_pred` and `score` here would make the baseline
    # evaluation cells silently report the last swept model when re-run.
    knn_nb = KNeighborsClassifier(n_jobs=-1, n_neighbors=nb)
    # NOTE(review): every pipeline shares the single `sca` instance, which is
    # refit on each iteration; harmless while all fits use the same X_train.
    pipe_nb = Pipeline([("Scaler", sca), ("knn", knn_nb)])
    pipelines.append(pipe_nb)

    pipe_nb.fit(X_train, y_train_labels)
    y_pred_nb = pipe_nb.predict(X_valid)
    score_nb = f1_score(y_valid_labels, y_pred_nb, average='micro')
    scores.append(score_nb)
    print(nb, " neighbors fitted with score :", score_nb)

6  neighbors fitted with score : 0.6266308518802763
8  neighbors fitted with score : 0.6307879253005884
10  neighbors fitted with score : 0.6287413660782809
12  neighbors fitted with score : 0.6284215911997953
14  neighbors fitted with score : 0.6305321053977999
16  neighbors fitted with score : 0.6281018163213098
18  neighbors fitted with score : 0.6252238424149399
20  neighbors fitted with score : 0.6245203376822717
22  neighbors fitted with score : 0.6243284727551803
24  neighbors fitted with score : 0.621194678946022
26  neighbors fitted with score : 0.6229854182655411
28  neighbors fitted with score : 0.624712202609363
30  neighbors fitted with score : 0.6228575083141469
32  neighbors fitted with score : 0.6237528779739063
34  neighbors fitted with score : 0.6226656433870555
36  neighbors fitted with score : 0.6214504988488104
38  neighbors fitted with score : 0.6226016884113584
40  neighbors fitted with score : 0.6216423637759018
42  neighbors fitted with score : 0.62113072397032