In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

In [4]:
train_df = pd.read_csv("./inputs/train.csv")
target = train_df["rainfall"]
train_df.drop("day", inplace=True, axis=1)
train_df.drop("rainfall", inplace=True, axis=1)
train_df

Unnamed: 0,id,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
0,0,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2
1,1,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9
2,2,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1
3,3,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6
4,4,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8
...,...,...,...,...,...,...,...,...,...,...,...
2185,2185,1014.6,23.2,20.6,19.1,19.9,97.0,88.0,0.1,40.0,22.1
2186,2186,1012.4,17.2,17.3,16.3,15.3,91.0,88.0,0.0,50.0,35.3
2187,2187,1013.3,19.0,16.3,14.3,12.6,79.0,79.0,5.0,40.0,32.9
2188,2188,1022.3,16.4,15.2,13.8,14.7,92.0,93.0,0.1,40.0,18.0


In [5]:
scaler = StandardScaler()
scaler.fit(train_df)
scaled_data = scaler.transform(train_df)
scaled_data

array([[-1.7312601 ,  0.67170214, -0.91380916, ..., -0.72939738,
        -0.56090052, -0.4652908 ],
       [-1.72967832,  1.04311572, -1.79828913, ..., -1.03280391,
        -0.6859253 ,  0.00962944],
       [-1.72809653,  1.85668833, -1.23222195, ...,  1.25653632,
        -0.43587575, -0.37434863],
       ...,
       [ 1.72809653, -0.05343865, -1.30298035, ...,  0.34631671,
        -0.81095007,  1.12114491],
       [ 1.72967832,  1.53833383, -1.76290993, ..., -1.0052215 ,
        -0.81095007, -0.38445331],
       [ 1.7312601 ,  0.03499315, -0.91380916, ..., -0.75697979,
        -0.43587575,  2.6469525 ]])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(scaled_data, target, test_size=0.2, random_state=42)

In [13]:
clf = KNeighborsClassifier(n_neighbors=25, weights="uniform", leaf_size=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score = roc_auc_score(y_test, y_pred)
print(score)

0.7743078422591607


In [10]:
parameters = {
    "n_neighbors": [5, 10, 15, 20, 25],
    "weights": ["uniform", "distance"],
    "leaf_size": [1, 2, 3, 4, 5]
}
base_model = KNeighborsClassifier()
grid = GridSearchCV(base_model, parameters, scoring="roc_auc", n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
score = roc_auc_score(y_test, y_pred)
print(score)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
0.7743078422591607


In [11]:
print(grid.best_params_)
print(grid.best_score_)

{'leaf_size': 1, 'n_neighbors': 25, 'weights': 'uniform'}
0.8787154140189488
