In [38]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [9]:
train_df = pd.read_csv("./inputs/train.csv").set_index("id")
target = train_df["rainfall"]
train_df.drop("rainfall", inplace=True, axis=1)
train_df

Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2
1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9
2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1
3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6
4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8
...,...,...,...,...,...,...,...,...,...,...,...
2185,361,1014.6,23.2,20.6,19.1,19.9,97.0,88.0,0.1,40.0,22.1
2186,362,1012.4,17.2,17.3,16.3,15.3,91.0,88.0,0.0,50.0,35.3
2187,363,1013.3,19.0,16.3,14.3,12.6,79.0,79.0,5.0,40.0,32.9
2188,364,1022.3,16.4,15.2,13.8,14.7,92.0,93.0,0.1,40.0,18.0


In [10]:
# Scale the data
scaler = StandardScaler()
scaler.fit(train_df)
scaled_data = scaler.transform(train_df)
scaled_data

array([[-1.70136083,  0.67170214, -0.91380916, ..., -0.72939738,
        -0.56090052, -0.4652908 ],
       [-1.69185328,  1.04311572, -1.79828913, ..., -1.03280391,
        -0.6859253 ,  0.00962944],
       [-1.68234573,  1.85668833, -1.23222195, ...,  1.25653632,
        -0.43587575, -0.37434863],
       ...,
       [ 1.74037217, -0.05343865, -1.30298035, ...,  0.34631671,
        -0.81095007,  1.12114491],
       [ 1.74987972,  1.53833383, -1.76290993, ..., -1.0052215 ,
        -0.81095007, -0.38445331],
       [ 1.75938727,  0.03499315, -0.91380916, ..., -0.75697979,
        -0.43587575,  2.6469525 ]])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(scaled_data, target, test_size=0.2, random_state=42)

In [44]:
clf = DecisionTreeClassifier(max_depth=4, min_samples_split=5, min_samples_leaf=6)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score = roc_auc_score(y_test, y_pred)
print(score)

0.7802086351782092


In [41]:
parameter = {
    'criterion': ["gini", "entropy"],
    'max_depth': [1, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}
base_model = DecisionTreeClassifier()
grid = GridSearchCV(base_model, parameter, scoring="roc_auc", n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
score = roc_auc_score(y_test, y_pred)
print(score)

Fitting 5 folds for each of 1980 candidates, totalling 9900 fits
0.7802086351782092


In [42]:
print(grid.best_score_)
print(grid.best_params_)

0.8794051162747772
{'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 5, 'min_samples_split': 6}
