In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [2]:
train_df = pd.read_csv('./inputs/train.csv').set_index('id')
target = train_df["rainfall"]
train_df.drop("day", inplace=True, axis=1)
train_df.drop("rainfall", inplace=True, axis=1)
train_df

Unnamed: 0_level_0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2
1,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9
2,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1
3,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6
4,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8
...,...,...,...,...,...,...,...,...,...,...
2185,1014.6,23.2,20.6,19.1,19.9,97.0,88.0,0.1,40.0,22.1
2186,1012.4,17.2,17.3,16.3,15.3,91.0,88.0,0.0,50.0,35.3
2187,1013.3,19.0,16.3,14.3,12.6,79.0,79.0,5.0,40.0,32.9
2188,1022.3,16.4,15.2,13.8,14.7,92.0,93.0,0.1,40.0,18.0


In [3]:
# Experiment: Which performs better between fitting the whole dataset or fitting X_train first then using that to transform y_train?
scaler = StandardScaler()
scaled_data = scaler.fit_transform(train_df)
scaled_data

array([[ 0.67170214, -0.91380916, -0.64219876, ..., -0.72939738,
        -0.56090052, -0.4652908 ],
       [ 1.04311572, -1.79828913, -1.35084574, ..., -1.03280391,
        -0.6859253 ,  0.00962944],
       [ 1.85668833, -1.23222195, -1.50406671, ...,  1.25653632,
        -0.43587575, -0.37434863],
       ...,
       [-0.05343865, -1.30298035, -1.46576147, ...,  0.34631671,
        -0.81095007,  1.12114491],
       [ 1.53833383, -1.76290993, -1.6764403 , ..., -1.0052215 ,
        -0.81095007, -0.38445331],
       [ 0.03499315, -0.91380916, -0.92948808, ..., -0.75697979,
        -0.43587575,  2.6469525 ]])

In [4]:
X_train, X_test, y_train, y_test = train_test_split(scaled_data, target, test_size=0.2, random_state=42)

In [13]:
clf = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score = roc_auc_score(y_test, y_pred)
print(score)

0.7926819630673585


In [6]:
# parameters = {
#     "n_estimators": [100, 1000, 10000],
#     "max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
#     "random_state": [21, 34, 42, 50]
# }
# base_model = RandomForestClassifier()
# grid = GridSearchCV(base_model, parameters, scoring="roc_auc", n_jobs=-1, verbose=3)
# grid.fit(X_train, y_train)
# y_pred = grid.predict(X_test)
# score = roc_auc_score(y_test, y_pred)
# print(score)

n_estimators_param = [100, 1000, 10000]
max_depth_param = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
random_state_param = [21, 34, 42, 50]
arr = []

for i in n_estimators_param:
    for j in max_depth_param:
        for k in random_state_param:
            clf = RandomForestClassifier(n_estimators=i, max_depth=j, random_state=k)
            clf.fit(X_train,y_train)
            predictions = clf.predict(X_test)
            score = roc_auc_score(y_test,predictions)
            arr.append([i,j,k,score])

arr

[[100, 1, 1, 0.7075814651879562],
 [100, 1, 2, 0.7065145807539317],
 [100, 1, 3, 0.7033797845156873],
 [100, 1, 4, 0.7186191090856405],
 [100, 1, 5, 0.7049471826348095],
 [100, 1, 6, 0.7075814651879562],
 [100, 1, 7, 0.7065145807539317],
 [100, 1, 8, 0.6444903980400938],
 [100, 1, 9, 0.6581623244909249],
 [100, 1, 10, 0.6996786175285161],
 [100, 2, 1, 0.7464634756724007],
 [100, 2, 2, 0.7464634756724007],
 [100, 2, 3, 0.7506651563446696],
 [100, 2, 4, 0.7738073285740629],
 [100, 2, 5, 0.7685387634677695],
 [100, 2, 6, 0.7548668370169385],
 [100, 2, 7, 0.7548668370169385],
 [100, 2, 8, 0.7753747266931852],
 [100, 2, 9, 0.7448960775532785],
 [100, 2, 10, 0.7685387634677695],
 [100, 3, 1, 0.7685387634677695],
 [100, 3, 2, 0.7727404441400385],
 [100, 3, 3, 0.7811438054845763],
 [100, 3, 4, 0.7759410974421117],
 [100, 3, 5, 0.7727404441400385],
 [100, 3, 6, 0.779576407365454],
 [100, 3, 7, 0.7827112036036985],
 [100, 3, 8, 0.7764416111272094],
 [100, 3, 9, 0.780076921050552],
 [100, 3, 10, 

In [7]:
X_train, X_test, y_train, y_test = train_test_split(train_df, target, test_size=0.2, random_state=42)

In [8]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
clf = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=3)
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
score = roc_auc_score(y_test, y_pred)
print(score)

0.7926819630673585


In [12]:
for i in arr:
    if i[3] > 0.79:
        print(i)

[100, 7, 3, 0.7926819630673585]
[100, 8, 8, 0.7921814493822608]
[100, 9, 3, 0.7926819630673585]
