In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

train=pd.read_csv('data/train.csv').drop('id',axis=1)
test=pd.read_csv('data/test.csv').drop('id',axis=1)
submission=pd.read_csv('data/submission.csv',index_col=0)

train_cpy=train.copy()
test_cpy=test.copy()
submission_cpy=submission.copy()

In [8]:
median=train_cpy.median()
train_fil=train_cpy.fillna(median,inplace=False)
train_fil.isnull().sum()

hour                      0
hour_bef_temperature      0
hour_bef_precipitation    0
hour_bef_windspeed        0
hour_bef_humidity         0
hour_bef_visibility       0
hour_bef_ozone            0
hour_bef_pm10             0
hour_bef_pm2.5            0
count                     0
dtype: int64

In [9]:
bins=[0,500,1000,1500,2000]
labels=[i for i in range(4)]

train_fil['hour_bef_visibility']=\
    pd.cut(
        train_fil['hour_bef_visibility'],
        bins=bins,
        labels=labels,
        include_lowest=True)

train_fil.head()

Unnamed: 0,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
0,20,16.3,1.0,1.5,89.0,1,0.027,76.0,33.0,49.0
1,13,20.1,0.0,1.4,48.0,1,0.042,73.0,40.0,159.0
2,6,13.9,0.0,0.7,79.0,2,0.033,32.0,19.0,26.0
3,23,8.1,0.0,2.7,54.0,1,0.04,75.0,64.0,57.0
4,18,29.5,0.0,4.8,7.0,3,0.057,27.0,11.0,431.0


In [10]:
train_set,valid_set=\
    train_test_split(
        train_fil,
        test_size=.2,
        random_state=42)

In [11]:
X_train=train_set.drop(['count'],axis=1)
y_train=train_set['count']
X_valid=valid_set.drop(['count'],axis=1)
y_valid=valid_set['count']

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [13]:
def get_model_score(model,X_train,y_train,X_val,y_val):
    pred_tr_arr=cross_val_score(model,X_train,y_train,scoring='neg_mean_squared_error',cv=5)
    pred_val_arr=cross_val_score(model,X_val,y_val,scoring='neg_mean_squared_error',cv=5)
    print('train set RMSE : {}'.format(np.round(np.sqrt(np.mean(-1*pred_tr_arr)),2)))
    print('valid set RMSE : {}'.format(np.round(np.sqrt(np.mean(-1*pred_val_arr)),2)))

In [14]:
rf=RandomForestRegressor()

In [15]:
get_model_score(rf,X_train,y_train,X_valid,y_valid)

train set RMSE : 39.25
valid set RMSE : 46.02


In [16]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators=[int(x) for x in np.linspace(start=100,stop=2000,num=10)]
max_depth=[int(x) for x in np.linspace(start=1,stop=100,num=10)]
max_depth.append(None)
min_samples_split=[2,5,10]
min_samples_leaf=[1,2,4]
bootstrap=[True,False]

random_grid={
    'n_estimators':n_estimators,
    'max_depth':max_depth,
    'min_samples_split':min_samples_split,
    'min_samples_leaf':min_samples_leaf,
    'bootstrap':bootstrap
}

In [17]:
rf=RandomForestRegressor()

rf_random=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,
cv=5,verbose=2,random_state=42,n_jobs=-1)
rf_random.fit(X_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [18]:
rf_random.best_params_

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_depth': 34,
 'bootstrap': True}

In [19]:
rf_best=RandomForestRegressor(
    n_estimators=100,
    min_samples_split=2,
    min_samples_leaf=1,
    max_depth=34,
    bootstrap=True)

get_model_score(rf_best,X_train,y_train,X_valid,y_valid)

train set RMSE : 39.51
valid set RMSE : 45.07


In [20]:
median=train_cpy.median()
test_fil=test_cpy.fillna(median,inplace=False)
test_fil.isnull().sum()

hour                      0
hour_bef_temperature      0
hour_bef_precipitation    0
hour_bef_windspeed        0
hour_bef_humidity         0
hour_bef_visibility       0
hour_bef_ozone            0
hour_bef_pm10             0
hour_bef_pm2.5            0
dtype: int64

In [21]:
test_fil['hour_bef_visibility']=\
    pd.cut(
        test_fil['hour_bef_visibility'],
        bins=bins,
        labels=labels,
        include_lowest=True)

In [23]:
X_train_full=train_fil.drop('count',axis=1)
y_train_full=train_fil['count']

In [24]:
rf_best.fit(X_train_full,y_train_full)

In [25]:
pred=rf_best.predict(test_fil)

In [26]:
submission_cpy['count']=pred
submission_cpy

Unnamed: 0_level_0,count
id,Unnamed: 1_level_1
0,89.63
1,222.51
2,73.69
4,33.01
5,78.01
...,...
2148,64.03
2149,68.95
2165,124.34
2166,149.34


In [27]:
submission_cpy.to_csv('jan11sub.csv')