In [163]:
import pandas as pd
import numpy as np


from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_absolute_error,mean_squared_error, accuracy_score,f1_score

In [164]:
# Read the modelling dataset from its CSV file into a DataFrame.
dataset_path = r'C:\Melbourne_Escooter\RealDataset\FinalResults\SA1\Features-trips\Final\final version\DATASET.csv'
df = pd.read_csv(dataset_path)

In [165]:
# Remove the SA1_CODE21 column so it is not passed on to scaling/modelling.
column = ['SA1_CODE21']
df = df.drop(columns=column)

In [166]:
# Display the column labels currently in df.
df.columns

Index(['tripDensity', 'date', 'hour', 'rainfall', 'min_temp', 'trainDensity',
       'busDensity', 'tramDensity', 'Cafe perc', 'Office perc', 'Shops perc',
       'mxi', 'recreationCount', 'campusCount', 'female%', 'male%',
       'populationDensity', '5_14', '15_29', '30_39', '40_49'],
      dtype='object')

In [167]:
scale= StandardScaler()

In [168]:
df_sc= scale.fit_transform(df)
df_sc=pd.DataFrame(df_sc, columns=df.columns)

In [169]:
print(len(df_sc.columns))

21


In [170]:
y=df_sc['tripDensity']
X=df_sc.drop(['tripDensity'], axis=1)

In [171]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=24)

In [172]:
# Random forest regressor. The hyperparameters match the grid-search
# best_params_ output shown in this notebook.
model = RandomForestRegressor(
    n_estimators=1000,
    criterion="squared_error",
    min_samples_leaf=6,
    min_samples_split=12,
    max_depth=30,
    max_features=5,
    # Seed the bootstrap / feature sub-sampling so that scores and feature
    # importances are reproducible when the notebook is re-run.
    random_state=24,
)

In [173]:
# Train the forest on the training split. fit() returns the estimator itself,
# so LL_rf and model refer to the same fitted object.
model.fit(X_train, y_train)
LL_rf = model

In [174]:
# R² of the fitted forest on the data it was trained on.
score = LL_rf.score(X=X_train, y=y_train)
score

0.35600537868943605

In [175]:
# Predictions for the held-out test rows (same standardised scale as y).
ypred = LL_rf.predict(X=X_test)

In [176]:
# R² of the fitted forest on the held-out test split.
test_score = LL_rf.score(X=X_test, y=y_test)
test_score

0.2174859743761559

In [177]:
# Mean absolute error on the test split. The target was standardised, so this
# is expressed in scaled tripDensity units, not raw trip density.
mae = mean_absolute_error(y_true=y_test, y_pred=ypred)
mae

0.3549509131071645

In [178]:
# Mean squared error on the test split (scaled tripDensity units, squared).
mse = mean_squared_error(y_true=y_test, y_pred=ypred)
mse

0.6892496790879687

In [179]:
# Root mean squared error: square root of the test-split MSE computed above.
rmse=np.sqrt(mse)
rmse

0.8302106233287844

In [151]:
from sklearn.pipeline import Pipeline

In [152]:
# Impurity-based importance of each feature in the fitted forest,
# labelled by column name and listed from most to least important.
feature_list = list(X.columns)
importance_by_feature = pd.Series(LL_rf.feature_importances_, index=feature_list)
feature_importance = importance_by_feature.sort_values(ascending=False)
print(feature_importance)

hour                 0.236662
populationDensity    0.157496
date                 0.093217
min_temp             0.076007
Cafe perc            0.067623
tramDensity          0.057705
mxi                  0.052863
busDensity           0.048671
5_14                 0.044604
15_29                0.034890
male%                0.029805
female%              0.023279
Office perc          0.020270
30_39                0.019439
rainfall             0.015388
Shops perc           0.010819
40_49                0.009636
recreationCount      0.000870
trainDensity         0.000598
campusCount          0.000158
dtype: float64


In [109]:
### Hyperparameter tuning (grid search over random-forest settings)

In [110]:
from sklearn.model_selection import GridSearchCV

In [132]:
# Candidate hyperparameter values for the grid search:
# 1 * 3 * 2 * 3 * 3 * 2 = 108 combinations.
param_grid = dict(
    bootstrap=[True],
    max_depth=[10, 20, 30],
    max_features=[4, 5],
    min_samples_leaf=[4, 5, 6],
    min_samples_split=[10, 12, 14],
    n_estimators=[300, 1000],
)

In [133]:
# Base estimator for the grid search. It is seeded so that every candidate is
# compared under the same randomness and best_params_ is reproducible on re-run.
rf = RandomForestRegressor(random_state=24)

In [134]:
# Exhaustive search over param_grid with 5-fold cross-validation,
# using all available cores and printing per-fit progress.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [135]:
# Run the search on the training split only; the test split is not used for tuning.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [10, 20, 30],
                         'max_features': [4, 5], 'min_samples_leaf': [4, 5, 6],
                         'min_samples_split': [10, 12, 14],
                         'n_estimators': [300, 1000]},
             verbose=2)

In [136]:
# Show the hyperparameter combination selected by the grid search.
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 30,
 'max_features': 5,
 'min_samples_leaf': 6,
 'min_samples_split': 12,
 'n_estimators': 1000}

In [153]:
# Cross-validation (R² per fold)

In [154]:
test_score= cross_val_score(LL_rf, X_test, y_test, scoring='r2')

In [155]:
cv_r2=test_score.mean()


In [156]:
# Display the mean cross-validated R².
cv_r2

0.18519713655594622