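## Regression model notebook

Compares gradient boosting, random forest and linear regression (each tuned with grid-search cross-validation) for predicting `emission`.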

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [4]:
# Load the training data. The directory part of the path is a redacted
# placeholder; the opening quote was missing, which made this cell a syntax error.
df = pd.read_csv('-------/train.csv')

In [5]:
# Features: everything except the row identifier and the target column.
x = df.drop(columns=['ID_LAT_LON_YEAR_WEEK', 'emission'])
# Target: kept as a single-column DataFrame (double brackets), not a Series.
y = df.loc[:, ['emission']]

In [10]:
from sklearn.impute import SimpleImputer

# Non-object feature columns that contain at least one missing value.
null_cols_numeric = [
    col for col in x.columns
    if x[col].isnull().any() and x[col].dtype != 'object'
]

# Replace missing values with the column mean.
# NOTE(review): the imputer is fit on the full feature matrix, before the
# train/test split performed later in the notebook, so test rows contribute
# to the means. Consider fitting on the training split only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Guard: SimpleImputer cannot be fit on a frame with zero columns.
if null_cols_numeric:
    # Assign the raw ndarray (positional) instead of wrapping it in a new
    # DataFrame: a fresh DataFrame carries a default RangeIndex, and pandas
    # aligns on index during assignment, which would silently reintroduce
    # NaN if x ever has a non-default index.
    x[null_cols_numeric] = imputer.fit_transform(x[null_cols_numeric])

In [11]:
# Hold out 30% of the rows for testing. The seed is fixed so the split
# (and every score computed from it) is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

### Defining the hyperparameter grid for Grid Search Cross validation

In [7]:
# Hyperparameter grid for GradientBoostingRegressor.
param_grid_gbr = {
    'max_depth': range(1,6),
    'n_estimators': range(50,101,10),
    'max_features':['sqrt'],
    'random_state':[42],
    # NOTE(review): warm_start presumably has no effect under GridSearchCV,
    # which fits a fresh clone per candidate — confirm and consider removing.
    'warm_start': [True]
}

# Hyperparameter grid for RandomForestRegressor.
param_grid_rfr = {
    'n_estimators':[50,100,150],
    'max_depth': [None,10,20],
    'max_features':['sqrt'],
    # Fixed seed, consistent with the gradient boosting grid, so the forest
    # results are reproducible between runs.
    'random_state':[42]
}

# Hyperparameter grid for LinearRegression.
param_grid_lr = {
    'fit_intercept': [True, False],
    # NOTE(review): copy_X looks like a memory-handling flag rather than a
    # model hyperparameter; it doubles the candidate count — verify it is needed.
    'copy_X':[True, False]
}

### Initialize GridSearchCV for each regressor

In [8]:
# Settings shared by all three searches:
#   n_jobs = -1 for parallel processing
#   cv is the K of K-fold validation
#   verbose = 1 prints the fit summary line
shared_search_kwargs = {'n_jobs': -1, 'cv': 2, 'verbose': 1}

# One grid search per candidate model, each with its own parameter grid.
grid_gbr = GridSearchCV(GradientBoostingRegressor(), param_grid=param_grid_gbr, **shared_search_kwargs)

grid_rfr = GridSearchCV(RandomForestRegressor(), param_grid=param_grid_rfr, **shared_search_kwargs)

grid_lr = GridSearchCV(LinearRegression(), param_grid=param_grid_lr, **shared_search_kwargs)


In [16]:
# Fit every grid search on the training split.
# y_train is a single-column DataFrame, so flatten it to 1-D once and reuse it.
y_train_1d = y_train.values.ravel()
grid_gbr.fit(x_train, y_train_1d)
grid_rfr.fit(x_train, y_train_1d)
grid_lr.fit(x_train, y_train_1d)

Fitting 2 folds for each of 30 candidates, totalling 60 fits


  y = column_or_1d(y, warn=True)


Fitting 2 folds for each of 9 candidates, totalling 18 fits
Fitting 2 folds for each of 4 candidates, totalling 8 fits


In [17]:
# Select the grid search with the highest cross-validation score.
# Ties resolve in priority order: gradient boosting first, then random forest;
# linear regression is the fallback when neither of those is at least as good
# as every other candidate.
all_searches = (grid_gbr, grid_rfr, grid_lr)

best_regressor = grid_lr
for search in (grid_gbr, grid_rfr):
    if all(search.best_score_ >= rival.best_score_ for rival in all_searches):
        best_regressor = search
        break

print('Best Regressor CV score: {:.4f}'.format(best_regressor.best_score_))
print('Best Regressor best params : \n{}'.format(best_regressor.best_params_))

Best Regressor CV score: 0.3761
Best Regressor best params : 
{'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 100, 'random_state': 42, 'warm_start': True}


In [18]:
# Report the selected regressor and score it on the hold-out split.
# best_regressor is chosen in the preceding cell; the selection logic was a
# verbatim copy here and has been dropped. The unused prediction call was
# also removed, since the metrics cell computes its own predictions.

print('Best Regressor CV score: {:.4f}'.format(best_regressor.best_score_))
print('Best Regressor best params : \n{}'.format(best_regressor.best_params_))

# score() on the fitted search evaluates the refit best estimator on the test data.
print('Best regressor test score (R-Squared):{:.4f}'.format(best_regressor.score(x_test,y_test)))

Best Regressor CV score: 0.3761
Best Regressor best params : 
{'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 100, 'random_state': 42, 'warm_start': True}
Best regressor test score (R-Squared):0.3578


In [19]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict once on the hold-out split and derive both metrics from that result.
best_predictions = best_regressor.predict(x_test)
mse = mean_squared_error(y_true=y_test, y_pred=best_predictions)
r2 = r2_score(y_true=y_test, y_pred=best_predictions)

# Print each metric to four decimal places.
for template, value in (
    ('Mean squared error (MSE):{:.4f}', mse),
    ('R-Squared (R2) Score:{:.4f}', r2),
):
    print(template.format(value))

Mean squared error (MSE):11577.5279
R-Squared (R2) Score:0.3578
