In [1]:
!pip install fast_ml

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import numpy as np
import pandas as pd
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('Preprocessed Data.csv')

# Model Comparison

## Splitting the data

In [3]:
from fast_ml.model_development import train_valid_test_split

df.drop(['ACID','datetime'],axis=1,inplace=True)
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(df, target = 'FF', 
                                                                            train_size=0.7, valid_size=0.2, test_size=0.1)

## Scaling the data

In [4]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_trainsc=scaler.fit_transform(X_train)
X_validsc=scaler.fit_transform(X_valid)
X_testsc=scaler.fit_transform(X_test)

## Comparing Base Models

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [None]:
models={
    "Linear Regression":LinearRegression(),
    "Bayesian Regression":BayesianRidge(),
    "Lasso Regression":Lasso(),
    "Ridge Regression":Ridge(),
    "Decision Tree":DecisionTreeRegressor(max_depth=5),
    "Random Forest Regressor":RandomForestRegressor(n_estimators=5,max_depth=4),
    "Bagging Regressor":BaggingRegressor(n_estimators=5),
    "AdaBoost Regressor":AdaBoostRegressor(n_estimators=10),
    "XGBoost Regressor":XGBRegressor()
    
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_trainsc,y_train)
    
    #predictions for validation
    y_train_pred=model.predict(X_trainsc)
    y_valid_pred=model.predict(X_validsc)
    
    #Training set Performance
    model_train_mae=mean_absolute_error(y_train,y_train_pred)
    model_train_mse=mean_squared_error(y_train,y_train_pred)
    model_train_rmse=mean_squared_error(y_train,y_train_pred,squared=False)
    model_train_r2=r2_score(y_train,y_train_pred)
    model_train_adj_r2= 1 - (1-model_train_r2) * (len(y_train)-1)/(len(y_train)-X_trainsc.shape[1]-1)
    
    #Validation set Performance
    model_valid_mae=mean_absolute_error(y_valid,y_valid_pred)
    model_valid_mse=mean_squared_error(y_valid,y_valid_pred)
    model_valid_rmse=mean_squared_error(y_valid,y_valid_pred,squared=False)
    model_valid_r2=r2_score(y_valid,y_valid_pred)
    model_valid_adj_r2= 1 - (1-model_valid_r2) * (len(y_valid)-1)/(len(y_valid)-X_validsc.shape[1]-1)
    
    print(list(models.keys())[i])
    
    print("Model performance for Training Set")
    print("- MAE: {:.4f}".format(model_train_mae))
    print("- MSE: {:.4f}".format(model_train_mse))
    print("- RMSE: {:.4f}".format(model_train_rmse))
    print("- R2 Score: {:.4f}".format(model_train_r2))
    print("- Adjusted R2 Score: {:.4f}".format(model_train_adj_r2))
    
    
    
    print("-------------------------------------------------------")
    
    print("Model performance for Validation Set")
    print("- MAE: {:.4f}".format(model_valid_mae))
    print("- MSE: {:.4f}".format(model_valid_mse))
    print("- RMSE: {:.4f}".format(model_valid_rmse))
    print("- R2 Score: {:.4f}".format(model_valid_r2))
    print("- Adjusted R2 Score: {:.4f}".format(model_valid_adj_r2))
    
    print("="*35)
    print("\n")
    
    
    

Linear Regression
Model performance for Training Set
- MAE: 322.5278
- MSE: 224520.6441
- RMSE: 473.8361
- R2 Score: 0.9596
- Adjusted R2 Score: 0.9596
-------------------------------------------------------
Model performance for Validation Set
- MAE: 322.9155
- MSE: 226071.1442
- RMSE: 475.4694
- R2 Score: 0.9593
- Adjusted R2 Score: 0.9593


Bayesian Regression
Model performance for Training Set
- MAE: 322.5310
- MSE: 224520.8693
- RMSE: 473.8363
- R2 Score: 0.9596
- Adjusted R2 Score: 0.9596
-------------------------------------------------------
Model performance for Validation Set
- MAE: 322.9184
- MSE: 226069.8427
- RMSE: 475.4680
- R2 Score: 0.9593
- Adjusted R2 Score: 0.9593


Lasso Regression
Model performance for Training Set
- MAE: 330.8912
- MSE: 234214.8020
- RMSE: 483.9574
- R2 Score: 0.9578
- Adjusted R2 Score: 0.9578
-------------------------------------------------------
Model performance for Validation Set
- MAE: 331.2887
- MSE: 235835.1262
- RMSE: 485.6286
- R2 Score

## Hyper-parameter Tuning XGB

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
import time

# A parameter grid for XGBoost
params = {
    'n_estimators':[500],
    'min_child_weight':[4,5], 
    'gamma':[i/10.0 for i in range(3,6)],  
    'subsample':[i/10.0 for i in range(6,11)],
    'colsample_bytree':[i/10.0 for i in range(6,11)], 
    'max_depth': [2,3,4,6,7],
    'objective': ['reg:squarederror', 'reg:tweedie'],
    'booster': ['gbtree', 'gblinear'],
    'eval_metric': ['rmse'],
    'eta': [i/10.0 for i in range(3,6)],
}

reg = XGBRegressor(nthread=-1)

# run randomized search
n_iter_search = 100
random_search = RandomizedSearchCV(reg, param_distributions=params,
                                   n_iter=n_iter_search, cv=5, scoring='neg_mean_squared_error')

start = time.time()
random_search.fit(X_trainsc, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time.time() - start), n_iter_search))
print("-"*35)
print(random_search.best_params)

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample" } are not used.

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample" } are not used.

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample" } are not used.

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample" } are not used.

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample" } are not used.



In [None]:
best_regressor = random_search.best_estimator_

In [None]:
y_pred = best_regressor.predict(X_validsc)
print("Testing Score: {} ".format(best_regressor.score(X_validsc,y_valid)))

# Bayesian Optimization

In [21]:
# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import r2_score

In [22]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0
    }

In [23]:
def objective(space):
    model=XGBRegressor(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_trainsc, y_train), ( X_validsc, y_valid)]
    
    model.fit(X_trainsc, y_train,
            eval_set=evaluation, eval_metric="rmse",
            early_stopping_rounds=10,verbose=False)
    

    y_valid_pred=model.predict(X_validsc)
    r2=r2_score(y_valid,y_valid_pred)
    print ("SCORE:", r2)
    return {'loss': -r2, 'status': STATUS_OK }

In [24]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

SCORE:                                                                                                                 
0.9298066367511021                                                                                                     
SCORE:                                                                                                                 
0.9298474449072018                                                                                                     
SCORE:                                                                                                                 
0.9298345256621378                                                                                                     
SCORE:                                                                                                                 
0.9298332803435585                                                                                                     
SCORE:                                  

SCORE:                                                                                                                 
0.9332180805075732                                                                                                     
SCORE:                                                                                                                 
0.9296793887519369                                                                                                     
SCORE:                                                                                                                 
0.9312653810228232                                                                                                     
SCORE:                                                                                                                 
0.9298063770876462                                                                                                     
SCORE:                                  

SCORE:                                                                                                                 
0.9312918167160158                                                                                                     
SCORE:                                                                                                                 
0.9298531700009636                                                                                                     
SCORE:                                                                                                                 
0.9309750432894403                                                                                                     
SCORE:                                                                                                                 
0.9296796288111849                                                                                                     
SCORE:                                  

In [25]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

The best hyperparameters are :  

{'colsample_bytree': 0.7526972112080851, 'gamma': 4.534510072655023, 'max_depth': 5.0, 'min_child_weight': 3.0, 'reg_alpha': 166.0, 'reg_lambda': 0.403240998637619}
