In [13]:
# In logistic Regression, Hyper-parameter tuning is applied while calculating the z-score
# 
#   - Here, we also have 3 kinds of regularization :-
#       - L1 Regularization (Lasso)             {Feature Selectiom}
#       - L2 Regularization (Ridge)             {Prevents Overfitting}
#       - L1 & L2 Regularization (ElasticNet)   {Balance between L1&L2}

In [14]:
# Python setting hyperparameters manually in LogisticRegression Class
# 
# LogisticRegression (
#       "penalty" : {'l1', 'l2', 'elasticnet'}
#       "tol" : tolerance (stopping critera for error)
#       "C" : Inverse of Regularization Strength(λ) (Smaller values imply stronger Regularization)
#       "class_weight" : 'balanced' or dict{column_name: weight}
#       "solver" : {‘lbfgs’, ‘liblinear’, ‘newton-cg’, ‘newton-cholesky’, ‘sag’, ‘saga’}
#       "max_iter" : maximum iterations allowed in gradient descent
#       "n_jobs" : maximum cores allowed to be used
# ) -> {
#       "coeff_" : coefficient array [β₀ β₁x₁ β₂x₂ ... βₙ]  {for each model in case of multi-class classification}
#       "intercept_" : constant term                        {for each model in case of multi-class classification}
# } 

In [15]:
import pandas as pd

df = pd.read_csv('./datasets/algerian-forest-fires-cleaned.csv')
df.drop(['FWI'], axis = 1, inplace = True)
df['Fire'] = df['Fire'].astype(int)
df.head()

Unnamed: 0,Temperature,RH,WS,Rain,FFMC,DMC,DC,ISI,BUI,Fire
0,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0
1,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0
2,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0
3,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0
4,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0


In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X = df.iloc[:, 0:-1]
Y = df.iloc[:, -1]

X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.25, random_state = 10)

scaler = StandardScaler()
X_Train = scaler.fit_transform(X_Train)
X_Test = scaler.transform(X_Test)

## Automatic Hyperparameter Tuning and CV

### 1. Using Grid Search CV

In [17]:
# Here the model is trained on all possible combinations of parameters specified and the best model trained for best parameter is returned
# 
# Syntax :-
# GridSearchCV(
#     "estimator" : base_regression_model_object
#     "param_grid" : json(parameter: list_of_values_to_search)
#     "scoring" : method used to evaluate model performance while seach 
#               : {'accuracy', 'precision', 'recall', 'f1'}
#     "refit" : scoring method used to evaluate the model if multiple "scoring" methods is passed
#     "cv" : no_of_folds (here, StratifiedKFold)
#     "n_jobs" : no of processors
#     "error_score" : score to set for a incompatible combination occured 
#                   : ("ignore"/"raise")
# ) -> {
#     "cv_results" : overall summary  of cv
#     "best_params_" : best parameters which got selected in "params_grid"
#     "best_score_" : best score for model obtained using performance metrics specified in "scoring"
#     "best_estimator_" : returns the best model trained
# }

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# ignore the warning for failed fits (invalid search param combinations)
import warnings
from sklearn.exceptions import FitFailedWarning
warnings.filterwarnings("ignore", category=FitFailedWarning)
warnings.filterwarnings("ignore", category = UserWarning)

# specify the base estimator
base_estimator = LogisticRegression(max_iter = 5000, n_jobs = -1)

# specify the range of parameters for tuning on the base_estimator
search_parameters_on_base_estimator = {
    "penalty": ['l1', 'l2', 'elasticnet'],
    "C" : [10, 5, 2, 1, 0.5, 0.1],
    "solver" : ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'],
    "l1_ratio": [0.1, 0.5, 0.75, 1, 2, 4, 10]
}

# make gridsearchcv object
gridsearch = GridSearchCV(
    estimator = base_estimator,
    param_grid = search_parameters_on_base_estimator,
    scoring = ['recall', 'accuracy', 'precision'],
    refit = 'accuracy',
    cv = 5,
    n_jobs = -1,
    error_score = 0
)

# train the model 
# (search for best model which fits the data by trying all possible combinations of parameters)
gridsearch.fit(X_Train, Y_Train)

print('Best Parameters Selected:\n', gridsearch.best_params_)
print()
print('Best Accuracy on Test Data:', gridsearch.score(X_Test, Y_Test))
print('Best Accuracy on Train Data:', gridsearch.score(X_Train, Y_Train))

Best Parameters Selected:
 {'C': 0.1, 'l1_ratio': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}

Best Accuracy on Test Data: 0.9344262295081968
Best Accuracy on Train Data: 0.989010989010989


In [19]:
# Prediction 
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
Y_Pred = gridsearch.predict(X_Test)

print('Confusion Matrix:\n', confusion_matrix(Y_Test, Y_Pred))
print()

# Prediction on Test Data
print('For Test Dataset')
print('Accuracy:', accuracy_score(Y_Test, Y_Pred))
print('Precision:', precision_score(Y_Test, Y_Pred))
print('Recall:', recall_score(Y_Test, Y_Pred))
print()

# Prediction on Train Data
Y_Pred = gridsearch.predict(X_Train)
print('For Training Dataset')
print('Accuracy:', accuracy_score(Y_Train, Y_Pred))
print('Precision:', precision_score(Y_Train, Y_Pred))
print('Recall:', recall_score(Y_Train, Y_Pred))
print()

Confusion Matrix:
 [[22  1]
 [ 3 35]]

For Test Dataset
Accuracy: 0.9344262295081968
Precision: 0.9722222222222222
Recall: 0.9210526315789473

For Training Dataset
Accuracy: 0.989010989010989
Precision: 0.98989898989899
Recall: 0.98989898989899



### 2. Using Randomized Search CV

In [20]:
# Here the model is trained on random combinations of parameters specified and the best model trained for best parameter is returned
# It is little faster than GridSearchCV
# Same Syntax as GridSearchCV
#       - One difference : use param_distributions instead of param_grid

import warnings
from sklearn.exceptions import FitFailedWarning
warnings.filterwarnings("ignore", category=FitFailedWarning)
warnings.filterwarnings("ignore", category = UserWarning)

# specify the base estimator
base_estimator = LogisticRegression(max_iter = 5000, n_jobs = -1)

# specify the range of parameters for tuning on the base_estimator
search_parameters_on_base_estimator = {
    "penalty": ['l1', 'l2', 'elasticnet'],
    "C" : [10, 5, 2, 1, 0.5, 0.1],
    "solver" : ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'],
    "l1_ratio": [0.1, 0.25, 0.5, 0.75, 0.9, 1]
}

# make randomizedsearchsv object
from sklearn.model_selection import RandomizedSearchCV
randomsearch = RandomizedSearchCV(
    estimator = base_estimator,
    param_distributions = search_parameters_on_base_estimator,
    scoring = ['recall', 'accuracy', 'precision'],
    refit = 'accuracy',
    cv = 5,
    n_jobs = -1,
    error_score = 0
)

# train the model 
# (search for best model which fits the data by trying all possible combinations of parameters)
randomsearch.fit(X_Train, Y_Train)

print('Best Parameters Selected:\n', randomsearch.best_params_)
print()
print('Best Accuracy on Test Data:', randomsearch.score(X_Test, Y_Test))
print('Best Accuracy on Train Data:', randomsearch.score(X_Train, Y_Train))

Best Parameters Selected:
 {'solver': 'lbfgs', 'penalty': 'l2', 'l1_ratio': 1, 'C': 10}

Best Accuracy on Test Data: 0.9344262295081968
Best Accuracy on Train Data: 0.989010989010989


In [21]:
# Prediction 
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
Y_Pred = randomsearch.predict(X_Test)

print('Confusion Matrix:\n', confusion_matrix(Y_Test, Y_Pred))
print()

# Prediction on Test Data
print('For Test Dataset')
print('Accuracy:', accuracy_score(Y_Test, Y_Pred))
print('Precision:', precision_score(Y_Test, Y_Pred))
print('Recall:', recall_score(Y_Test, Y_Pred))
print()

# Prediction on Train Data 
Y_Pred = randomsearch.predict(X_Train)
print('For Training Dataset')
print('Accuracy:', accuracy_score(Y_Train, Y_Pred))
print('Precision:', precision_score(Y_Train, Y_Pred))
print('Recall:', recall_score(Y_Train, Y_Pred))

Confusion Matrix:
 [[23  0]
 [ 4 34]]

For Test Dataset
Accuracy: 0.9344262295081968
Precision: 1.0
Recall: 0.8947368421052632

For Training Dataset
Accuracy: 0.989010989010989
Precision: 0.98989898989899
Recall: 0.98989898989899


In [22]:
# saving all the models

import pickle
pickle.dump(scaler, file = open('.\\models\\class-predictors\\scaler.pkl', 'wb'))
pickle.dump(gridsearch.best_estimator_, file = open('.\\models\\class-predictors\\gridsearch.pkl', 'wb'))
pickle.dump(randomsearch.best_estimator_, file = open('.\\models\\class-predictors\\randomsearch.pkl', 'wb'))