# Importing the necessary libraries

In [1]:
# Importing the necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Mounting Google drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Reading the dataset

df_unclean = pd.read_csv('/content/drive/MyDrive/Capstone/forestfires.csv')
df_unclean.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


# Getting a description of the dataset

In [4]:
# Getting a description of the dataset

df_unclean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       517 non-null    int64  
 1   Y       517 non-null    int64  
 2   month   517 non-null    object 
 3   day     517 non-null    object 
 4   FFMC    517 non-null    float64
 5   DMC     517 non-null    float64
 6   DC      517 non-null    float64
 7   ISI     517 non-null    float64
 8   temp    517 non-null    float64
 9   RH      517 non-null    int64  
 10  wind    517 non-null    float64
 11  rain    517 non-null    float64
 12  area    517 non-null    float64
dtypes: float64(8), int64(3), object(2)
memory usage: 52.6+ KB


In [5]:
# Checking the shape of the dataset

print("The shape of the dataset is:", df_unclean.shape)

The shape of the dataset is: (517, 13)


# Pre-processing the data

In [6]:
# Checking for NA values

print("The number of NA values in the dataset is:", df_unclean.isna().sum().sum())

The number of NA values in the dataset is: 0


In [7]:
# Checking for skewness in the predictor variable

df_unclean.skew(axis = 0)

  df_unclean.skew(axis = 0)


X        0.036246
Y        0.417296
FFMC    -6.575606
DMC      0.547498
DC      -1.100445
ISI      2.536325
temp    -0.331172
RH       0.862904
wind     0.571001
rain    19.816344
area    12.846934
dtype: float64

In [8]:
# Correcting the skewness in the response variable

df = df_unclean.copy()
df['area'] = np.log(df['area'] + 1)
print("Previous skew of area:", df_unclean['area'].skew())
print("Current skew of area:", df['area'].skew())

Previous skew of area: 12.846933533934868
Current skew of area: 1.2178376559535011


In [9]:
# Exploring the dataset


In [10]:
# Normalizing the FWI and other climatic factors

normalizer = MinMaxScaler()
def normalize(feature):
  transform = np.array(df[feature]).reshape(-1, 1)
  df[feature] = normalizer.fit_transform(transform)

feature_list = ['FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain']
for each in feature_list:
  normalize(each)

In [11]:
# Converting categorical values to numerical values

df['month'] = df['month'].replace(['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
df['day'] = df['day'].replace(['sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat'], [1, 2, 3, 4, 5, 6, 7])
print("The dataset after cleaning:") 
df.head()

The dataset after cleaning:


Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,3,6,0.870968,0.086492,0.101325,0.090909,0.192926,0.423529,0.7,0.0,0.0
1,7,4,10,3,0.927742,0.118194,0.775419,0.11943,0.508039,0.211765,0.055556,0.0,0.0
2,7,4,10,7,0.927742,0.146795,0.796294,0.11943,0.398714,0.211765,0.1,0.0,0.0
3,8,6,3,6,0.941935,0.110958,0.081623,0.160428,0.196141,0.964706,0.4,0.03125,0.0
4,8,6,3,1,0.910968,0.172984,0.11059,0.171123,0.29582,0.988235,0.155556,0.0,0.0


# Modeling using Random Forest Regressor

In [12]:
# Splitting the dataset into train and test

X = df.drop(columns = ['area'])
y = df['area']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [13]:
# Printing the shapes of the train and test datasets

print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

X_train: (361, 12)
y_train: (361,)
X_test: (156, 12)
y_test: (156,)


In [15]:
# Implementing Random Forest Regressor

rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)

RandomForestRegressor()

In [16]:
# Code used for evaluation later

def evaluate_model(model, X_train, y_train, X_test, y_test):
  train_preds = model.predict(X_train)
  test_preds = model.predict(X_test)
  train_rmse = math.sqrt(mean_squared_error(y_train, train_preds))
  train_r2 = r2_score(y_train, train_preds)
  test_rmse = math.sqrt(mean_squared_error(y_test, test_preds))
  test_r2 = r2_score(y_test, test_preds)
  print("The train rmse using linear regression:", round(train_rmse, 3))
  print("The train r2 score using linear regression:", round(train_r2, 3))
  print("The test rmse using linear regression:", round(test_rmse, 3))
  print("The test r2 score using linear regression:", round(test_r2, 3))

In [17]:
# Evaluating the model

evaluate_model(rfr, X_train, y_train, X_test, y_test)

The train rmse using linear regression: 0.575
The train r2 score using linear regression: 0.831
The test rmse using linear regression: 1.454
The test r2 score using linear regression: -0.089


# Hyper-parameter tuning

We perform parameter tuning to improve the model as the train data fits better than our baseline models, but there is a chance of overfitting.

Using RondomizedSearchCV

In [18]:
# Getting the current parameters

rfr.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [19]:
# Implementing hyper-parameter tuning using RandomSearchCV

bootstrap = [True, False]
max_depth = [int(x) for x in np.linspace(start = 1, stop = 100, num = 10)] 
max_depth.append(None)
max_features = ['auto', 'sqrt']
max_leaf_nodes = [2, 3, 4, None]
min_samples_leaf = [1, 2, 3]
min_samples_split = [2, 4, 8]
n_estimators = [int(x) for x in np.linspace(start = 1, stop = 100, num = 10)]
grid_search = {'bootstrap': bootstrap,
               'max_depth': max_depth,
               'max_features': max_features,
               'max_leaf_nodes': max_leaf_nodes,
               'min_samples_leaf': min_samples_leaf,
               'min_samples_split': min_samples_split,
               'n_estimators': n_estimators}

In [20]:
rfr_tuning = RandomForestRegressor(random_state = 42)
rfr_random_search = RandomizedSearchCV(estimator = rfr_tuning, param_distributions = grid_search, n_iter = 5, scoring = 'r2', cv = 10, verbose = 2, random_state = 42, return_train_score = True)
rfr_random_search.fit(X_train, y_train)

Fitting 10 folds for each of 5 candidates, totalling 50 fits
[CV] END bootstrap=True, max_depth=None, max_features=auto, max_leaf_nodes=2, min_samples_leaf=3, min_samples_split=4, n_estimators=1; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, max_features=auto, max_leaf_nodes=2, min_samples_leaf=3, min_samples_split=4, n_estimators=1; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, max_features=auto, max_leaf_nodes=2, min_samples_leaf=3, min_samples_split=4, n_estimators=1; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, max_features=auto, max_leaf_nodes=2, min_samples_leaf=3, min_samples_split=4, n_estimators=1; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, max_features=auto, max_leaf_nodes=2, min_samples_leaf=3, min_samples_split=4, n_estimators=1; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, max_features=auto, max_leaf_nodes=2, min_samples_leaf=3, min_samples_split=4, n_estimators=1; total time=   0.0s
[CV] END bootst

RandomizedSearchCV(cv=10, estimator=RandomForestRegressor(random_state=42),
                   n_iter=5,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [1, 12, 23, 34, 45, 56, 67,
                                                      78, 89, 100, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'max_leaf_nodes': [2, 3, 4, None],
                                        'min_samples_leaf': [1, 2, 3],
                                        'min_samples_split': [2, 4, 8],
                                        'n_estimators': [1, 12, 23, 34, 45, 56,
                                                         67, 78, 89, 100]},
                   random_state=42, return_train_score=True, scoring='r2',
                   verbose=2)

In [21]:
# Getting the best parameters

rfr_random_search.best_params_

{'n_estimators': 89,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_leaf_nodes': 3,
 'max_features': 'sqrt',
 'max_depth': 78,
 'bootstrap': False}

In [22]:
# Evaluating the model

model = rfr_random_search.best_estimator_
model.fit(X_train, y_train)
evaluate_model(model, X_train, y_train, X_test, y_test)

The train rmse using linear regression: 1.362
The train r2 score using linear regression: 0.052
The test rmse using linear regression: 1.39
The test r2 score using linear regression: 0.005


# Using GridSearchCV based on the results of random search

In [23]:
# Implementing Hyper-parameter tuning using GridSearchCV

n_estimators = list(range(80, 90))
min_samples_split = [2, 3]
max_samples_leaf = [1, 2]
max_leaf_nodes = [2, 3, None]
max_depth = list(range(70, 80))
max_depth.append(None)
parameters = {'n_estimators': n_estimators,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': max_samples_leaf,
              'max_leaf_nodes': max_leaf_nodes,
              'max_depth': max_depth,
              }
rfr_grid_search = GridSearchCV(estimator = rfr_tuning, param_grid = parameters, scoring = 'r2', cv = 10, verbose = 2, return_train_score = True)
rfr_grid_search.fit(X_train, y_train)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[CV] END max_depth=75, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=3, n_estimators=86; total time=   0.2s
[CV] END max_depth=75, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=3, n_estimators=86; total time=   0.1s
[CV] END max_depth=75, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=3, n_estimators=86; total time=   0.1s
[CV] END max_depth=75, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=3, n_estimators=86; total time=   0.1s
[CV] END max_depth=75, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=3, n_estimators=87; total time=   0.2s
[CV] END max_depth=75, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=3, n_estimators=87; total time=   0.1s
[CV] END max_depth=75, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=3, n_estimators=87; total time=   0.2s
[CV] END max_depth=75, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=3, n_e

KeyboardInterrupt: ignored

In [None]:
# Getting the best parameters

rfr_grid_search.best_params_

{'max_depth': 70,
 'max_leaf_nodes': 2,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 82}

In [None]:
# Evaluating the model

model = rfr_grid_search.best_estimator_
model.fit(X_train, y_train)
evaluate_model(model, X_train, y_train, X_test, y_test)

The train rmse using linear regression: 1.352
The train r2 score using linear regression: 0.032
The test rmse using linear regression: 1.466
The test r2 score using linear regression: 0.022
