In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import pickle

In [2]:
# Load the cleaned Algerian forest fires dataset and preview its first rows.
data = pd.read_csv(filepath_or_buffer="Algerian_forest_fires_cleaned.csv")
data.head(n=5)

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,0


In [3]:
# Data-quality check: for each column, report whether it holds any missing value.
# (Nothing is modified here; this cell only inspects the frame.)
missing_mask = data.isna()
missing_mask.any(axis=0)

day            False
month          False
year           False
Temperature    False
RH             False
Ws             False
Rain           False
FFMC           False
DMC            False
DC             False
ISI            False
BUI            False
FWI            False
Classes        False
Region         False
dtype: bool

In [4]:
# Build the working frame without the calendar columns (day, month, year),
# which are not part of the feature set used below.
currentData = data.drop(columns=['day', 'month', 'year'])
currentData.head(n=5)

Unnamed: 0,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
0,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,0
1,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,0
2,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,0
3,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,0
4,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,0


In [5]:
# Encode the class label as a binary flag: 0 when the label contains
# "not fire", 1 otherwise.
# The flag is derived from the untouched `data` frame rather than from
# `currentData['Classes']` itself, so this cell can be re-run safely: once the
# column has been overwritten with integers, calling `.str` on it would raise
# an AttributeError.
is_not_fire = data['Classes'].str.contains('not fire')
currentData['Classes'] = np.where(is_not_fire, 0, 1)
currentData.head()

Unnamed: 0,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
0,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,0,0
1,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,0,0
2,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,0,0
3,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,0,0
4,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,0,0


In [6]:
# Define the independent variables (features) and the dependent variable (target).
feature_columns = [
    'Temperature', 'RH', 'Ws', 'Rain',
    'FFMC', 'DMC', 'DC', 'ISI', 'BUI',
    'Classes', 'Region',
]
X = currentData[feature_columns]
y = currentData['FWI']

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Instantiate a linear regression estimator and fit it on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [9]:
# Predict the target for the held-out test rows.
y_pred = model.predict(X=X_test)

In [10]:
# Score the test-set predictions against the true target values.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [11]:
# Report the test-set evaluation metrics, one per line.
print(
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R2): {r2}",
    sep="\n",
)

Mean Squared Error (MSE): 0.33231115384863497
R-squared (R2): 0.9890356825759858


In [12]:
# Show the fitted parameters: the per-feature coefficients and the intercept.
coefficients, intercept = model.coef_, model.intercept_
print("Cofficients:", coefficients)
print("Intercept:", intercept)

Cofficients: [-0.0187748  -0.01376316 -0.03263954  0.00436021 -0.06238805  0.06945259
 -0.00416843  1.17409784  0.20492049  0.59372295 -0.35898342]
Intercept: 3.897409065911618


In [13]:
# Serialize the trained model to a pickle file so it can be reloaded later.
with open('linear_regression_model.pkl', mode='wb') as file:
    file.write(pickle.dumps(model))

In [14]:
# unseen data
x_new=pd.DataFrame({
 'Temperature': [24, 26, 28, 21, 26],
    'RH': [67, 78, 89, 69, 78],
    'Ws': [18, 23, 17, 19, 20],
    'Rain': [1, 3, 14, 14, 6],
    'FFMC': [65.2,45.4,23.4,76.3,65.2],
    'DMC': [4.5,6.5,7.5,8.5,11.2],
    'DC': [7.6,5.4,3.2,8.2,9.2],
    'ISI': [0.5,0.6,0.8,0.9,1.1],
    'BUI':[3.4,6.5,4.3,6.4,7.1],
    'Classes':[0,1,0,0,1],	
    'Region':[0,1,1,1,0]
})

y_new = pd.DataFrame({'FWI': [0.1,0.2,0.3,0.5,0.3]})

In [15]:
# Reload the previously saved model from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source (here, the file written earlier in this notebook).
with open('linear_regression_model.pkl', mode='rb') as model_file:
    loaded_model = pickle.loads(model_file.read())

In [16]:
# Keep the reference values under the name used by the metric calls, then
# run the reloaded model on the hand-entered rows.
y_true = y_new
predictions = loaded_model.predict(X=x_new)

In [17]:
# Display the predicted values for the hand-entered rows.
print(str(predictions))

[-0.56153478  1.46580543  2.16209232 -0.2004744   1.72109313]


In [18]:
# Evaluate the reloaded model on the hand-entered rows.
# These metrics use their own `_new` names so they do not overwrite the
# test-set `mse` and `r2` computed earlier; otherwise re-running the earlier
# test-set report cell after this one would print the wrong numbers.
mae_new = mean_absolute_error(y_true, predictions)
mse_new = mean_squared_error(y_true, predictions)
rmse_new = np.sqrt(mse_new)  # RMSE is the square root of the MSE
r2_new = r2_score(y_true, predictions)

print(f"Mean Absolute Error: {mae_new}")
print(f"Mean Squared Error: {mse_new}")
print(f"Root Mean Squared Error: {rmse_new}")
print(f"R-squared (R2) Score: {r2_new}")

Mean Absolute Error: 1.1822000118178035
Mean Squared Error: 1.603489903471501
Root Mean Squared Error: 1.2662898181188622
R-squared (R2) Score: -90.10738087906256


Summary:

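The multiple linear regression model fit the held-out test split of the Algerian forest fire data closely (R² ≈ 0.989, MSE ≈ 0.33), and its largest coefficients by magnitude belong to ISI, Classes, and Region, pointing to these as important influencing factors for forest fire intensity (FWI). On the five hand-entered samples, however, it performed poorly (MAE ≈ 1.18, RMSE ≈ 1.27, R² ≈ -90.1) and produced some negative FWI predictions, so the test-set score should not be read as a guarantee of accuracy on new inputs. While the model offers interpretability and simplicity, it assumes linear relationships and is sensitive to outliers. To improve accuracy, consider advanced regression techniques, data quality enhancements, expert input, and continuous model monitoring.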