Forest Fire

In [516]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import plotly.express as px
import plotly.figure_factory as ff
from IPython.display import Image
from xgboost import XGBRegressor


In [517]:
df = pd.read_csv('forestfires.csv')
df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [518]:
fig = px.histogram(df, x='month', y='area', title='Burned Area by Month',
              labels={'area': 'Burned Area (ha)'})
fig.show()

In [519]:
fig = px.histogram(df, x='day', y='area', title='Burned Area by Day of the Week',
              labels={'area': 'Burned Area (ha)'})
fig.show()

In [520]:
fig = px.scatter(df, x='X', y='Y', size='area', color='area', title='Spatial Distribution of Burned Area',
                  labels={'X': 'X Coordinate', 'Y': 'Y Coordinate', 'area': 'Burned Area (ha)'}, size_max=15)
fig.show()


In [521]:
df = pd.get_dummies(df, columns=['month', 'day'])
df

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,month_nov,month_oct,month_sep,day_fri,day_mon,day_sat,day_sun,day_thu,day_tue,day_wed
0,7,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,...,False,False,False,True,False,False,False,False,False,False
1,7,4,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,...,False,True,False,False,False,False,False,False,True,False
2,7,4,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,...,False,True,False,False,False,True,False,False,False,False
3,8,6,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,...,False,False,False,True,False,False,False,False,False,False
4,8,6,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,...,False,False,False,False,False,False,True,False,False,False
513,2,4,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,...,False,False,False,False,False,False,True,False,False,False
514,7,4,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,...,False,False,False,False,False,False,True,False,False,False
515,1,4,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,...,False,False,False,False,False,True,False,False,False,False


In [522]:
#df['area'] = np.log(df['area'] + 1)

In [523]:
y = df['area']
X = df.drop('area', axis=1)

In [524]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (413, 29)
X_test shape: (104, 29)
y_train shape: (413,)
y_test shape: (104,)


In [525]:
pipe = make_pipeline(StandardScaler(), SGDRegressor(random_state=62)).fit(X_train, y_train)
mse_scores = cross_val_score(pipe, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-mse_scores)
print("Default SGD RMSE (per fold):", rmse_scores)
print("Default SGD Average RMSE:", rmse_scores.mean())

y_pred = pipe.predict(X_test)
test_rmse = root_mean_squared_error(y_test, y_pred)
print(f"\nDefault SGD Test Set RMSE: {test_rmse}")
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)
print(f"Default SGD Test Set R²: {r2}")

Default SGD RMSE (per fold): [34.25890009 39.12824718 85.94383867 27.06885441 31.20437662]
Default SGD Average RMSE: 43.52084339325296

Default SGD Test Set RMSE: 106.04389321637258
Default SGD Test Set R²: 0.018795495178114074
