In [80]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('Algerian_forest_fires_dataset.csv')

# Inspect basic information
print(df.shape)
print(df.head())
print(df.info())
print(df.isnull().sum())


(247, 14)
  day month  year Temperature  RH  Ws Rain   FFMC  DMC    DC  ISI  BUI  FWI  \
0   1     6  2012          29  57  18     0  65.7  3.4   7.6  1.3  3.4  0.5   
1   2     6  2012          29  61  13   1.3  64.4  4.1   7.6    1  3.9  0.4   
2   3     6  2012          26  82  22  13.1  47.1  2.5   7.1  0.3  2.7  0.1   
3   4     6  2012          25  89  13   2.5  28.6  1.3   6.9    0  1.7    0   
4   5     6  2012          27  77  16     0  64.8    3  14.2  1.2  3.9  0.5   

     Classes    
0  not fire     
1  not fire     
2  not fire     
3  not fire     
4  not fire     
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247 entries, 0 to 246
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   day          246 non-null    object
 1   month        245 non-null    object
 2   year         245 non-null    object
 3   Temperature  245 non-null    object
 4    RH          245 non-null    object
 5    Ws          245

In [81]:
# Identify rows that contain region headers or separators
# Row 122 is empty, Row 123 contains "Sidi-Bel Abbes Region Dataset"
# Row 124 contains duplicate header

# Remove these separator rows
df = df.drop([122, 123, 124])
df = df.reset_index(drop=True)


In [82]:
# Create Region column based on original index position
# Rows 0-121 (122 rows) = Bejaia Region
# Rows 125-246 (122 rows) = Sidi-Bel Abbes Region

df['Region'] = np.where(df.index < 122, 'Bejaia', 'Sidi-Bel Abbes')


In [83]:
# Remove leading/trailing spaces from column names
df.columns = df.columns.str.strip()

# Standardize column names
df = df.rename(columns={
    'Classes': 'Classes',
    'Rain': 'Rain',
    'RH': 'RH',
    'Ws': 'Ws'
})


In [84]:
# Convert numeric columns to appropriate data types
numeric_columns = ['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 
                   'Rain', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI']

for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Keep Classes as object/category
df['Classes'] = df['Classes'].astype('category')


In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   day          244 non-null    int64   
 1   month        244 non-null    int64   
 2   year         244 non-null    int64   
 3   Temperature  244 non-null    int64   
 4   RH           244 non-null    int64   
 5   Ws           244 non-null    int64   
 6   Rain         244 non-null    float64 
 7   FFMC         244 non-null    float64 
 8   DMC          244 non-null    float64 
 9   DC           243 non-null    float64 
 10  ISI          244 non-null    float64 
 11  BUI          244 non-null    float64 
 12  FWI          243 non-null    float64 
 13  Classes      243 non-null    category
 14  Region       244 non-null    object  
dtypes: category(1), float64(7), int64(6), object(1)
memory usage: 27.4+ KB


In [86]:
print(df.isnull().sum())

day            0
month          0
year           0
Temperature    0
RH             0
Ws             0
Rain           0
FFMC           0
DMC            0
DC             1
ISI            0
BUI            0
FWI            1
Classes        1
Region         0
dtype: int64


In [87]:
df = df.dropna()


In [88]:
print(df.isnull().sum())

day            0
month          0
year           0
Temperature    0
RH             0
Ws             0
Rain           0
FFMC           0
DMC            0
DC             0
ISI            0
BUI            0
FWI            0
Classes        0
Region         0
dtype: int64


In [89]:
df['Classes'] = df['Classes'].str.strip()

In [90]:
print(df['Classes'].value_counts())

Classes
fire        137
not fire    106
Name: count, dtype: int64


In [91]:
df['Classes_Binary'] = df['Classes'].map({'fire': 1, 'not fire': 0})

In [92]:
df

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region,Classes_Binary
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,Bejaia,0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,Bejaia,0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,Bejaia,0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,Bejaia,0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,Bejaia,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,26,9,2012,30,65,14,0.0,85.4,16.0,44.5,4.5,16.9,6.5,fire,Sidi-Bel Abbes,1
240,27,9,2012,28,87,15,4.4,41.1,6.5,8.0,0.1,6.2,0.0,not fire,Sidi-Bel Abbes,0
241,28,9,2012,27,87,29,0.5,45.9,3.5,7.9,0.4,3.4,0.2,not fire,Sidi-Bel Abbes,0
242,29,9,2012,24,54,18,0.1,79.7,4.3,15.2,1.7,5.1,0.7,not fire,Sidi-Bel Abbes,0


In [93]:
df.isnull().sum()

day               0
month             0
year              0
Temperature       0
RH                0
Ws                0
Rain              0
FFMC              0
DMC               0
DC                0
ISI               0
BUI               0
FWI               0
Classes           0
Region            0
Classes_Binary    0
dtype: int64

In [95]:
df.to_csv("cleaned_data.csv",index=False)

#EDA

In [96]:
df["Classes"].unique()

array(['not fire', 'fire'], dtype=object)

In [98]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 243 entries, 0 to 243
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   day             243 non-null    int64  
 1   month           243 non-null    int64  
 2   year            243 non-null    int64  
 3   Temperature     243 non-null    int64  
 4   RH              243 non-null    int64  
 5   Ws              243 non-null    int64  
 6   Rain            243 non-null    float64
 7   FFMC            243 non-null    float64
 8   DMC             243 non-null    float64
 9   DC              243 non-null    float64
 10  ISI             243 non-null    float64
 11  BUI             243 non-null    float64
 12  FWI             243 non-null    float64
 13  Classes         243 non-null    object 
 14  Region          243 non-null    object 
 15  Classes_Binary  243 non-null    int64  
dtypes: float64(7), int64(7), object(2)
memory usage: 32.3+ KB


In [99]:
df.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region,Classes_Binary
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,Bejaia,0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,Bejaia,0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,Bejaia,0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,Bejaia,0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,Bejaia,0


In [100]:
df["Region"] = df["Region"].map({
    "Bejaia": 0,
    "Sidi-Bel Abbes": 1
})

In [101]:
df

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region,Classes_Binary
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,0,0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,0,0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,0,0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,0,0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,26,9,2012,30,65,14,0.0,85.4,16.0,44.5,4.5,16.9,6.5,fire,1,1
240,27,9,2012,28,87,15,4.4,41.1,6.5,8.0,0.1,6.2,0.0,not fire,1,0
241,28,9,2012,27,87,29,0.5,45.9,3.5,7.9,0.4,3.4,0.2,not fire,1,0
242,29,9,2012,24,54,18,0.1,79.7,4.3,15.2,1.7,5.1,0.7,not fire,1,0


In [None]:
pd.to_