In [1]:
from dataclasses import replace

import pandas as pd
import seaborn as sns
import numpy as np


In [2]:
df = pd.read_csv('Algerian_forest_fires_dataset.csv')

In [3]:
df.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire


In [4]:
df.isnull().sum()

day            1
month          2
year           2
Temperature    2
 RH            2
 Ws            2
Rain           2
FFMC           2
DMC            2
DC             2
ISI            2
BUI            2
FWI            2
Classes        3
dtype: int64

In [5]:
df.dropna(axis = 0, inplace = True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 244 entries, 0 to 246
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   day          244 non-null    object
 1   month        244 non-null    object
 2   year         244 non-null    object
 3   Temperature  244 non-null    object
 4    RH          244 non-null    object
 5    Ws          244 non-null    object
 6   Rain         244 non-null    object
 7   FFMC         244 non-null    object
 8   DMC          244 non-null    object
 9   DC           244 non-null    object
 10  ISI          244 non-null    object
 11  BUI          244 non-null    object
 12  FWI          244 non-null    object
 13  Classes      244 non-null    object
dtypes: object(14)
memory usage: 28.6+ KB


In [7]:
df.columns

Index(['day', 'month', 'year', 'Temperature', ' RH', ' Ws', 'Rain ', 'FFMC',
       'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes  '],
      dtype='object')

In [8]:
df.shape

(244, 14)

In [9]:
# Remove blank spaces from column headers
df.columns = df.columns.str.strip()  # Removes leading/trailing spaces

In [10]:
df.columns

Index(['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 'Rain', 'FFMC',
       'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes'],
      dtype='object')

In [11]:
#errors='coerce': Converts invalid values (e.g., strings like "abc") to NaN.
df['day'] = pd.to_numeric(df['day'], errors='coerce').astype('Int64')
df['month'] = pd.to_numeric(df['month'], errors='coerce').astype('Int64')
df['year'] = pd.to_numeric(df['year'], errors='coerce').astype('Int64')
df['Temperature'] = pd.to_numeric(df['Temperature'], errors='coerce').astype('Int64')
df['RH'] = pd.to_numeric(df['RH'], errors='coerce').astype('Int64')
df['Ws'] = pd.to_numeric(df['Ws'], errors='coerce').astype('Int64')



In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 244 entries, 0 to 246
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   day          243 non-null    Int64 
 1   month        243 non-null    Int64 
 2   year         243 non-null    Int64 
 3   Temperature  243 non-null    Int64 
 4   RH           243 non-null    Int64 
 5   Ws           243 non-null    Int64 
 6   Rain         244 non-null    object
 7   FFMC         244 non-null    object
 8   DMC          244 non-null    object
 9   DC           244 non-null    object
 10  ISI          244 non-null    object
 11  BUI          244 non-null    object
 12  FWI          244 non-null    object
 13  Classes      244 non-null    object
dtypes: Int64(6), object(8)
memory usage: 30.0+ KB


In [13]:
df['Rain'] = pd.to_numeric(df['day'], errors='coerce').astype('float')
df['FFMC'] = pd.to_numeric(df['month'], errors='coerce').astype('float')
df['DMC'] = pd.to_numeric(df['year'], errors='coerce').astype('float')
df['DC'] = pd.to_numeric(df['Temperature'], errors='coerce').astype('float')
df['ISI'] = pd.to_numeric(df['RH'], errors='coerce').astype('float')
df['BUI'] = pd.to_numeric(df['Ws'], errors='coerce').astype('float')
df['FWI'] = pd.to_numeric(df['day'], errors='coerce').astype('float')


In [14]:
df.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,1,6,2012,29,57,18,1.0,6.0,2012.0,29.0,57.0,18.0,1.0,not fire
1,2,6,2012,29,61,13,2.0,6.0,2012.0,29.0,61.0,13.0,2.0,not fire
2,3,6,2012,26,82,22,3.0,6.0,2012.0,26.0,82.0,22.0,3.0,not fire
3,4,6,2012,25,89,13,4.0,6.0,2012.0,25.0,89.0,13.0,4.0,not fire
4,5,6,2012,27,77,16,5.0,6.0,2012.0,27.0,77.0,16.0,5.0,not fire


In [15]:
df['Classes'].unique()

array(['not fire   ', 'fire   ', 'fire', 'fire ', 'not fire', 'not fire ',
       'Classes  ', 'not fire     ', 'not fire    '], dtype=object)

In [16]:
# Convert 'Classes' to string to ensure consistency and handle non-string values
df['Classes'] = df['Classes'].astype(str)

# Remove leading/trailing spaces and internal spaces
df['Classes'] = df['Classes'].str.strip().str.replace(' ', '')

df['Classes'] = df['Classes'].map({'fire': 1, 'notfire': 0})

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 244 entries, 0 to 246
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   day          243 non-null    Int64  
 1   month        243 non-null    Int64  
 2   year         243 non-null    Int64  
 3   Temperature  243 non-null    Int64  
 4   RH           243 non-null    Int64  
 5   Ws           243 non-null    Int64  
 6   Rain         243 non-null    float64
 7   FFMC         243 non-null    float64
 8   DMC          243 non-null    float64
 9   DC           243 non-null    float64
 10  ISI          243 non-null    float64
 11  BUI          243 non-null    float64
 12  FWI          243 non-null    float64
 13  Classes      243 non-null    float64
dtypes: Int64(6), float64(8)
memory usage: 30.0 KB


In [18]:
df['Classes'] = df['Classes'].astype('Int64')

In [19]:
df.tail()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
242,26,9,2012,30,65,14,26.0,9.0,2012.0,30.0,65.0,14.0,26.0,1
243,27,9,2012,28,87,15,27.0,9.0,2012.0,28.0,87.0,15.0,27.0,0
244,28,9,2012,27,87,29,28.0,9.0,2012.0,27.0,87.0,29.0,28.0,0
245,29,9,2012,24,54,18,29.0,9.0,2012.0,24.0,54.0,18.0,29.0,0
246,30,9,2012,24,64,15,30.0,9.0,2012.0,24.0,64.0,15.0,30.0,0


In [20]:
df.iloc[122]


day            <NA>
month          <NA>
year           <NA>
Temperature    <NA>
RH             <NA>
Ws             <NA>
Rain           <NA>
FFMC           <NA>
DMC            <NA>
DC             <NA>
ISI            <NA>
BUI            <NA>
FWI            <NA>
Classes        <NA>
Name: 124, dtype: Float64

In [21]:
# Remove rows after index 122 (keep rows 0 through 122)
df = df.loc[:122]

In [22]:
# Define columns to drop (case-insensitive)
cols_to_drop = [col for col in df.columns if col.lower() in ['day', 'month', 'year']]

# Drop the columns if they exist
if cols_to_drop:
    df.drop(cols_to_drop, axis=1, inplace=True)

In [23]:
# Print original columns for verification
print("Original columns:", df.columns.tolist())
print("Original shape:", df.shape)

Original columns: ['Temperature', 'RH', 'Ws', 'Rain', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes']
Original shape: (122, 11)


In [24]:
# Save the updated dataset
df.to_csv('updated_ff_dataset.csv', index=False)

In [25]:
df

Unnamed: 0,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,29,57,18,1.0,6.0,2012.0,29.0,57.0,18.0,1.0,0
1,29,61,13,2.0,6.0,2012.0,29.0,61.0,13.0,2.0,0
2,26,82,22,3.0,6.0,2012.0,26.0,82.0,22.0,3.0,0
3,25,89,13,4.0,6.0,2012.0,25.0,89.0,13.0,4.0,0
4,27,77,16,5.0,6.0,2012.0,27.0,77.0,16.0,5.0,0
...,...,...,...,...,...,...,...,...,...,...,...
117,31,54,11,26.0,9.0,2012.0,31.0,54.0,11.0,26.0,0
118,31,66,11,27.0,9.0,2012.0,31.0,66.0,11.0,27.0,1
119,32,47,14,28.0,9.0,2012.0,32.0,47.0,14.0,28.0,0
120,26,80,16,29.0,9.0,2012.0,26.0,80.0,16.0,29.0,0
