In [25]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [26]:
data = pd.read_csv('Datasets/Algerian_forest_fires_Source.csv', header=1)
data.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire


In [27]:
data.shape

(246, 14)

#### Observation: Since as per given Data Information 122 instances for each region are present but we got 246, so we need to delete another 2 records to get 244.

In [28]:
data['day'].unique()

array(['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
       '23', '24', '25', '26', '27', '28', '29', '30', '31',
       'Sidi-Bel Abbes Region Dataset', 'day'], dtype=object)

In [29]:
data[data['day']=='Sidi-Bel Abbes Region Dataset']

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
122,Sidi-Bel Abbes Region Dataset,,,,,,,,,,,,,


In [30]:
data[data['day']=='day']

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
123,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes


In [31]:
data = data.drop([122,123]).reset_index(drop=True)

#### Observation: Now we got 244 instances after deleting 2 unrelated rows.

In [32]:
data['day'].unique()

array(['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
       '23', '24', '25', '26', '27', '28', '29', '30', '31'], dtype=object)

#### Observation: Need to fix the extra white spaces on certain column names also values inside FWI, DC and Classes feature.

In [33]:
data.columns = data.columns.str.strip()
data.columns

Index(['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 'Rain', 'FFMC',
       'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes'],
      dtype='object')

In [34]:
# Checking White spaces on all values inside the Dataframe.
for i in data.columns:
    if i not in 'Classes':
        for j in data[i]:
            if " " in j:
                print(f"Column {i} = {j}")

Column DC = 14.6 9
Column FWI = 7.1 
Column FWI = fire   


In [35]:
data['Classes'].unique()

array(['not fire   ', 'fire   ', 'fire', 'fire ', 'not fire', 'not fire ',
       'not fire     ', nan, 'not fire    '], dtype=object)

In [36]:
data['Classes'] = data['Classes'].str.strip()

In [37]:
data['FWI'] = data['FWI'].str.replace(' ','')

In [38]:
data['DC'] = data['DC'].str.replace(' ','')

#### Since there was an misplaced value 'fire' on FWI Data, we need to fix it.

In [39]:
data[data['Classes'].isnull()]

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
165,14,7,2012,37,37,18,0.2,88.9,12.9,14.69,12.5,10.4,fire,


In [40]:
data['FWI']=data['FWI'].replace('fire', np.nan)

In [42]:
data['FWI'] = data['FWI'].astype(float)

#### Filling Median value to Nan:

In [43]:
median = data['FWI'].median()
data['FWI'] = data['FWI'].fillna(median) 

In [44]:
data.loc[[165]]

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
165,14,7,2012,37,37,18,0.2,88.9,12.9,14.69,12.5,10.4,4.2,


#### Need to add Region Column for the reference :

In [45]:
data.loc[:122, 'Region']=1
data.loc[122:, 'Region']=2

In [46]:
data.head(2)

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,1.0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,1.0


In [47]:
data.tail(2)

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
242,29,9,2012,24,54,18,0.1,79.7,4.3,15.2,1.7,5.1,0.7,not fire,2.0
243,30,9,2012,24,64,15,0.2,67.3,3.8,16.5,1.2,4.8,0.5,not fire,2.0


#### Changing Categorical feature Classes to 0 and 1:

In [48]:
# In actual data there is a misplaced value 
data['Classes'] = data.Classes.fillna('fire')

In [49]:
data['Classes'] = np.where(data['Classes']=='fire', 1, 0)

In [52]:
data.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,0,1.0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,0,1.0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,0,1.0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,0,1.0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,0,1.0


In [53]:
data.isnull().sum()

day            0
month          0
year           0
Temperature    0
RH             0
Ws             0
Rain           0
FFMC           0
DMC            0
DC             0
ISI            0
BUI            0
FWI            0
Classes        0
Region         0
dtype: int64

#### Data type conversion on all features:

In [54]:
for i in data.columns:
    if "." in str(data[i][2]):
        data[i] = data[i].astype(float)
    else:
        data[i] = data[i].astype(int)

In [55]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   day          244 non-null    int32  
 1   month        244 non-null    int32  
 2   year         244 non-null    int32  
 3   Temperature  244 non-null    int32  
 4   RH           244 non-null    int32  
 5   Ws           244 non-null    int32  
 6   Rain         244 non-null    float64
 7   FFMC         244 non-null    float64
 8   DMC          244 non-null    float64
 9   DC           244 non-null    float64
 10  ISI          244 non-null    float64
 11  BUI          244 non-null    float64
 12  FWI          244 non-null    float64
 13  Classes      244 non-null    int32  
 14  Region       244 non-null    float64
dtypes: float64(8), int32(7)
memory usage: 22.0 KB


#### Export to excel file:

In [58]:
data.to_excel("Datasets/Data_Cleaned.xlsx", index=False)

In [59]:
data.to_csv("Datasets/Data_Cleaned", index=False)