<a href="https://colab.research.google.com/github/HoussemEddineElimam/HoussemEddineElimam/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing the libraries**

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# next we will import the dataset


# **Importing the data**

In [3]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,Incident_ID,Date,Location,Area_Burned (Acres),Homes_Destroyed,Businesses_Destroyed,Vehicles_Damaged,Injuries,Fatalities,Estimated_Financial_Loss (Million $),Cause
0,INC1000,2020-11-22,Sonoma County,14048,763,474,235,70,19,2270.57,Lightning
1,INC1001,2021-09-23,Sonoma County,33667,1633,4,263,100,2,1381.14,Lightning
2,INC1002,2022-02-10,Shasta County,26394,915,291,31,50,6,2421.96,Human Activity
3,INC1003,2021-05-17,Sonoma County,20004,1220,128,34,28,0,3964.16,Unknown
4,INC1004,2021-09-22,Sonoma County,40320,794,469,147,0,15,1800.09,Unknown


In [4]:
## iloc used for integer-based indexing
X = df.iloc[: , :-1].values
# we di this to create the matrix of features
# the first ":" means i selected all the rows , and the second ":-1" means i selected columns from number 0
# to the last column (the last column isn't included)

X

array([['INC1000', '2020-11-22', 'Sonoma County', 14048, 763, 474, 235,
        70, 19, 2270.57],
       ['INC1001', '2021-09-23', 'Sonoma County', 33667, 1633, 4, 263,
        100, 2, 1381.14],
       ['INC1002', '2022-02-10', 'Shasta County', 26394, 915, 291, 31,
        50, 6, 2421.96],
       ['INC1003', '2021-05-17', 'Sonoma County', 20004, 1220, 128, 34,
        28, 0, 3964.16],
       ['INC1004', '2021-09-22', 'Sonoma County', 40320, 794, 469, 147,
        0, 15, 1800.09],
       ['INC1005', '2023-05-17', 'Butte County', 48348, 60, 205, 21, 58,
        2, 4458.29],
       ['INC1006', '2018-04-29', 'San Diego County', 16038, 1404, 137,
        64, 13, 11, 713.8],
       ['INC1007', '2015-08-23', 'Napa Valley', 24519, 121, 28, 125, 0,
        5, 2001.33],
       ['INC1008', '2023-12-08', 'Sonoma County', 20418, 299, 264, 208,
        33, 4, 1012.23],
       ['INC1009', '2018-12-01', 'Butte County', 21351, 275, 196, 153,
        41, 2, 2611.9],
       ['INC1010', '2018-10-30', 'Riv

## ***Handling Missing Data***

In [5]:
print(df.isnull().sum())  # Missing values per column
print("The total missing values :")
print(df.isnull().sum().sum())  # Total missing values



Incident_ID                             0
Date                                    0
Location                                0
Area_Burned (Acres)                     0
Homes_Destroyed                         0
Businesses_Destroyed                    0
Vehicles_Damaged                        0
Injuries                                0
Fatalities                              0
Estimated_Financial_Loss (Million $)    0
Cause                                   0
dtype: int64
The total missing values :
0


## ***Acording to this results so there is no missing value in this dataset***

In [6]:
# Assume the Injuries have some missing values
X = df.iloc[:, [1, 3, 7]]
y = df.loc[:,"Cause"]
# To handle missing values with scikit-learn
from sklearn.impute import SimpleImputer
import numpy as np

# Create an imputer object with the mean strategy (works only with numeric data)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit the imputer to the selected columns
imputer.fit(X.iloc[:, [1,2]])  # Learn how to fill missing values based on the strategy (only for columns [1,2])

# Transform the selected columns with the learned statistics
X.iloc[:, [1,2]] = imputer.transform(X.iloc[:, [1,2]])  # Replace missing values with the computed statistics



## Encoding Categorical Data

In [7]:
# label encoding : Converts each category into a unique integer.
# One-Hot Encoding : Converts each category into a binary vector (1 or 0) for each possible category
# Binary Encoding : Converts categories into binary numbers, and then splits them into separate columns.
# Frequency Encoding : Replaces each category with the frequency or count of occurrences of that category in the data.
# Ordinal encoding : similar to label encoding but this it replace the category into a specific integer where the order is matters

In [8]:
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import  OneHotEncoder
# ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[3])],remainder="passthrough")
# X = np.array(ct.fit_transform(X))


In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)


# Spliting data set into training sets and test sets

In [10]:
from sklearn.model_selection import train_test_split

X_train , X_test , y_train , y_test = train_test_split(X,y , test_size=0.2) # 20% observations will go to the test set and the rest will go to the training set
# the only role here is spliting the data into sets randomly so it will help us to train the machine learning model

In [11]:
## we have to apply feature scaling after spliting the dataset into train and test sets

## Feature Scaling

Standarization Or Normalization

In [15]:
from sklearn.preprocessing import StandardScaler  # Standardize features by removing the mean and scaling to unit variance.

sc = StandardScaler()

# Fit and transform X_train (apply scaling only to columns 2 and beyond)
X_train.iloc[:, 2:] = sc.fit_transform(X_train.iloc[:, 2:])

# Transform X_test using the same scaler (apply scaling only to columns 2 and beyond)
X_test.iloc[:, 2:] = sc.transform(X_test.iloc[:, 2:])


  1.54412409  0.91762983  2.13376575  1.47041888 -0.04053786 -0.11424307
  0.36484078 -1.21982118 -1.47778941  1.76523971 -0.99870556 -1.21982118
  1.6178293   0.29113557 -0.3722113  -0.59332692 -0.29850609  0.51225119
  1.72838711 -0.92500035  0.29113557 -0.26165349  1.10189285  0.07001995
 -1.18296858  0.69651421  1.80209232  0.36484078  0.14372515 -0.99870556
 -1.21982118 -1.03555816  0.03316734  0.6596616  -0.99870556 -0.18794828
 -1.03555816  0.40169338 -1.03555816  0.47539859 -0.51962171 -1.07241077
  2.20747096 -0.96185295  0.6596616   1.10189285 -0.48276911 -0.51962171
 -0.99870556 -0.29850609  0.88077723  1.17559806 -0.18794828 -1.47778941
 -0.85129514  1.50727149 -0.4459165   2.09691315 -0.81444254 -0.26165349
 -0.88814775  0.14372515 -1.21982118  0.88077723 -0.22480088  1.28615587
 -0.4459165   0.07001995 -1.21982118 -0.07739047  0.32798817  0.5859564
 -0.99870556 -1.36723159]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_train.i