### I) Load dataset

In [18]:
import pandas as pd

# Read the fuel-consumption CSV into the working frame.
# The relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer="../data/FuelConsumption.csv")

In [19]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33.0,196.0
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29.0,221.0
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48.0,136.0
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25.0,255.0
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27.0,244.0


### II) Checking missing values 

In [22]:
# Count missing entries per column, listing the columns with the most gaps first.
df.isna().sum().sort_values(ascending=False)

FUELCONSUMPTION_COMB        4
CO2EMISSIONS                3
FUELCONSUMPTION_CITY        2
FUELCONSUMPTION_COMB_MPG    1
MODELYEAR                   0
ENGINESIZE                  0
VEHICLECLASS                0
MODEL                       0
MAKE                        0
FUELTYPE                    0
TRANSMISSION                0
CYLINDERS                   0
FUELCONSUMPTION_HWY         0
dtype: int64

In [27]:
# Missing-value summary: absolute count per column and its share of all rows (in %).
df_mv = df.isna().sum().sort_values(ascending=False)
df_MVpc = df_mv.div(len(df)).mul(100)

# Put both Series side by side; they share the same index (the column names of df).
df_m = pd.concat(
    [df_mv, df_MVpc],
    axis=1,
    keys=['Missing Values', '% Missing'],
)

# Show the ten columns with the most missing values.
df_m.head(10)

Unnamed: 0,Missing Values,% Missing
FUELCONSUMPTION_COMB,4,0.374883
CO2EMISSIONS,3,0.281162
FUELCONSUMPTION_CITY,2,0.187441
FUELCONSUMPTION_COMB_MPG,1,0.093721
MODELYEAR,0,0.0
ENGINESIZE,0,0.0
VEHICLECLASS,0,0.0
MODEL,0,0.0
MAKE,0,0.0
FUELTYPE,0,0.0


### III) Numerical feature normalization

**MinMaxScaler (Min-Max scaling):**

+ When to use: Use when you need to bound your data within a specific range, often [0, 1]. This is useful for certain algorithms (e.g., some neural networks) or when you need features to have a specific range for interpretability.
+ How it works: Rescales each feature linearly so that its minimum maps to 0 and its maximum maps to 1 (or to the bounds of a custom range): $x' = \dfrac{x - x_{\min}}{x_{\max} - x_{\min}}$.
    + Pros: Preserves the shape of the original distribution. Useful when a specific range is required.
    + Cons: Sensitive to outliers: a single extreme value stretches the min–max span and compresses the remaining scaled values into a narrow band. Not suitable for algorithms that assume normally distributed features.

In [31]:
from sklearn.preprocessing import MinMaxScaler

# Keep only the numeric columns; MinMaxScaler cannot handle string features.
num_feature = df.select_dtypes(include="number")

# Rescale every numeric column with MinMaxScaler's default range of [0, 1].
# fit_transform returns a plain ndarray, so both the column labels and the
# row index are re-attached; without `index=` the result would get a fresh
# RangeIndex and could misalign with `df` if rows were filtered upstream.
scaled_frame = pd.DataFrame(
    MinMaxScaler().fit_transform(num_feature),
    columns=num_feature.columns,
    index=num_feature.index,
)

# NOTE(review): every numeric column is scaled, including CO2EMISSIONS —
# confirm this is intended if that column is later used as a prediction target.
scaled_frame.head()

Unnamed: 0,MODELYEAR,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,0.0,0.135135,0.111111,0.207031,0.115385,0.180095,0.44898,0.231579
1,0.0,0.189189,0.111111,0.257812,0.179487,0.232227,0.367347,0.297368
2,0.0,0.067568,0.111111,0.054688,0.057692,0.056872,0.755102,0.073684
3,0.0,0.337838,0.333333,0.316406,0.269231,0.303318,0.285714,0.386842
4,0.0,0.337838,0.333333,0.292969,0.24359,0.279621,0.326531,0.357895
