In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler


In [2]:
# Relative location of the raw CO2 emissions CSV.
caminho = "../dados/CO2_Emissions_Canada.csv"
# Read the whole file into a DataFrame; the last expression shows its (rows, columns) shape.
df_raw = pd.read_csv(filepath_or_buffer=caminho)
df_raw.shape

(7385, 12)

In [3]:
# Drop exact duplicate rows, keeping the first occurrence of each.
df_trat = df_raw.drop_duplicates(keep="first")
print(f"Dataset após remover duplicatas: {df_trat.shape}")

# Drop the redundant column.
base = df_trat.drop('Fuel Consumption Comb (mpg)', axis="columns")

Dataset após remover duplicatas: (6282, 12)


In [4]:
# Name of the column to predict; every other selected column is used as a feature.
target = "CO2 Emissions(g/km)"

In [5]:
# Numeric feature columns.
num_features = [
    "Engine Size(L)",
    "Cylinders",
    "Fuel Consumption City (L/100 km)",
    "Fuel Consumption Hwy (L/100 km)",
    "Fuel Consumption Comb (L/100 km)",
]

# Categorical feature columns.
cat_features = ["Make", "Model", "Vehicle Class", "Transmission", "Fuel Type"]

# Full feature list: numeric columns first, then the categorical ones.
features = [*num_features, *cat_features]

In [6]:
# Feature matrix: all rows of `base`, restricted to the selected feature columns.
X = base.loc[:, features]
# Target vector: the single column named by `target`.
y = base.loc[:, target]

In [7]:
# Print a label followed by the first rows of the feature matrix, one per line.
print("Features (X):", X.head(), sep="\n")

Features (X):
   Engine Size(L)  Cylinders  Fuel Consumption City (L/100 km)  \
0             2.0          4                               9.9   
1             2.4          4                              11.2   
2             1.5          4                               6.0   
3             3.5          6                              12.7   
4             3.5          6                              12.1   

   Fuel Consumption Hwy (L/100 km)  Fuel Consumption Comb (L/100 km)   Make  \
0                              6.7                               8.5  ACURA   
1                              7.7                               9.6  ACURA   
2                              5.8                               5.9  ACURA   
3                              9.1                              11.1  ACURA   
4                              8.7                              10.6  ACURA   

        Model Vehicle Class Transmission Fuel Type  
0         ILX       COMPACT          AS5         Z  
1       

In [8]:
# Print a label (preceded by a blank line) followed by the first target values.
print("\nTarget (y):", y.head(), sep="\n")


Target (y):
0    196
1    221
2    136
3    255
4    244
Name: CO2 Emissions(g/km), dtype: int64


### Transformar categóricas em numéricas (One-Hot Encoding ou Label Encoding)  
    Usamos One-Hot Encoding, pois não há uma ordem lógica entre as categorias das variáveis categóricas.


In [9]:
# One-hot encode the categorical columns; numeric columns pass through unchanged.
# The encoding is built from `base[features]` instead of from `X` itself: the
# self-referential form `X = pd.get_dummies(X, ...)` could only run once, because on a
# second run the categorical columns no longer exist in X and pandas raises a KeyError.
X = pd.get_dummies(base[features], columns=cat_features)

In [10]:
# Display the feature matrix after one-hot encoding.
X

Unnamed: 0,Engine Size(L),Cylinders,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Make_ACURA,Make_ALFA ROMEO,Make_ASTON MARTIN,Make_AUDI,Make_BENTLEY,...,Transmission_AV7,Transmission_AV8,Transmission_M5,Transmission_M6,Transmission_M7,Fuel Type_D,Fuel Type_E,Fuel Type_N,Fuel Type_X,Fuel Type_Z
0,2.0,4,9.9,6.7,8.5,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,2.4,4,11.2,7.7,9.6,True,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
2,1.5,4,6.0,5.8,5.9,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,True
3,3.5,6,12.7,9.1,11.1,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,3.5,6,12.1,8.7,10.6,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7380,2.0,4,10.7,7.7,9.4,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7381,2.0,4,11.2,8.3,9.9,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7382,2.0,4,11.7,8.6,10.3,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7383,2.0,4,11.2,8.3,9.9,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


### Padronizar ou normalizar features numéricas
    A normalização (min-max) não é recomendada neste caso, pois é sensível a outliers: um carro com consumo de combustível muito alto definiria o limite da escala e comprimiria os carros de consumo normal em uma faixa estreita. Já a padronização é menos afetada por esses valores extremos, sendo mais adequada para este tipo de caso.


In [11]:
# Scaler used for the numeric features; centering and unit-variance scaling are the
# library defaults, spelled out here to make the intent explicit.
scaler = StandardScaler(with_mean=True, with_std=True)

In [12]:
# Standardize the numeric columns of X in place.
# The scaler is fitted on the raw values in `base`, not on `X[num_features]`: fitting on
# X made a re-run of this cell refit the scaler on already-standardized data, silently
# replacing its learned mean/scale with ~0/~1.
# Assumes X still has the same rows, in the same order, as `base` (the result is an
# array and is assigned by position).
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split
# planned later in the notebook; consider fitting on the training portion only so that
# test-set statistics do not leak into the features.
X[num_features] = scaler.fit_transform(base[num_features])

In [13]:
# Display the feature matrix after the numeric columns were standardized.
X

Unnamed: 0,Engine Size(L),Cylinders,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Make_ACURA,Make_ALFA ROMEO,Make_ASTON MARTIN,Make_AUDI,Make_BENTLEY,...,Transmission_AV7,Transmission_AV8,Transmission_M5,Transmission_M6,Transmission_M7,Fuel Type_D,Fuel Type_E,Fuel Type_N,Fuel Type_X,Fuel Type_Z
0,-0.851086,-0.876934,-0.762844,-1.040321,-0.854490,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,-0.558066,-0.876934,-0.396934,-0.601475,-0.481184,True,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
2,-1.217362,-0.876934,-1.860575,-1.435283,-1.736851,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,True
3,0.247740,0.206429,0.025270,0.012910,0.027870,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,0.247740,0.206429,-0.143611,-0.162629,-0.141815,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7380,-0.851086,-0.876934,-0.537668,-0.601475,-0.549058,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7381,-0.851086,-0.876934,-0.396934,-0.338167,-0.379373,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7382,-0.851086,-0.876934,-0.256199,-0.206513,-0.243625,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7383,-0.851086,-0.876934,-0.396934,-0.338167,-0.379373,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


### Dividir dataset em treino/teste
### Treinar o modelo de regressão