# Utilização do Otimizador AdamW em vez do Adam


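No AdamW, o decaimento de peso (weight decay) é desacoplado da atualização baseada no gradiente e aplicado diretamente aos pesos, o que tende a ajudar o modelo a generalizar melhor, reduzindo o risco de overfitting (isto é, de ele ficar "viciado" nos dados de treino).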

In [2]:
import pandas as pd
import tensorflow as tf
import sklearn
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import OneHotEncoder # Utilizado para processamento de colunas categóricas
from sklearn.compose import ColumnTransformer

In [3]:
# Dataset source: https://www.kaggle.com/datasets/vfsousas/autos
# The CSV is decoded as ISO-8859-1 and loaded into the working DataFrame `base`.
base = pd.read_csv('autos.csv', encoding = 'ISO-8859-1')
base

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-24 11:52:17,Golf_3_1.6,privat,Angebot,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,privat,Angebot,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",privat,Angebot,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3TÜRER,privat,Angebot,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,privat,Angebot,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371523,2016-03-14 17:48:27,Suche_t4___vito_ab_6_sitze,privat,Angebot,2200,test,,2005,,0,,20000,1,,sonstige_autos,,2016-03-14 00:00:00,0,39576,2016-04-06 00:46:52
371524,2016-03-05 19:56:21,Smart_smart_leistungssteigerung_100ps,privat,Angebot,1199,test,cabrio,2000,automatik,101,fortwo,125000,3,benzin,smart,nein,2016-03-05 00:00:00,0,26135,2016-03-11 18:17:12
371525,2016-03-19 18:57:12,Volkswagen_Multivan_T4_TDI_7DC_UY2,privat,Angebot,9200,test,bus,1996,manuell,102,transporter,150000,3,diesel,volkswagen,nein,2016-03-19 00:00:00,0,87439,2016-04-07 07:15:26
371526,2016-03-20 19:41:08,VW_Golf_Kombi_1_9l_TDI,privat,Angebot,3400,test,kombi,2002,manuell,100,golf,150000,6,diesel,volkswagen,,2016-03-20 00:00:00,0,40764,2016-03-24 12:45:21


In [4]:
# Drop the attributes that will not influence the goal of this notebook
# (predicting the car price). All five columns are removed in a single call;
# `columns=` is the label-based equivalent of dropping with axis = 1.
base = base.drop(
    columns = ['dateCrawled', 'dateCreated', 'nrOfPictures', 'postalCode', 'lastSeen']
)
base

Unnamed: 0,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
0,Golf_3_1.6,privat,Angebot,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,
1,A5_Sportback_2.7_Tdi,privat,Angebot,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja
2,"Jeep_Grand_Cherokee_""Overland""",privat,Angebot,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,
3,GOLF_4_1_4__3TÜRER,privat,Angebot,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein
4,Skoda_Fabia_1.4_TDI_PD_Classic,privat,Angebot,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371523,Suche_t4___vito_ab_6_sitze,privat,Angebot,2200,test,,2005,,0,,20000,1,,sonstige_autos,
371524,Smart_smart_leistungssteigerung_100ps,privat,Angebot,1199,test,cabrio,2000,automatik,101,fortwo,125000,3,benzin,smart,nein
371525,Volkswagen_Multivan_T4_TDI_7DC_UY2,privat,Angebot,9200,test,bus,1996,manuell,102,transporter,150000,3,diesel,volkswagen,nein
371526,VW_Golf_Kombi_1_9l_TDI,privat,Angebot,3400,test,kombi,2002,manuell,100,golf,150000,6,diesel,volkswagen,


In [5]:
base['name'].value_counts()

name
Ford_Fiesta                                              657
BMW_318i                                                 627
Opel_Corsa                                               622
Volkswagen_Golf_1.4                                      603
BMW_316i                                                 523
                                                        ... 
Audi_A4_Avant_Klima_Gruene_Plakette_TÜV_&AU_NEU_XENON      1
Renault_clio_in_gold_450VB_!!                              1
Fiat_Doblo_1.6_Multijet                                    1
Renault_Laguna_1                                           1
BMW_M135i_vollausgestattet_NP_52.720____Euro               1
Name: count, Length: 233531, dtype: int64

In [6]:
base = base.drop('name', axis = 1)
# The 'name' column is dropped because it has too many unique values (233531 distinct names), which could hurt model training.
# Besides, other columns already identify the car's model and brand.

In [7]:
base['seller'].value_counts()

seller
privat        371525
gewerblich         3
Name: count, dtype: int64

In [8]:
base = base.drop('seller', axis = 1)
# The 'seller' column is almost constant: only 3 records are 'gewerblich' and all the others (371525) are 'privat',
# so it carries practically no information for training.
# With so few non-private listings, the model would not be able to learn prices for cars sold by dealers anyway.

In [9]:
base['offerType'].value_counts()

offerType
Angebot    371516
Gesuch         12
Name: count, dtype: int64

In [10]:
base = base.drop('offerType', axis = 1)
# Same situation as the 'seller' column: only 12 records are 'Gesuch'; all the others (371516) are 'Angebot'

In [11]:
base

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
0,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,
1,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja
2,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,
3,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein
4,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein
...,...,...,...,...,...,...,...,...,...,...,...,...
371523,2200,test,,2005,,0,,20000,1,,sonstige_autos,
371524,1199,test,cabrio,2000,automatik,101,fortwo,125000,3,benzin,smart,nein
371525,9200,test,bus,1996,manuell,102,transporter,150000,3,diesel,volkswagen,nein
371526,3400,test,kombi,2002,manuell,100,golf,150000,6,diesel,volkswagen,


In [12]:
base.loc[base['price'] <= 10]
# Author's note: there are more than 12 thousand records with a price of 10.00 or less

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
7,0,test,limousine,1980,manuell,50,andere,40000,7,benzin,volkswagen,nein
40,0,test,,1990,,0,corsa,150000,1,benzin,opel,
60,1,control,suv,1994,manuell,286,,150000,11,,sonstige_autos,
91,1,control,limousine,1995,manuell,113,e_klasse,150000,4,diesel,mercedes_benz,nein
115,0,test,,2017,manuell,0,golf,5000,12,benzin,volkswagen,
...,...,...,...,...,...,...,...,...,...,...,...,...
371356,0,control,,2000,manuell,65,corsa,150000,0,,opel,ja
371392,0,test,kleinwagen,2002,manuell,60,fiesta,150000,3,benzin,ford,
371402,0,control,kleinwagen,1999,manuell,53,swift,150000,3,benzin,suzuki,
371431,0,control,kleinwagen,1999,manuell,37,arosa,150000,7,benzin,seat,ja


In [13]:
base['price'].mean()
# One option would be to compute the mean price and assign it to the records priced at 10.00 or less,
# but that is not good practice because it can distort the model.
# Instead, those records are removed, since they do not make up a large share of the data.

17295.14186548524

In [14]:
# Keep only the records priced above 10.00 (records at 10.00 or less are discarded)
base = base.loc[base['price'].gt(10)]

In [16]:
base.loc[base['price'] > 350000] # Inspecting the records priced above 350,000.00

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
1846,579000,control,coupe,1980,manuell,277,andere,20000,12,benzin,bmw,nein
10649,420000,control,coupe,2004,manuell,483,911,50000,4,benzin,porsche,nein
14663,11111111,control,coupe,2003,manuell,64,polo,150000,2,benzin,volkswagen,
16889,1000000,control,kombi,1998,,0,mondeo,150000,0,benzin,ford,ja
20143,1250000,test,coupe,2016,manuell,500,911,5000,3,benzin,porsche,nein
...,...,...,...,...,...,...,...,...,...,...,...,...
364171,3890000,test,coupe,2006,,799,,5000,7,,sonstige_autos,nein
365461,599000,control,coupe,1980,manuell,377,andere,5000,3,benzin,bmw,nein
366653,99999999,control,cabrio,1996,manuell,192,3er,150000,0,,bmw,
366861,3895000,test,coupe,2006,,799,,5000,4,benzin,sonstige_autos,nein


In [17]:
# Remove the records priced above 350,000.00, which are treated as inconsistent
# values (the inspection cell shows entries such as 11111111 and 99999999).
# `<=` is used so that a listing priced at exactly 350,000 is kept: the stated
# intent, and the inspection with `> 350000`, only target strictly greater prices.
base = base.loc[base['price'] <= 350000]

In [18]:
base.loc[pd.isnull(base['vehicleType'])] # Listing the records with a missing value in the 'vehicleType' column

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
0,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,
16,300,test,,2016,,60,polo,150000,0,benzin,volkswagen,
22,2900,test,,2018,manuell,90,meriva,150000,5,benzin,opel,nein
26,5555,control,,2017,manuell,125,c4,125000,4,,citroen,nein
31,899,control,,2016,manuell,60,clio,150000,6,benzin,renault,
...,...,...,...,...,...,...,...,...,...,...,...,...
371495,180,control,,1995,,0,,125000,3,benzin,opel,
371504,2600,control,,2005,automatik,0,c_klasse,150000,9,,mercedes_benz,
371509,1900,test,,2000,manuell,110,,150000,7,,volkswagen,nein
371519,5250,control,,2016,automatik,150,159,150000,12,,alfa_romeo,nein


In [19]:
base['vehicleType'].value_counts()

vehicleType
limousine     93614
kleinwagen    78014
kombi         65921
bus           29699
cabrio        22509
coupe         18386
suv           14477
andere         3125
Name: count, dtype: int64

In [20]:
base['vehicleType'].mode()
# The mode (most frequent value) will be used to fill the missing values of the 'vehicleType' column

0    limousine
Name: vehicleType, dtype: object

In [21]:
base.loc[pd.isnull(base['gearbox'])]

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
15,450,test,kleinwagen,1910,,0,ka,5000,0,benzin,ford,
16,300,test,,2016,,60,polo,150000,0,benzin,volkswagen,
32,245,test,limousine,1994,,0,golf,150000,2,benzin,volkswagen,nein
37,1500,test,,2016,,0,kangoo,150000,1,diesel,renault,nein
70,1200,test,coupe,2001,,0,astra,150000,0,,opel,
...,...,...,...,...,...,...,...,...,...,...,...,...
371443,3300,control,kombi,2006,,0,touran,150000,7,diesel,volkswagen,
371460,3500,control,,1995,,0,polo,150000,0,,volkswagen,
371486,350,control,kleinwagen,1996,,65,punto,150000,0,,fiat,
371495,180,control,,1995,,0,,125000,3,benzin,opel,


In [22]:
base['gearbox'].value_counts()
# The mode (most frequent value) will be used to fill the missing values of the 'gearbox' column

gearbox
manuell      266547
automatik     75508
Name: count, dtype: int64

In [23]:
base.loc[pd.isnull(base['model'])]

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
1,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja
83,350,control,kleinwagen,1997,manuell,54,,150000,3,,fiat,ja
139,1450,control,limousine,1992,manuell,136,,150000,0,,audi,nein
156,6799,control,kleinwagen,2009,,60,,20000,5,benzin,volkswagen,nein
165,500,control,kleinwagen,1999,manuell,0,,150000,0,benzin,renault,nein
...,...,...,...,...,...,...,...,...,...,...,...,...
371399,560,control,kleinwagen,2001,automatik,170,,90000,0,benzin,fiat,ja
371476,9400,control,kombi,2007,manuell,200,,150000,4,diesel,sonstige_autos,ja
371495,180,control,,1995,,0,,125000,3,benzin,opel,
371509,1900,test,,2000,manuell,110,,150000,7,,volkswagen,nein


In [24]:
base['model'].value_counts()

model
golf               28989
andere             25560
3er                19905
polo               12604
corsa              12149
                   ...  
serie_2                8
rangerover             6
serie_3                3
serie_1                1
discovery_sport        1
Name: count, Length: 251, dtype: int64

In [25]:
base.loc[pd.isnull(base['fuelType'])]

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
9,999,test,kleinwagen,1998,manuell,101,golf,150000,0,,volkswagen,
13,2500,control,kombi,2004,manuell,131,passat,150000,2,,volkswagen,nein
26,5555,control,,2017,manuell,125,c4,125000,4,,citroen,nein
36,1600,control,andere,1991,manuell,75,kadett,70000,0,,opel,
41,7500,control,limousine,2002,automatik,306,e_klasse,150000,4,,mercedes_benz,
...,...,...,...,...,...,...,...,...,...,...,...,...
371496,3850,test,cabrio,2006,manuell,108,2_reihe,125000,2,,peugeot,nein
371504,2600,control,,2005,automatik,0,c_klasse,150000,9,,mercedes_benz,
371509,1900,test,,2000,manuell,110,,150000,7,,volkswagen,nein
371519,5250,control,,2016,automatik,150,159,150000,12,,alfa_romeo,nein


In [26]:
base['fuelType'].value_counts()

fuelType
benzin     217582
diesel     106002
lpg          5222
cng           557
hybrid        271
andere        165
elektro       101
Name: count, dtype: int64

In [27]:
base.loc[pd.isnull(base['notRepairedDamage'])]

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
0,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,
2,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,
8,14500,control,bus,2014,manuell,125,c_max,30000,8,benzin,ford,
9,999,test,kleinwagen,1998,manuell,101,golf,150000,0,,volkswagen,
12,999,control,kombi,1995,manuell,115,passat,150000,11,benzin,volkswagen,
...,...,...,...,...,...,...,...,...,...,...,...,...
371507,5999,test,kombi,2005,manuell,140,a4,150000,4,diesel,audi,
371514,999,control,cabrio,2000,manuell,95,megane,150000,4,benzin,renault,
371515,1690,test,kombi,2004,manuell,55,fabia,150000,4,benzin,skoda,
371523,2200,test,,2005,,0,,20000,1,,sonstige_autos,


In [28]:
base['notRepairedDamage'].value_counts()

notRepairedDamage
nein    259301
ja       34004
Name: count, dtype: int64

In [30]:
# Missing categorical values are filled with the mode of each column.
# The replacements are collected in a dictionary mapping
# column name -> value used wherever that column is null.
valores = dict(
    vehicleType='limousine',
    gearbox='manuell',
    model='golf',
    fuelType='benzin',
    notRepairedDamage='nein',
)

In [31]:
base = base.fillna(value = valores) # Replace the missing values column by column, using the `valores` mapping

In [35]:
base.isnull().sum() # Check whether any missing values remain (count of nulls per column)

price                  0
abtest                 0
vehicleType            0
yearOfRegistration     0
gearbox                0
powerPS                0
model                  0
kilometer              0
monthOfRegistration    0
fuelType               0
brand                  0
notRepairedDamage      0
dtype: int64

In [32]:
X = base.iloc[:, 1:12].values # Predictor attributes: columns at positions 1..11 (every column except 'price')
X

array([['test', 'limousine', 1993, ..., 'benzin', 'volkswagen', 'nein'],
       ['test', 'coupe', 2011, ..., 'diesel', 'audi', 'ja'],
       ['test', 'suv', 2004, ..., 'diesel', 'jeep', 'nein'],
       ...,
       ['test', 'bus', 1996, ..., 'diesel', 'volkswagen', 'nein'],
       ['test', 'kombi', 2002, ..., 'diesel', 'volkswagen', 'nein'],
       ['control', 'limousine', 2013, ..., 'benzin', 'bmw', 'nein']],
      dtype=object)

In [33]:
y = base.iloc[:, 0].values # Target attribute: the first column, 'price' (the value of the car)
y

array([  480, 18300,  9800, ...,  9200,  3400, 28990], dtype=int64)

In [34]:
# Preprocessing: one-hot encode the categorical columns so that every feature becomes numeric
onehotencoder = ColumnTransformer(transformers=[("OneHot", OneHotEncoder(), 
                                                 [0, 1, 3, 5, 8, 9, 10] # Positions in X of the categorical columns: abtest, vehicleType, gearbox, model, fuelType, brand, notRepairedDamage
                                                 )], 
                                  remainder='passthrough' # Columns that are not encoded are kept as they are
                                  )

In [35]:
# Fit the encoder on X and transform it; .toarray() turns the result into a dense NumPy array
X = onehotencoder.fit_transform(X).toarray()
X

array([[0.00e+00, 1.00e+00, 0.00e+00, ..., 0.00e+00, 1.50e+05, 0.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, ..., 1.90e+02, 1.25e+05, 5.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, ..., 1.63e+02, 1.25e+05, 8.00e+00],
       ...,
       [0.00e+00, 1.00e+00, 0.00e+00, ..., 1.02e+02, 1.50e+05, 3.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, ..., 1.00e+02, 1.50e+05, 6.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, ..., 3.20e+02, 5.00e+04, 8.00e+00]])

In [43]:
base.shape

(359291, 12)

In [36]:
# Build the neural network: two hidden ReLU layers followed by a single linear output.
# Layer sizes are derived from the encoded feature matrix instead of being
# hard-coded, so the model stays valid if the preprocessing produces a
# different number of columns (316 with the current data).
n_features = X.shape[1]
n_hidden = (n_features + 1) // 2 # (inputs + outputs) / 2, rounded down -> 158 for 316 inputs
regressor = Sequential([
    tf.keras.layers.InputLayer(shape = (n_features,)), # One input per encoded column
    tf.keras.layers.Dense(units = n_hidden, activation = 'relu'),
    tf.keras.layers.Dense(units = n_hidden, activation = 'relu'),
    tf.keras.layers.Dense(units = 1, activation = 'linear') # Regression: one output with a linear activation
])

In [37]:
regressor.summary()

In [38]:
# Compile with mean absolute error as the loss and AdamW as the optimizer.
# The optimizer is passed by name, so no learning rate or weight decay is set explicitly here
# (presumably the library defaults apply — confirm against the installed Keras version).
# The metric is the same quantity as the loss, which is why both values in the training log are identical.
regressor.compile(loss = 'mean_absolute_error', optimizer = 'adamw', metrics = ['mean_absolute_error'])

In [39]:
# Train for 100 epochs with mini-batches of 300 samples.
# NOTE(review): all of X/y is passed for training and no validation data is given,
# so the logged loss is a training-set metric only.
# NOTE(review): no random seed is set in the visible cells, so results presumably vary between runs.
regressor.fit(X, y, batch_size = 300, epochs = 100)

Epoch 1/100
[1m1198/1198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 4210.7310 - mean_absolute_error: 4210.7310
Epoch 2/100
[1m1198/1198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 3378.2725 - mean_absolute_error: 3378.2725
Epoch 3/100
[1m1198/1198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 3010.4233 - mean_absolute_error: 3010.4233
Epoch 4/100
[1m1198/1198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 2863.7964 - mean_absolute_error: 2863.7964
Epoch 5/100
[1m1198/1198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 2805.7100 - mean_absolute_error: 2805.7100
Epoch 6/100
[1m1198/1198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 2745.9521 - mean_absolute_error: 2745.9521
Epoch 7/100
[1m1198/1198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 2717.3865 - mean_absolute_error: 2717.3865
Epoch 8/100
[1m1198

<keras.src.callbacks.history.History at 0x23fc96b2750>

In [44]:
X.shape

(359291, 316)

In [43]:
previsoes = regressor.predict(X) # Predicting on the very same data that was used for training

[1m11228/11228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 956us/step


In [49]:
y

array([  480, 18300,  9800, ...,  9200,  3400, 28990], dtype=int64)

# Resultados obtidos com o AdamW

In [45]:
previsoes

array([[  830.47205],
       [11053.49   ],
       [13441.554  ],
       ...,
       [ 6290.5957 ],
       [ 3210.512  ],
       [27277.705  ]], dtype=float32)

In [51]:
# Mean of the actual prices next to the mean of the predicted prices
y.mean(), previsoes.mean()

(5916.833945186492, 5488.493)

# Resultados obtidos com o Adam

In [54]:
previsoes

array([[  797.3094],
       [ 9669.138 ],
       [11807.787 ],
       ...,
       [ 4633.4966],
       [ 2291.7415],
       [26168.746 ]], dtype=float32)

In [59]:
y.mean()

5916.833945186492

In [60]:
previsoes.mean()

4795.3066