# <p style="background: #F58A07; color:white; font-size:250%; text-align:Center; border-radius: 20px 100px;"> 💸 Precios de automoviles usados 💹 </p>


## <p style="background-color:#ffb703; color:black; font-family:newtimeroman; font-size:200%; text-align:left; border-radius: 10px 10px;"> 🔎 Análisis Exploratorio de Datos - EDA </p>

In [4]:
# Librerias iniciales
import pandas as pd 

# Carga del dataset
cars = pd.read_csv('data/CarsUK.csv')

# primeros 10 reguistros del dataset
cars.head()

Unnamed: 0,maker,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,price
0,cclass,C Class,2020,Automatic,1200,Diesel,,,2.0,30495
1,cclass,C Class,2020,Automatic,1000,Petrol,,,1.5,29989
2,cclass,C Class,2020,Automatic,500,Diesel,,,2.0,37899
3,cclass,C Class,2019,Automatic,5000,Diesel,,,2.0,30399
4,cclass,C Class,2019,Automatic,4500,Diesel,,,2.0,29899


In [5]:
from pandas_profiling import ProfileReport

profile = ProfileReport(cars, title='Análisis del conjunto de datos de automóviles en bruto', explorative=True)

In [6]:
# Primeros pasos en el dataset, brinda una extensa información sobre el dataset, como la cantidad de registros, la cantidad de columnas, etc.
# Tomando como punto de partida el reporte anterior, se puede decidir que hacer con los datos que se encuentran disponibles.

profile.to_file('reporte_cars.html')

Summarize dataset: 100%|██████████| 61/61 [00:11<00:00,  5.14it/s, Completed]                     
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.75s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.45s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 90.21it/s]


## <p style="background-color:#ffb703; color:black; font-family:newtimeroman; font-size:200%; text-align:left; border-radius: 10px 10px;"> 🔮 Feature engineering </p>

In [7]:
# En el reporte se destaca que existen datos duplicados en el dataset. 

# Mostramos la cantidad de registros
len(cars)

108540

In [8]:
# Mostramos la cantidad de registros duplicados
cars.duplicated().sum()

2273

In [9]:
# Eliminamos los registros duplicados
cars = cars.drop_duplicates(keep='first')

In [10]:
# Mostramos la cantidad de registros (actualizado)
len(cars)

106267

## <p style="background-color:#ffb703; color:black; font-family:newtimeroman; font-size:200%; text-align:left; border-radius: 10px 10px;"> 🪓 Dividir el dataset </p>

In [11]:
# Librerias para dividir mejor el dataset
from sklearn.model_selection import train_test_split

import numpy as np

In [12]:

rest, test = train_test_split(cars, test_size=0.2, shuffle=True) # 20% de 100 = 20

train, val = train_test_split(rest, test_size=0.25,shuffle=True ) # 25% de 80 = 20

distributions = np.array([len(train), len(val), len(test)])

print(distributions)
print(distributions/ len(cars))

[63759 21254 21254]
[0.59998871 0.20000565 0.20000565]


In [13]:
# ??cars.drop_duplicates

In [14]:
## Cambiamos las variables categoricas a numericas mediante dummies/
# Prueba en los valores maker == Fabricante 

# Valores originales
train[['maker']]

Unnamed: 0,maker
17610,audi
16846,audi
47318,ford
95971,merc
103832,merc
...,...
95233,hyundi
12809,audi
3657,cclass
41962,ford


In [15]:
# Valores dummies
pd.get_dummies(train[['maker']])

# Aqui se dividen los valores originales en valores dummies (1 y 0) pero este no guarda el estado y mostramos una alternativa

Unnamed: 0,maker_audi,maker_bmw,maker_cclass,maker_focus,maker_ford,maker_hyundi,maker_merc,maker_skoda,maker_toyota,maker_vauxhall,maker_vw
17610,1,0,0,0,0,0,0,0,0,0,0
16846,1,0,0,0,0,0,0,0,0,0,0
47318,0,0,0,0,1,0,0,0,0,0,0
95971,0,0,0,0,0,0,1,0,0,0,0
103832,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
95233,0,0,0,0,0,1,0,0,0,0,0
12809,1,0,0,0,0,0,0,0,0,0,0
3657,0,0,1,0,0,0,0,0,0,0,0
41962,0,0,0,0,1,0,0,0,0,0,0


## <p style="background-color:#ffb703; color:black; font-family:newtimeroman; font-size:200%; text-align:left; border-radius: 10px 10px;"> ✖️ One hot enconding </p>

In [16]:
# Libreria para cambiar las variables categoricas a numericas
from sklearn.preprocessing import OneHotEncoder

In [17]:
# Se crea un objeto encoder
make_encoder = OneHotEncoder()

In [18]:
# Se le pasa el dataset en la variable train
make_encoder.fit(train[['maker']])


OneHotEncoder()

In [19]:
# Matriz de valores al estilo dummies 
# mkr = make_encoder.transform(train[['maker']]) Original
mkr = make_encoder.transform(train[['maker']]).todense() # Se transforma a una matriz densa
mkr

matrix([[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 1.]])

In [20]:
# Categorias dentro de la matriz del encoder 
make_encoder.categories_

[array(['audi', 'bmw', 'cclass', 'focus', 'ford', 'hyundi', 'merc',
        'skoda', 'toyota', 'vauxhall', 'vw'], dtype=object)]

### pandas.get_dummies() vs encode()

In [21]:
# En este ejemplo rapido se puede ver el funcionamiento segun las matrices de valores y seleccionar la que mejor se adapte a los datos
test_maker = 'audi'

In [22]:
pd.get_dummies(test_maker)

Unnamed: 0,audi
0,1


In [23]:
make_encoder.transform([[test_maker]]).todense()



matrix([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

"Prefiero que el modelo falle a que realice una predicción incorrecta" - Antonio Feregrino

## <p style="background-color:#ffb703; color:black; font-family:newtimeroman; font-size:200%; text-align:left; border-radius: 10px 10px;"> 📐 Feature Scaling </p>

In [24]:
# Evitar que el modelo asigne mayor prioridad a una categoria que otra
# Comprime valores mas grandes para que todos se encuentren en un rango similar 

# En sklearn exiten herramientas que permiten realizar esto de manera eficiente, mostrando variantes para cada caso
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler

In [25]:
# ejemplo con minmax
scaler = MinMaxScaler()

In [26]:
scaler.fit(train[['mileage']])

MinMaxScaler()

In [27]:
scaled = scaler.transform(train[['mileage']])

In [28]:
scaled

array([[0.12511803],
       [0.2260038 ],
       [0.10798176],
       ...,
       [0.15296642],
       [0.0316162 ],
       [0.126375  ]])

In [29]:
# Demostracion de la transformacion
values = pd.DataFrame({'mileage': train['mileage'].values, 'scaled':scaled.squeeze()})

In [30]:
values.sample(5)

Unnamed: 0,mileage,scaled
15521,771,0.002384
41576,9135,0.028279
24616,9334,0.028895
44705,3000,0.009285
56973,4891,0.015139


In [31]:
values.describe()

Unnamed: 0,mileage,scaled
count,63759.0,63759.0
mean,23256.84948,0.072
std,21239.0074,0.065756
min,1.0,0.0
25%,7718.0,0.023892
50%,17515.0,0.054223
75%,32479.0,0.100551
max,323000.0,1.0


## Ejemplo con dos escaladores

In [32]:
escalador = MinMaxScaler()
escalador2 = RobustScaler()

escalador.fit(train[['mileage']])
escalador2.fit(train[['mileage']])

escalador = escalador.transform(train[['mileage']])
escalador2 = escalador2.transform(train[['mileage']])


In [33]:
valores = pd.DataFrame({'mileage': train['mileage'].values, 'MinMax':escalador.squeeze(), 'Robust':escalador2.squeeze()})
valores.sample(5)

Unnamed: 0,mileage,MinMax,Robust
7225,24386,0.075496,0.277493
35606,18549,0.057424,0.041759
21119,11330,0.035074,-0.249788
53810,30490,0.094393,0.52401
46554,11923,0.03691,-0.225839


## <p style="background-color:#ffb703; color:black; font-family:newtimeroman; font-size:200%; text-align:left; border-radius: 10px 10px;"> ⚙️ Artefactos </p>


Herramientas que nos ayudan a convertir una observacion real en una predicción.
Se recomienda ampliamente documentar los serializadores empleados al momento de general el modelo para tener puntos de referencia si algo falla al momentos de liberar en producción o al momento de replicar. 

In [34]:
# Libreria por excelencia para los serializadores
import pickle

with open('scaled.pickle', 'wb') as wb:
    pickle.dump(scaled, wb)
    
with open('make_encoder.pickle', 'wb') as wb:
    pickle.dump(make_encoder, wb)

In [41]:
# Se pueden cargar los serializadores en otro notebook / script para testear la ejecucion

import pickle

with open('make_encoder.pickle', 'rb') as rb:
    make_encoder = pickle.load(rb)

In [45]:
# test del escalador
make_encoder.categories_

[array(['audi', 'bmw', 'cclass', 'focus', 'ford', 'hyundi', 'merc',
        'skoda', 'toyota', 'vauxhall', 'vw'], dtype=object)]

## <p style="background-color:#e5e5e5; color:black; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 10px 10px;"> 📨 txt </p>

