# <p style="background: #F58A07; color:white; font-size:250%; text-align:Center; border-radius: 20px 100px;"> 💸 Precios de automoviles usados 💹 </p>


## <p style="background-color:#ffb703; color:black; font-family:newtimeroman; font-size:200%; text-align:left; border-radius: 10px 10px;"> 🔎 Análisis Exploratorio de Datos - EDA </p>

In [1]:
# Librerias iniciales
import pandas as pd 

# Carga del dataset
cars = pd.read_csv('data/CarsUK.csv')

# primeros 10 reguistros del dataset
cars.head()

Unnamed: 0,maker,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,price
0,cclass,C Class,2020,Automatic,1200,Diesel,,,2.0,30495
1,cclass,C Class,2020,Automatic,1000,Petrol,,,1.5,29989
2,cclass,C Class,2020,Automatic,500,Diesel,,,2.0,37899
3,cclass,C Class,2019,Automatic,5000,Diesel,,,2.0,30399
4,cclass,C Class,2019,Automatic,4500,Diesel,,,2.0,29899


In [2]:
from pandas_profiling import ProfileReport

profile = ProfileReport(cars, title='Análisis del conjunto de datos de automóviles en bruto', explorative=True)

In [3]:
# Primeros pasos en el dataset, brinda una extensa información sobre el dataset, como la cantidad de registros, la cantidad de columnas, etc.
# Tomando como punto de partida el reporte anterior, se puede decidir que hacer con los datos que se encuentran disponibles.

profile.to_file('reporte_cars.html')

Summarize dataset: 100%|██████████| 61/61 [00:13<00:00,  4.46it/s, Completed]                     
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.18s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.56s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 95.86it/s]


## <p style="background-color:#ffb703; color:black; font-family:newtimeroman; font-size:200%; text-align:left; border-radius: 10px 10px;"> 🔮 Feature engineering </p>

In [4]:
# En el reporte se destaca que existen datos duplicados en el dataset. 

# Mostramos la cantidad de registros
len(cars)

108540

In [5]:
# Mostramos la cantidad de registros duplicados
cars.duplicated().sum()

2273

In [6]:
# Eliminamos los registros duplicados
cars = cars.drop_duplicates(keep='first')

In [7]:
# Mostramos la cantidad de registros (actualizado)
len(cars)

106267

## <p style="background-color:#ffb703; color:black; font-family:newtimeroman; font-size:200%; text-align:left; border-radius: 10px 10px;"> 🪓 Dividir el dataset </p>

In [8]:
# Librerias para dividir mejor el dataset
from sklearn.model_selection import train_test_split

import numpy as np

In [9]:

rest, test = train_test_split(cars, test_size=0.2, shuffle=True) # 20% de 100 = 20

train, val = train_test_split(rest, test_size=0.25,shuffle=True ) # 25% de 80 = 20

distributions = np.array([len(train), len(val), len(test)])

print(distributions)
print(distributions/ len(cars))

[63759 21254 21254]
[0.59998871 0.20000565 0.20000565]


In [10]:
# ??cars.drop_duplicates

In [11]:
## Cambiamos las variables categoricas a numericas mediante dummies/
# Prueba en los valores maker == Fabricante 

# Valores originales
train[['maker']]

Unnamed: 0,maker
46489,ford
3921,focus
106720,merc
62995,vauxhall
71019,bmw
...,...
74749,bmw
84377,vw
105395,merc
44611,ford


In [12]:
# Valores dummies
pd.get_dummies(train[['maker']])

# Aqui se dividen los valores originales en valores dummies (1 y 0) pero este no guarda el estado y mostramos una alternativa

Unnamed: 0,maker_audi,maker_bmw,maker_cclass,maker_focus,maker_ford,maker_hyundi,maker_merc,maker_skoda,maker_toyota,maker_vauxhall,maker_vw
46489,0,0,0,0,1,0,0,0,0,0,0
3921,0,0,0,1,0,0,0,0,0,0,0
106720,0,0,0,0,0,0,1,0,0,0,0
62995,0,0,0,0,0,0,0,0,0,1,0
71019,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
74749,0,1,0,0,0,0,0,0,0,0,0
84377,0,0,0,0,0,0,0,0,0,0,1
105395,0,0,0,0,0,0,1,0,0,0,0
44611,0,0,0,0,1,0,0,0,0,0,0


## <p style="background-color:#ffb703; color:black; font-family:newtimeroman; font-size:200%; text-align:left; border-radius: 10px 10px;"> ✖️ One hot enconding </p>

In [13]:
# Libreria para cambiar las variables categoricas a numericas
from sklearn.preprocessing import OneHotEncoder

In [14]:
# Se crea un objeto encoder
make_encoder = OneHotEncoder()

In [15]:
# Se le pasa el dataset en la variable train
make_encoder.fit(train[['maker']])


OneHotEncoder()

In [16]:
# Matriz de valores al estilo dummies 
# mkr = make_encoder.transform(train[['maker']]) Original
mkr = make_encoder.transform(train[['maker']]).todense() # Se transforma a una matriz densa
mkr

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 1.]])

In [17]:
# Categorias dentro de la matriz del encoder 
make_encoder.categories_

[array(['audi', 'bmw', 'cclass', 'focus', 'ford', 'hyundi', 'merc',
        'skoda', 'toyota', 'vauxhall', 'vw'], dtype=object)]

### pandas.get_dummies() vs encode()

In [18]:
# En este ejemplo rapido se puede ver el funcionamiento segun las matrices de valores y seleccionar la que mejor se adapte a los datos
test_maker = 'audi'

In [19]:
pd.get_dummies(test_maker)

Unnamed: 0,audi
0,1


In [20]:
make_encoder.transform([[test_maker]]).todense()



matrix([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

"Prefiero que el modelo falle a que realice una predicción incorrecta" - Antonio Feregrino

## <p style="background-color:#ffb703; color:black; font-family:newtimeroman; font-size:200%; text-align:left; border-radius: 10px 10px;"> 📐 Feature Scaling </p>

In [21]:
# Evitar que el modelo asigne mayor prioridad a una categoria que otra
# Comprime valores mas grandes para que todos se encuentren en un rango similar 

# En sklearn exiten herramientas que permiten realizar esto de manera eficiente, mostrando variantes para cada caso
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler

In [22]:
# ejemplo con minmax
scaler = MinMaxScaler()

In [23]:
scaler.fit(train[['mileage']])

MinMaxScaler()

In [24]:
scaled = scaler.transform(train[['mileage']])

In [25]:
scaled

array([[0.14541531],
       [0.08642442],
       [0.10330373],
       ...,
       [0.18835972],
       [0.062768  ],
       [0.01857281]])

In [26]:
# Demostracion de la transformacion
values = pd.DataFrame({'mileage': train['mileage'].values, 'scaled':scaled.squeeze()})

In [27]:
values.sample(5)

Unnamed: 0,mileage,scaled
40878,3500,0.010833
36178,31815,0.098496
5941,47364,0.146635
33617,13202,0.04087
48047,1165,0.003604


In [28]:
values.describe()

Unnamed: 0,mileage,scaled
count,63759.0,63759.0
mean,23256.570539,0.071999
std,21236.775936,0.065749
min,1.0,0.0
25%,7764.5,0.024036
50%,17562.0,0.054369
75%,32448.0,0.100455
max,323000.0,1.0


## Ejemplo con dos escaladores

In [29]:
escalador = MinMaxScaler()
escalador2 = RobustScaler()

escalador.fit(train[['mileage']])
escalador2.fit(train[['mileage']])

escalador = escalador.transform(train[['mileage']])
escalador2 = escalador2.transform(train[['mileage']])


In [30]:
valores = pd.DataFrame({'mileage': train['mileage'].values, 'MinMax':escalador.squeeze(), 'Robust':escalador2.squeeze()})
valores.sample(5)

Unnamed: 0,mileage,MinMax,Robust
38509,31586,0.097787,0.568153
56510,52538,0.162654,1.416979
11934,4888,0.01513,-0.51346
48994,49552,0.153409,1.296007
61286,6770,0.020957,-0.437215


## <p style="background-color:#ffb703; color:black; font-family:newtimeroman; font-size:200%; text-align:left; border-radius: 10px 10px;"> ⚙️ Artefactos </p>


Herramientas que nos ayudan a convertir una observacion real en una predicción.
Se recomienda ampliamente documentar los serializadores empleados al momento de general el modelo para tener puntos de referencia si algo falla al momentos de liberar en producción o al momento de replicar. 

In [31]:
# Libreria por excelencia para los serializadores
import pickle

with open('scaled.pickle', 'wb') as wb:
    pickle.dump(scaled, wb)
    
with open('make_encoder.pickle', 'wb') as wb:
    pickle.dump(make_encoder, wb)

In [32]:
# Se pueden cargar los serializadores en otro notebook / script para testear la ejecucion

import pickle

with open('make_encoder.pickle', 'rb') as rb:
    make_encoder = pickle.load(rb)

In [33]:
# test del escalador
make_encoder.categories_

[array(['audi', 'bmw', 'cclass', 'focus', 'ford', 'hyundi', 'merc',
        'skoda', 'toyota', 'vauxhall', 'vw'], dtype=object)]

## <p style="background-color:#ffb703; color:black; font-family:newtimeroman; font-size:200%; text-align:left; border-radius: 10px 10px;"> 🔃 Pipelines </p>

Permite crear una secuencia de transformaciones de datos.
para poder replicar en produccion de manera sencilla y eficiente.

In [34]:
# Entre las librerias a utilizar se recomiendan las siguientes gracias a sklearn

from sklearn import set_config

from sklearn.pipeline import Pipeline

from sklearn.pipeline import FeatureUnion

from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import RobustScaler, MinMaxScaler


In [45]:
# onehotencode maker, transmission y fuelType

one_hot_encode = ColumnTransformer([(
    # Nombre de la transformacion
    #                 # La transformacion
    #                                               Coumnas que van a sufrir la transmision
    'maker-transmission-fuelType', OneHotEncoder(sparse=False), ['maker', 'transmission', 'fuelType']
)])

In [46]:
one_hot_encode.fit(train)

In [47]:
test = one_hot_encode.transform(train)
test.shape

(63759, 20)

In [38]:
test

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [48]:
# Robust encode mileage

robus_encoding = ColumnTransformer([(
    'mileage', RobustScaler(), ['mileage']
)])

In [49]:
# impute and standard scale mpg and tax
impute_and_scale = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())])

In [50]:
standard_scaling = ColumnTransformer([('mpg-tax', impute_and_scale, ['mpg', 'tax'])])

In [51]:
passthrough = ColumnTransformer([('pass', 'passthrough', ['year', 'engineSize'])])

In [52]:
# Ensamblando el Pipeline

pipe = Pipeline([(
    'features', FeatureUnion([
        ('one_hot_encode', one_hot_encode),
        ('robust_encoding', robus_encoding), 
        ('just_pass', passthrough),
        ('scale_and_impute', standard_scaling)
    ])
)])

In [53]:
from sklearn import set_config

set_config(display='diagram')
pipe

In [54]:
pipe.fit(train)

In [55]:
train_x = pipe.transform(train)
train_x.shape

(63759, 25)

## <p style="background-color:#ffb703; color:black; font-family:newtimeroman; font-size:200%; text-align:left; border-radius: 10px 10px;"> 🗳️ Modelado </p>


In [57]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

In [58]:
predic_pipeline = Pipeline([
    ('feature_engineering', pipe),
    ('calculadora_precios', lr)
])

In [59]:
set_config(display='diagram')
predic_pipeline

In [60]:
_ = predic_pipeline.fit(train, train['price'])

In [61]:
train_pred = predic_pipeline.predict(train)
val_pred = predic_pipeline.predict(val)

In [62]:
# Dataframe de salida para verificar los datos
pd.DataFrame({'real':val['price'], 'predicted':val_pred})

Unnamed: 0,real,predicted
84398,10046,11003.75
80950,10500,11730.50
86050,22495,17809.25
82734,16495,17657.00
97881,29782,29174.75
...,...,...
30187,17995,18800.75
11654,15790,15731.50
25592,10695,17122.50
58347,13000,12159.50


## <p style="background-color:#e5e5e5; color:black; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 10px 10px;"> 🚫 Evaluar el modelo </p>



In [63]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [68]:
train_mse = mean_absolute_error(train['price'], train_pred)
val_mse = mean_absolute_error(val['price'], val_pred)

print(f'''
      Entrenamiento MSE: {train_mse:2.02f}
      Validacion MSE: {val_mse:2.02f}''')


      Entrenamiento MSE: 2922.32
      Validacion MSE: 2950.35


## <p style="background-color:#ffb703; color:black; font-family:newtimeroman; font-size:200%; text-align:left; border-radius: 10px 10px;"> 💾 Guardar Pipeline </p>


In [69]:
from joblib import dump, load

dump(predic_pipeline, 'car-prices.model')

['car-prices.model']

## <p style="background-color:#ffb703; color:black; font-family:newtimeroman; font-size:200%; text-align:left; border-radius: 10px 10px;"> 🧪 Probar el modelo </p>

Se puede utilizar un nuevo notebook, por comodidad, se cargara en este mismo


In [70]:
# Para cargar el modelo que guardamos 
from joblib import load
# Para el ejercicio, crearemos un dataframe con pandas
import pandas as pd 

In [72]:
# Cargamos nuestro modelo en una variable
saved_pipeline = load('car-prices.model')

In [73]:
# Datos falsos para poder probar el modelo que se creo
maker='ford'
model='focus'
year=2020
transmission='Manual'
mileage=50
fuelType='Petrol'
tax=100
mpg=30
engineSize=1.5

carrito_prueba = pd.DataFrame({
    'maker':[maker], 'model':[model], 'year':[year], 'transmission':[transmission], 'mileage':[mileage], 'fuelType':[fuelType], 'tax':[tax], 'mpg':[mpg], 'engineSize':[engineSize]
})


In [74]:
carrito_prueba

Unnamed: 0,maker,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,ford,focus,2020,Manual,50,Petrol,100,30,1.5


In [76]:
price = saved_pipeline.predict(carrito_prueba).squeeze()

print(f'El precio estimado de tu auto es de $ {price:2.02f}')

El precio estimado de tu auto es de $ 22252.50
