In [8]:
import psycopg2
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import KNNImputer
from joblib import parallel_backend
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from hummingbird.ml import convert, load

In [9]:
# Cargar datos Ventas
ventas_dataframe = pd.read_csv('ventas_dataframe.csv')
ventas_dataframe

  ventas_dataframe = pd.read_csv('ventas_dataframe.csv')


Unnamed: 0,price_amount,makeid,modelid,versionid,manufacturerprice,km,vehicleyear,etiqueta_type_id,provinceid,horsepower,...,tankcapacityinliters,trunkcapacityinliters,consumptionurban,consumptionmixed,consumptionextraurban,co2emissionsgramsperkm,batteryvoltage,batterykwh,chargingtime,chargingtimefast
0,9599,77,314,4411,21625,152720.0,2018,1,41,110,...,48,370,4.2,3.7,3.3,97,-,-,-,-
1,9990,22,438,17597,21920,141485.0,2015,1,46,99,...,60,408,4.4,3.6,3.2,95,-,-,-,-
2,9990,22,438,17597,21920,141485.0,2015,1,30,99,...,60,408,4.4,3.6,3.2,95,-,-,-,-
3,9900,84,624,23187,14900,129000.0,2016,5,28,75,...,45,300,3.5,3.3,3.1,85,-,-,-,-
4,8460,90,1203,86,15060,117000.0,2018,1,46,75,...,40,355,6,4.9,4.3,112,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165749,11700,93,458,24115,-,105000.0,2003,5,3,-,...,-,-,-,-,-,-,-,-,-,-
165750,21300,10,1850,20084,30954,138600.0,2017,1,28,150,...,50,355,6.8,5.7,5,130,-,-,-,-
165751,1050000,35,141,26033,296454,9300.0,2015,5,29,605,...,86,230,19.7,13.3,9.7,307,-,-,-,-
165752,3800,90,653,7327,-,169000.0,2003,5,45,-,...,-,-,-,-,-,-,-,-,-,-


In [4]:
ventas_dataframe.head()

Unnamed: 0,price_amount,makeid,manufacturerprice,km,vehicleyear,etiqueta_type_id,provinceid,horsepower,maxspeed,acceleration,...,tankcapacityinliters,trunkcapacityinliters,consumptionurban,consumptionmixed,consumptionextraurban,co2emissionsgramsperkm,batteryvoltage,batterykwh,chargingtime,chargingtimefast
0,9599,77,21625,152720.0,2018,1,41,110,195,11.0,...,48,370,4.2,3.7,3.3,97,-,-,-,-
1,9990,22,21920,141485.0,2015,1,46,99,180,11.5,...,60,408,4.4,3.6,3.2,95,-,-,-,-
2,9990,22,21920,141485.0,2015,1,30,99,180,11.5,...,60,408,4.4,3.6,3.2,95,-,-,-,-
3,9900,84,14900,129000.0,2016,5,28,75,168,14.3,...,45,300,3.5,3.3,3.1,85,-,-,-,-
4,8460,90,15060,117000.0,2018,1,46,75,167,14.7,...,40,355,6.0,4.9,4.3,112,-,-,-,-


In [10]:
ventas_dataframe['manufacturerprice'] = pd.to_numeric(ventas_dataframe['manufacturerprice'], errors='coerce')
ventas_dataframe['horsepower'] = pd.to_numeric(ventas_dataframe['horsepower'], errors='coerce')
ventas_dataframe['maxspeed'] = pd.to_numeric(ventas_dataframe['maxspeed'], errors='coerce')
ventas_dataframe['acceleration'] = pd.to_numeric(ventas_dataframe['acceleration'], errors='coerce')
ventas_dataframe['doors'] = pd.to_numeric(ventas_dataframe['doors'], errors='coerce')
ventas_dataframe['seatingcapacity'] = pd.to_numeric(ventas_dataframe['seatingcapacity'], errors='coerce')
ventas_dataframe['dimensionsinmillimeterswidth'] = pd.to_numeric(ventas_dataframe['dimensionsinmillimeterswidth'], errors='coerce')
ventas_dataframe['dimensionsinmillimetersheight'] = pd.to_numeric(ventas_dataframe['dimensionsinmillimetersheight'], errors='coerce')
ventas_dataframe['dimensionsinmillimeterslength'] = pd.to_numeric(ventas_dataframe['dimensionsinmillimeterslength'], errors='coerce')
ventas_dataframe['weight'] = pd.to_numeric(ventas_dataframe['weight'], errors='coerce')
ventas_dataframe['tankcapacityinliters'] = pd.to_numeric(ventas_dataframe['tankcapacityinliters'], errors='coerce')
ventas_dataframe['trunkcapacityinliters'] = pd.to_numeric(ventas_dataframe['trunkcapacityinliters'], errors='coerce')
ventas_dataframe['consumptionurban'] = pd.to_numeric(ventas_dataframe['consumptionurban'], errors='coerce')
ventas_dataframe['consumptionmixed'] = pd.to_numeric(ventas_dataframe['consumptionmixed'], errors='coerce')
ventas_dataframe['consumptionextraurban'] = pd.to_numeric(ventas_dataframe['consumptionextraurban'], errors='coerce')
ventas_dataframe['co2emissionsgramsperkm'] = pd.to_numeric(ventas_dataframe['co2emissionsgramsperkm'], errors='coerce')
ventas_dataframe['batteryvoltage'] = pd.to_numeric(ventas_dataframe['batteryvoltage'], errors='coerce')
ventas_dataframe['batterykwh'] = pd.to_numeric(ventas_dataframe['batterykwh'], errors='coerce')
ventas_dataframe['chargingtime'] = pd.to_numeric(ventas_dataframe['chargingtime'], errors='coerce')
ventas_dataframe['chargingtimefast'] = pd.to_numeric(ventas_dataframe['chargingtimefast'], errors='coerce')

ventas_dataframe['makeid'] = ventas_dataframe['makeid'].astype('category')
ventas_dataframe['modelid'] = ventas_dataframe['modelid'].astype('category')
ventas_dataframe['versionid'] = ventas_dataframe['versionid'].astype('category')
ventas_dataframe['etiqueta_type_id'] = ventas_dataframe['etiqueta_type_id'].astype('category')
ventas_dataframe['provinceid'] = ventas_dataframe['provinceid'].astype('category')
ventas_dataframe['transmision_type_id'] = ventas_dataframe['transmision_type_id'].astype('category')
ventas_dataframe['body_type_id'] = ventas_dataframe['body_type_id'].astype('category')
ventas_dataframe['combustible_type_id'] = ventas_dataframe['combustible_type_id'].astype('category')
ventas_dataframe['colores_type_id'] = ventas_dataframe['colores_type_id'].astype('category')

In [11]:
# Coches con combustible fosil.
ventas_oil = ventas_dataframe[ventas_dataframe['combustible_type_id'].isin([1, 2, 4, 6, 7])]
ventas_oil = ventas_oil.drop(columns=['batteryvoltage', 'batterykwh', 'chargingtimefast', 'chargingtime'])
ventas_oil


Unnamed: 0,price_amount,makeid,modelid,versionid,manufacturerprice,km,vehicleyear,etiqueta_type_id,provinceid,horsepower,...,dimensionsinmillimeterswidth,dimensionsinmillimetersheight,dimensionsinmillimeterslength,weight,tankcapacityinliters,trunkcapacityinliters,consumptionurban,consumptionmixed,consumptionextraurban,co2emissionsgramsperkm
0,9599,77,314,4411,21625.0,152720.0,2018,1,41,110.0,...,1809.0,1485.0,4370.0,1360.0,48.0,370.0,4.2,3.7,3.3,97.0
1,9990,22,438,17597,21920.0,141485.0,2015,1,46,99.0,...,1789.0,1502.0,4329.0,1275.0,60.0,408.0,4.4,3.6,3.2,95.0
2,9990,22,438,17597,21920.0,141485.0,2015,1,30,99.0,...,1789.0,1502.0,4329.0,1275.0,60.0,408.0,4.4,3.6,3.2,95.0
3,9900,84,624,23187,14900.0,129000.0,2016,5,28,75.0,...,1732.0,1448.0,4062.0,1090.0,45.0,300.0,3.5,3.3,3.1,85.0
4,8460,90,1203,86,15060.0,117000.0,2018,1,46,75.0,...,1780.0,1444.0,4059.0,1091.0,40.0,355.0,6.0,4.9,4.3,112.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165749,11700,93,458,24115,,105000.0,2003,5,3,,...,,,,,,,,,,
165750,21300,10,1850,20084,30954.0,138600.0,2017,1,28,150.0,...,1794.0,1508.0,4191.0,1340.0,50.0,355.0,6.8,5.7,5.0,130.0
165751,1050000,35,141,26033,296454.0,9300.0,2015,5,29,605.0,...,1951.0,1204.0,4571.0,1445.0,86.0,230.0,19.7,13.3,9.7,307.0
165752,3800,90,653,7327,,169000.0,2003,5,45,,...,,,,,,,,,,


In [12]:
print(ventas_oil.dtypes)


price_amount                        int64
makeid                           category
modelid                          category
versionid                        category
manufacturerprice                 float64
km                                float64
vehicleyear                         int64
etiqueta_type_id                 category
provinceid                       category
horsepower                        float64
maxspeed                          float64
acceleration                      float64
combustible_type_id              category
body_type_id                     category
transmision_type_id              category
doors                             float64
seatingcapacity                   float64
colores_type_id                  category
dimensionsinmillimeterswidth      float64
dimensionsinmillimetersheight     float64
dimensionsinmillimeterslength     float64
weight                            float64
tankcapacityinliters              float64
trunkcapacityinliters             

In [13]:
imputer = KNNImputer(n_neighbors=5)
with parallel_backend('threading', n_jobs=4):
    ventas_oil_imputed = pd.DataFrame(imputer.fit_transform(ventas_oil), columns=ventas_oil.columns)
 

In [17]:
ventas_oil_imputed

Unnamed: 0,price_amount,makeid,modelid,versionid,manufacturerprice,km,vehicleyear,etiqueta_type_id,provinceid,horsepower,...,dimensionsinmillimeterswidth,dimensionsinmillimetersheight,dimensionsinmillimeterslength,weight,tankcapacityinliters,trunkcapacityinliters,consumptionurban,consumptionmixed,consumptionextraurban,co2emissionsgramsperkm
0,9599.0,77.0,314.0,4411.0,21625.0,152720.0,2018.0,1.0,41.0,110.0,...,1809.0,1485.0,4370.0,1360.0,48.0,370.0,4.20,3.70,3.30,97.0
1,9990.0,22.0,438.0,17597.0,21920.0,141485.0,2015.0,1.0,46.0,99.0,...,1789.0,1502.0,4329.0,1275.0,60.0,408.0,4.40,3.60,3.20,95.0
2,9990.0,22.0,438.0,17597.0,21920.0,141485.0,2015.0,1.0,30.0,99.0,...,1789.0,1502.0,4329.0,1275.0,60.0,408.0,4.40,3.60,3.20,95.0
3,9900.0,84.0,624.0,23187.0,14900.0,129000.0,2016.0,5.0,28.0,75.0,...,1732.0,1448.0,4062.0,1090.0,45.0,300.0,3.50,3.30,3.10,85.0
4,8460.0,90.0,1203.0,86.0,15060.0,117000.0,2018.0,1.0,46.0,75.0,...,1780.0,1444.0,4059.0,1091.0,40.0,355.0,6.00,4.90,4.30,112.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155921,11700.0,93.0,458.0,24115.0,19820.2,105000.0,2003.0,5.0,3.0,90.0,...,1732.2,1614.2,4103.0,1131.8,46.2,367.6,5.76,4.64,3.98,112.2
155922,21300.0,10.0,1850.0,20084.0,30954.0,138600.0,2017.0,1.0,28.0,150.0,...,1794.0,1508.0,4191.0,1340.0,50.0,355.0,6.80,5.70,5.00,130.0
155923,1050000.0,35.0,141.0,26033.0,296454.0,9300.0,2015.0,5.0,29.0,605.0,...,1951.0,1204.0,4571.0,1445.0,86.0,230.0,19.70,13.30,9.70,307.0
155924,3800.0,90.0,653.0,7327.0,19251.0,169000.0,2003.0,5.0,45.0,106.8,...,1745.0,1514.2,4257.6,1353.4,55.6,346.2,8.46,6.34,5.14,171.0


In [14]:
# Combustible electrico
ventas_elec = ventas_dataframe[ventas_dataframe['combustible_type_id'].isin([3,5])]
ventas_elec

Unnamed: 0,price_amount,makeid,modelid,versionid,manufacturerprice,km,vehicleyear,etiqueta_type_id,provinceid,horsepower,...,tankcapacityinliters,trunkcapacityinliters,consumptionurban,consumptionmixed,consumptionextraurban,co2emissionsgramsperkm,batteryvoltage,batterykwh,chargingtime,chargingtimefast
1318,32900,53,2409,5088,37325.0,12.0,2020,4,24,141.0,...,37.0,291.0,,,,31.0,36.0,8.9,,
4208,17500,105,1099,20589,38012.0,78500.0,2017,4,28,136.0,...,,341.0,,,,,323.0,35.8,1020.0,45.0
4329,19990,105,1099,20589,38012.0,43990.0,2017,4,28,136.0,...,,341.0,,,,,323.0,35.8,1020.0,45.0
4478,16990,105,1099,20589,38012.0,116000.0,2017,4,28,136.0,...,,341.0,,,,,323.0,35.8,1020.0,45.0
4512,18990,105,1099,20589,38012.0,139000.0,2017,4,28,136.0,...,,341.0,,,,,323.0,35.8,1020.0,45.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165502,60000,78,89,27590,35600.0,90000.0,2021,4,8,180.0,...,40.0,548.0,,,,,,12.4,420.0,
165623,48650,102,591,8171,52150.0,,2023,3,28,223.0,...,43.0,358.0,,,,,216.0,13.8,,
165631,48900,101,1619,28267,49180.0,8900.0,2023,4,8,347.0,...,,,,,,,,,,
165655,25200,105,1099,22222,40250.0,96000.0,2017,4,3,204.0,...,40.0,272.0,,,,38.0,352.0,8.7,210.0,150.0


In [30]:
print(ventas_elec.dtypes)

price_amount                        int64
makeid                           category
manufacturerprice                 float64
km                                float64
vehicleyear                         int64
etiqueta_type_id                 category
provinceid                       category
horsepower                        float64
maxspeed                          float64
acceleration                      float64
combustible_type_id              category
body_type_id                     category
transmision_type_id              category
doors                             float64
seatingcapacity                   float64
colores_type_id                  category
dimensionsinmillimeterswidth      float64
dimensionsinmillimetersheight     float64
dimensionsinmillimeterslength     float64
weight                            float64
tankcapacityinliters              float64
trunkcapacityinliters             float64
consumptionurban                  float64
consumptionmixed                  

In [15]:
with parallel_backend('threading', n_jobs=4):
    ventas_elec_imputed = pd.DataFrame(imputer.fit_transform(ventas_elec), columns=ventas_elec.columns)

In [33]:
ventas_elec_imputed

Unnamed: 0,price_amount,makeid,manufacturerprice,km,vehicleyear,etiqueta_type_id,provinceid,horsepower,maxspeed,acceleration,...,tankcapacityinliters,trunkcapacityinliters,consumptionurban,consumptionmixed,consumptionextraurban,co2emissionsgramsperkm,batteryvoltage,batterykwh,chargingtime,chargingtimefast
0,32900.0,53.0,37325.0,12.0,2020.0,4.0,24.0,141.0,160.0,11.0,...,37.0,291.0,3.28,1.58,3.40,31.0,36.0,8.90,1536.0,29.4
1,17500.0,105.0,38012.0,78500.0,2017.0,4.0,28.0,136.0,150.0,9.6,...,36.8,341.0,2.46,1.38,2.72,49.0,323.0,35.80,1020.0,45.0
2,19990.0,105.0,38012.0,43990.0,2017.0,4.0,28.0,136.0,150.0,9.6,...,38.0,341.0,2.54,1.42,2.98,39.0,323.0,35.80,1020.0,45.0
3,16990.0,105.0,38012.0,116000.0,2017.0,4.0,28.0,136.0,150.0,9.6,...,37.4,341.0,2.42,1.44,2.46,45.0,323.0,35.80,1020.0,45.0
4,18990.0,105.0,38012.0,139000.0,2017.0,4.0,28.0,136.0,150.0,9.6,...,41.8,341.0,2.80,1.08,3.14,30.2,323.0,35.80,1020.0,45.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9760,60000.0,78.0,35600.0,90000.0,2021.0,4.0,8.0,180.0,225.0,7.7,...,40.0,548.0,3.08,1.36,2.98,38.0,139.6,12.40,420.0,38.0
9761,48650.0,102.0,52150.0,22034.4,2023.0,3.0,28.0,223.0,180.0,7.2,...,43.0,358.0,3.40,1.24,3.40,35.2,216.0,13.80,2202.0,31.6
9762,48900.0,101.0,49180.0,8900.0,2023.0,4.0,8.0,347.0,217.0,6.9,...,44.4,465.8,3.40,1.28,3.40,38.6,319.0,67.62,2064.6,26.6
9763,25200.0,105.0,40250.0,96000.0,2017.0,4.0,3.0,204.0,222.0,7.6,...,40.0,272.0,2.46,1.80,2.72,38.0,352.0,8.70,210.0,150.0


In [16]:
ventas_oil_imputed.to_csv('ventas_oil_imputed.csv', index=False)
ventas_elec_imputed.to_csv('ventas_elec_imputed.csv', index=False)