In [1]:
import psycopg2
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import KNNImputer
from joblib import parallel_backend
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from hummingbird.ml import convert, load

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the two sales tables from CSV files located in the working directory.
# NOTE(review): the "_imputed" suffix suggests missing values were already
# filled in an upstream step -- confirm against the notebook that wrote them.
ventas_oil = pd.read_csv(filepath_or_buffer='ventas_oil_imputed.csv')
ventas_elec = pd.read_csv(filepath_or_buffer='ventas_elec_imputed.csv')

In [3]:
# Inspect the column dtypes of ventas_oil as read from CSV, before any casting.
ventas_oil.dtypes


price_amount                     float64
makeid                           float64
manufacturerprice                float64
km                               float64
vehicleyear                      float64
etiqueta_type_id                 float64
provinceid                       float64
horsepower                       float64
maxspeed                         float64
acceleration                     float64
combustible_type_id              float64
body_type_id                     float64
transmision_type_id              float64
doors                            float64
seatingcapacity                  float64
colores_type_id                  float64
dimensionsinmillimeterswidth     float64
dimensionsinmillimetersheight    float64
dimensionsinmillimeterslength    float64
weight                           float64
tankcapacityinliters             float64
trunkcapacityinliters            float64
consumptionurban                 float64
consumptionmixed                 float64
consumptionextra

In [9]:
# The ID-coded columns are read from CSV as float64; cast each of them to the
# pandas 'category' dtype so they are treated as labels, not magnitudes.
for id_col in (
    'makeid',
    'etiqueta_type_id',
    'provinceid',
    'transmision_type_id',
    'body_type_id',
    'combustible_type_id',
    'colores_type_id',
):
    ventas_oil[id_col] = ventas_oil[id_col].astype('category')

# Add the base-10 logarithm of the price as a new column; it is used as the
# regression target when the data is split into X / y.
ventas_oil['lPrice'] = np.log10(ventas_oil['price_amount'])

In [10]:
# Re-check ventas_oil dtypes after casting the ID columns to 'category'.
ventas_oil.dtypes

price_amount                      float64
makeid                           category
manufacturerprice                 float64
km                                float64
vehicleyear                       float64
etiqueta_type_id                 category
provinceid                       category
horsepower                        float64
maxspeed                          float64
acceleration                      float64
combustible_type_id              category
body_type_id                     category
transmision_type_id              category
doors                             float64
seatingcapacity                   float64
colores_type_id                  category
dimensionsinmillimeterswidth      float64
dimensionsinmillimetersheight     float64
dimensionsinmillimeterslength     float64
weight                            float64
tankcapacityinliters              float64
trunkcapacityinliters             float64
consumptionurban                  float64
consumptionmixed                  

In [6]:
# Inspect the column dtypes of ventas_elec as read from CSV, before any casting.
ventas_elec.dtypes

price_amount                     float64
makeid                           float64
manufacturerprice                float64
km                               float64
vehicleyear                      float64
etiqueta_type_id                 float64
provinceid                       float64
horsepower                       float64
maxspeed                         float64
acceleration                     float64
combustible_type_id              float64
body_type_id                     float64
transmision_type_id              float64
doors                            float64
seatingcapacity                  float64
colores_type_id                  float64
dimensionsinmillimeterswidth     float64
dimensionsinmillimetersheight    float64
dimensionsinmillimeterslength    float64
weight                           float64
tankcapacityinliters             float64
trunkcapacityinliters            float64
consumptionurban                 float64
consumptionmixed                 float64
consumptionextra

In [11]:
# The ID-coded columns are read from CSV as float64; cast each of them to the
# pandas 'category' dtype so they are treated as labels, not magnitudes.
for id_col in (
    'makeid',
    'etiqueta_type_id',
    'provinceid',
    'transmision_type_id',
    'body_type_id',
    'combustible_type_id',
    'colores_type_id',
):
    ventas_elec[id_col] = ventas_elec[id_col].astype('category')

# Add the base-10 logarithm of the price as a new column; it is used as the
# regression target when the data is split into X / y.
ventas_elec['lPrice'] = np.log10(ventas_elec['price_amount'])

In [12]:
# Re-check ventas_elec dtypes after casting the ID columns to 'category'.
ventas_elec.dtypes

price_amount                      float64
makeid                           category
manufacturerprice                 float64
km                                float64
vehicleyear                       float64
etiqueta_type_id                 category
provinceid                       category
horsepower                        float64
maxspeed                          float64
acceleration                      float64
combustible_type_id              category
body_type_id                     category
transmision_type_id              category
doors                             float64
seatingcapacity                   float64
colores_type_id                  category
dimensionsinmillimeterswidth      float64
dimensionsinmillimetersheight     float64
dimensionsinmillimeterslength     float64
weight                            float64
tankcapacityinliters              float64
trunkcapacityinliters             float64
consumptionurban                  float64
consumptionmixed                  

In [15]:
# Split each dataset into training and test sets.
# The target is the log10 price; both price columns are removed from the
# features so the target cannot leak into X. Half of the rows are held out
# (test_size=0.5) and the same seed is used for both datasets.
ventas_oil_y = ventas_oil['lPrice']
ventas_oil_X = ventas_oil.drop(columns=['price_amount', 'lPrice'])
(
    ventas_oil_X_train,
    ventas_oil_X_test,
    ventas_oil_y_train,
    ventas_oil_y_test,
) = train_test_split(
    ventas_oil_X,
    ventas_oil_y,
    test_size=0.5,
    random_state=123,
)

ventas_elec_y = ventas_elec['lPrice']
ventas_elec_X = ventas_elec.drop(columns=['price_amount', 'lPrice'])
(
    ventas_elec_X_train,
    ventas_elec_X_test,
    ventas_elec_y_train,
    ventas_elec_y_test,
) = train_test_split(
    ventas_elec_X,
    ventas_elec_y,
    test_size=0.5,
    random_state=123,
)


In [16]:
# Create the cross-validation splitters for the trial, tuning and assessment
# stages. Each one is a single repetition of 5-fold CV with its own fixed
# seed, so the stages get different but reproducible fold assignments.
rk_trial = RepeatedKFold(n_splits=5, n_repeats=1, random_state=123)
rk_tuning = RepeatedKFold(n_splits=5, n_repeats=1, random_state=456)
rk_assessment = RepeatedKFold(n_splits=5, n_repeats=1, random_state=789)

# Preview the folds generated on the oil training set.
# This loop previously iterated over `kf`, which is not defined in any visible
# cell and therefore raises NameError on a fresh kernel (Restart & Run All).
# It now uses the trial splitter created above.
for train_index, test_index in rk_trial.split(ventas_oil_X_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    # split() yields positional row indices, hence .iloc rather than .loc.
    # These four names are rebound on every iteration, so after the loop they
    # hold only the subsets of the last fold.
    X_train, X_test = ventas_oil_X_train.iloc[train_index], ventas_oil_X_train.iloc[test_index]
    y_train, y_test = ventas_oil_y_train.iloc[train_index], ventas_oil_y_train.iloc[test_index]



TRAIN: [    0     1     3 ... 77958 77960 77961] TEST: [    2     7     9 ... 77948 77959 77962]
TRAIN: [    0     2     3 ... 77960 77961 77962] TEST: [    1     5    12 ... 77953 77955 77958]
TRAIN: [    0     1     2 ... 77960 77961 77962] TEST: [    8    19    21 ... 77947 77950 77956]
TRAIN: [    0     1     2 ... 77959 77960 77962] TEST: [    3    14    16 ... 77941 77954 77961]
TRAIN: [    1     2     3 ... 77959 77961 77962] TEST: [    0     4     6 ... 77952 77957 77960]
