# PIPELINES

In [9]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error


In [2]:
# Read the dataset from the CSV file in the working directory.
data = pd.read_csv('./melb_data.csv')

# Predictors: every column except 'Price'. Target: the 'Price' column.
X = data.drop(columns=['Price'])
y = data['Price']

In [10]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible between runs.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, test_size=0.20, random_state=0
)

# Object-dtype columns with fewer than 10 distinct values in the training
# split (few enough categories to one-hot encode).
low_cardinality = [
    col
    for col, col_dtype in X_train_full.dtypes.items()
    if col_dtype == 'object' and X_train_full[col].nunique() < 10
]

# Columns stored as int64 or float64.
numericals = [
    col
    for col, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

# Keep only the selected columns, categorical ones first, in both splits.
mis_columnas = low_cardinality + numericals
X_train = X_train_full[mis_columnas]
X_valid = X_valid_full[mis_columnas]

## STEP 1: Preprocessing

We use the `ColumnTransformer` class to bundle together different preprocessing steps. The code below:

- Imputes missing values in numerical data, and
- Imputes missing values and applies a one-hot encoding to categorical data.

In [6]:
# Numerical columns: replace missing values with a constant.
# NOTE: strategy='constant' does NOT use the column mean (that would be
# strategy='mean'). With no fill_value given, SimpleImputer fills numeric data
# with 0; the value is spelled out here so the behaviour is explicit.
numerical_imputer = SimpleImputer(strategy='constant', fill_value=0)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' makes categories that were not
# seen during fit encode as all zeros instead of raising an error.
categorical = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Bundle both branches: each transformer is applied only to its own columns.
preprocesor = ColumnTransformer(transformers=[
    ('num', numerical_imputer, numericals),
    ('cat', categorical, low_cardinality),
])

## STEP 2: Define the model

In [8]:
# Random forest with 100 trees; the fixed random_state makes training
# reproducible between runs.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
)

## STEP 3: Create and evaluate the pipeline

In [11]:
# Chain preprocessing and the model so that a single fit/predict call runs both.
my_pipeline = Pipeline(
    steps=[
        ('preprocesor', preprocesor),
        ('model', model),
    ]
)

# Training: fits the preprocessing steps and the model on the training split.
my_pipeline.fit(X_train, y_train)

# Validation: the fitted preprocessing is applied before predicting.
y_pred = my_pipeline.predict(X_valid)

# Show (mean absolute error, mean absolute percentage error) as the cell output.
mae = mean_absolute_error(y_valid, y_pred)
mape = mean_absolute_percentage_error(y_valid, y_pred)
mae, mape

(160679.18917034855, 0.14862435619286685)