In [None]:
# The benefit of using a pipeline is that we don't have to spend a lot of time preprocessing the data by hand
# (how much depends on the type of data we are dealing with). We can simply merge different preprocessors
# with one line of code.

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [None]:
# Read the source CSV (melb_data.csv) into a DataFrame.
# NOTE(review): the path is absolute and looks like a Colab location -- adjust it when running elsewhere.
data = pd.read_csv(filepath_or_buffer="/content/melb_data.csv")

In [None]:
# Separate the prediction target ('Price') from the feature columns.
y = data['Price']
X = data.drop(columns=['Price'])

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
X_train_full, X_valid_full, y_train_full, y_valid_full = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Categorical features: object-typed columns with fewer than 10 distinct values in the
# training split, so one-hot encoding them stays small.
categorical_cols = [
    name
    for name, column in X_train_full.items()
    if column.dtype == 'object' and column.nunique() < 10
]
categorical_cols

['Type', 'Method', 'Regionname']

In [None]:
# Numerical features: every training column stored as int64 or float64.
numerical_cols = [
    name
    for name, dtype in X_train_full.dtypes.items()
    if dtype in ('int64', 'float64')
]
numerical_cols

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'Lattitude',
 'Longtitude',
 'Propertycount']

In [None]:
# Keep only the selected numerical and categorical columns, in that order.
# .copy() gives independent frames so later changes do not touch the full splits.
my_cols = [*numerical_cols, *categorical_cols]
X_train = X_train_full.loc[:, my_cols].copy()
X_valid = X_valid_full.loc[:, my_cols].copy()

In [None]:
# Preprocess numerical data: replace missing values with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocess categorical data: replace missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' makes categories that were not seen
# during fit encode as all zeros instead of raising at predict time.
#
# The keyword that requests a dense array was renamed from `sparse` to `sparse_output`
# in scikit-learn 1.2, and the old name was removed in 1.4. Try the current name first
# and fall back to the legacy one so the cell runs on both old and new releases.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

Categorical_transformer = Pipeline(steps=[('Imputer', SimpleImputer(strategy='most_frequent')),
                                          ('OHE', one_hot_encoder)])

# Bundle the two transformers with ColumnTransformer so each one is applied only to
# its own list of columns.
preprocessor = ColumnTransformer(transformers=[('num_trans', numerical_transformer, numerical_cols),
                                               ('cat_trans', Categorical_transformer, categorical_cols)])

In [None]:
# Random forest regressor with 100 trees; random_state is fixed so repeated runs
# build the same forest.
model = RandomForestRegressor(random_state=0, n_estimators=100)

In [None]:
# Final pipeline: preprocessing followed by the model, so both are fitted and
# applied together through a single object.
mypipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

In [None]:
# Fit preprocessing + model on the training split, then predict on the validation
# features. fit() returns the fitted pipeline, so predict() can be chained onto it.
pred = mypipeline.fit(X_train, y_train_full).predict(X_valid)


In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# Mean absolute error of the validation predictions against the held-out targets.
error = mean_absolute_error(y_true=y_valid_full, y_pred=pred)
error

156298.1488635248