## classic initialization

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

def score_dataset(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)

data = pd.read_csv('../Intro/input/melb_data.csv')

features = ['Type', 'Method', 'Regionname', 'Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude', 'Propertycount']
X = data[features]
y = data.Price

X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)
numerical_cols = (X_train.dtypes == 'float64')
categorical_cols = (X_train.dtypes == 'object')

numerical_cols = list(numerical_cols[numerical_cols].index)
categorical_cols = list(categorical_cols[categorical_cols].index)

X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
664,h,S,Southern Metropolitan,3,9.2,3104.0,3.0,2.0,368.0,-37.7846,145.0935,7809.0
3270,h,S,Eastern Metropolitan,2,10.5,3081.0,2.0,1.0,586.0,-37.7435,145.0486,2947.0
3873,h,S,Southern Metropolitan,2,11.2,3145.0,2.0,1.0,348.0,-37.8672,145.0432,8801.0
13170,h,S,Northern Metropolitan,3,19.6,3076.0,3.0,1.0,521.0,-37.63854,145.05179,10926.0
1730,h,S,Southern Metropolitan,4,11.4,3163.0,3.0,2.0,687.0,-37.8931,145.0479,7822.0


# 1 - Define Preprocessing Steps

we use the `ColumnTransformer` class to bundle together different preprocessing steps. The code below:

* imputes missing values in numerical data
* imputes missing values and applies a one-hot encoding to categorical data.

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# ? Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# ? Group diffenrent preprocessing treatments together
categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

# ? call different preprocessing treatments on different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# 2 - Define the Model

In [9]:
model = RandomForestRegressor(n_estimators=100, random_state=0)


# 3 - Create and Evaluate the Pipeline

We use `Pipeline` class to bundles the preprocessing and modeling steps. This will make our workflow much easier.


In [11]:
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

my_pipeline.fit(X_train, y_train)

preds = my_pipeline.predict(X_valid)

print('MAE:', mean_absolute_error(y_valid, preds))

MAE: 168099.02716824462
