In [41]:
import pandas as pd 
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error




In [42]:
# A pipeline is constructed in three steps:
# 1. Define the preprocessing steps
# 2. Define the model
# 3. Create and evaluate the pipeline

In [43]:
if __name__ == "__main__":
    # End-to-end run: read melb_data.csv, hold out 20% of the rows for
    # validation, impute/encode the selected columns, fit a random forest on
    # the remaining 80%, and print the validation mean absolute error.
    data = pd.read_csv("melb_data.csv")

    # The target is the Price column; every other column is a candidate feature.
    y = data["Price"]
    X = data.drop(columns=["Price"])

    # Fixed random_state so the split is the same on every run.
    X_train_full, X_valid_full, y_train, y_valid = train_test_split(
        X,
        y,
        train_size=0.8,
        test_size=0.2,
        random_state=0,
    )

    # Categorical features: object-dtype columns with fewer than 10 distinct
    # values in the training split ("cardinality" = number of unique values).
    # The cut-off is convenient but arbitrary.
    categorical_cols = [
        col
        for col in X_train_full.columns
        if X_train_full[col].dtype == "object"
        and X_train_full[col].nunique() < 10
    ]

    # Numerical features: columns stored as int64 or float64.
    numerical_cols = [
        col
        for col in X_train_full.columns
        if X_train_full[col].dtype in ("int64", "float64")
    ]

    # Keep only the selected columns (categorical first, then numerical);
    # copy so the working frames are independent of the full splits.
    my_cols = categorical_cols + numerical_cols
    X_train = X_train_full[my_cols].copy()
    X_valid = X_valid_full[my_cols].copy()

    # Numerical preprocessing: fill missing values with a constant
    # (the imputer's default fill value, since none is given here).
    neumerical_transformer = SimpleImputer(strategy="constant")

    # Categorical preprocessing: fill missing values with the most frequent
    # category, then one-hot encode. Categories not seen during fit are
    # ignored at transform time instead of raising.
    catagorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    # Route each group of columns to its own transformer.
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", neumerical_transformer, numerical_cols),
            ("cat", catagorical_transformer, categorical_cols),
        ]
    )

    # The estimator: a 100-tree random forest with a fixed seed.
    model = RandomForestRegressor(n_estimators=100, random_state=0)

    # Chain preprocessing and model so both are fitted and applied together.
    my_pipeline = Pipeline(
        steps=[
            ("preprocessing", preprocessor),
            ("model", model),
        ]
    )

    # Fit the preprocessing and the model on the training split only.
    my_pipeline.fit(X_train, y_train)

    # The validation split goes through the already-fitted preprocessing
    # before prediction.
    preds = my_pipeline.predict(X_valid)

    # Report the mean absolute error on the held-out rows.
    score = mean_absolute_error(y_valid, preds)
    print("MAE : ", score)

MAE :  160679.18917034855
