# Pipelines
Okay - we can do some ML, cool! But we are software engineers, not pesky data scientists. Let's make this readable, maintainable and reusable. Pipelines are how we do that.

In [2]:
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

# Load data
training_data_file_path = './data/4_housing_competition/train.csv'
home_data = pd.read_csv(training_data_file_path)

# Prediction target
y = home_data.SalePrice

# Candidate predictor columns
feature_names = [
    'MSSubClass',
    'LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'MiscVal',
    'MoSold',
    'YrSold',
]

X = home_data[feature_names]
# A fixed random_state keeps the train/validation split reproducible
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

# Define our pipeline
# Select categorical columns: text (object dtype) columns with fewer than
# 10 distinct values in the training data
categorical_cols = [
    cname for cname in X_train.columns
    if X_train[cname].nunique() < 10 and X_train[cname].dtype == "object"
]

# Select numerical columns
numerical_cols = [
    cname for cname in X_train.columns
    if X_train[cname].dtype in ['int64', 'float64']
]

# Preprocessing for numerical data
# No fill_value is given, so SimpleImputer falls back to its documented
# default for the 'constant' strategy (0 for numerical data)
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data: fill gaps with the literal 'missing',
# then encode each category as an integer. Categories not seen during fit
# are encoded as -1 instead of raising an error.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Every selected categorical column is safe to encode, because the encoder
# above already copes with categories that only appear at predict time.
# Column selection deliberately does NOT look at X_valid: choosing features
# based on validation data leaks information into the model design and
# cannot be repeated on genuinely unseen data.
# The name good_label_cols is kept so code that refers to it still works.
good_label_cols = list(categorical_cols)

# Filter ALL datasets to only include good categorical columns + numerical columns
columns_to_keep = numerical_cols + good_label_cols

# Whenever we filter the training data, we must also filter the validation data (and the test data if we had some)
X_train_filtered = X_train[columns_to_keep].copy()
X_valid_filtered = X_valid[columns_to_keep].copy()

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, good_label_cols),
    ])

# Define model: 100 trees, seeded so repeated runs give the same forest
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Chain preprocessing and model so both are fitted and applied together
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# Preprocessing of training data, fit model
# (uses the filtered frame, so the columns the model sees are exactly
# columns_to_keep)
my_pipeline.fit(X_train_filtered, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid_filtered)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

MAE: 17655.86185388128
