# Data Engineering

In [1]:
# install category-encoders if not installed
# !pip install category-encoders
import os
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

## Import data and split feature and label

In [2]:
# Read the training CSV, then discard every column that holds a single
# distinct value (such columns carry no information for the model).
df = pd.read_csv('../data/house-prices-advanced-regression-techniques/train.csv')
col_drop = df.columns[df.nunique() == 1]
df = df.drop(columns=col_drop)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Separate the features from the labels

In [3]:
# The label stays a single-column DataFrame; the features are everything else.
ytrain = df.loc[:, ['SalePrice']]
Xtrain = df.drop(columns='SalePrice')
Xtrain.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


#### Separate the train, validation, and test sets

In [6]:
from sklearn.model_selection import train_test_split

# Keep 80% for training; the 20% hold-out is then halved into
# validation and test sets (same seed for both splits).
Xtrain, Xhold, ytrain, yhold = train_test_split(Xtrain, ytrain, test_size=0.2, random_state=42)
Xval, Xtest, yval, ytest = train_test_split(Xhold, yhold, test_size=0.5, random_state=42)

## Data Preprocessing

### Separate categorical features and numerical features

In [7]:
# Distinct-value count of every object-typed (categorical) column
object_nunique = Xtrain.select_dtypes(include=['object']).nunique()
# binary categorical columns: exactly two distinct values
bin_cols = object_nunique[object_nunique == 2].index.tolist()
# all remaining categorical columns
ord_cols = [name for name in object_nunique.index if name not in bin_cols]
# numerical columns
num_cols = Xtrain.select_dtypes(include=['int64', 'float64']).columns
# coerce anything non-numeric in the numerical columns to NaN
Xtrain[num_cols] = Xtrain[num_cols].apply(pd.to_numeric, errors='coerce')

### Create encoder instances

In [8]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
import category_encoders as ce


# Scaler applied to the numerical columns later in the notebook
std = StandardScaler()

# Normalized count (frequency) encoding for the multi-level categorical columns
cte = ce.CountEncoder(
    cols=ord_cols,
    normalize=True,
    handle_unknown='value',
)

# One-hot encoding for the two-level categorical columns
ohe = ce.OneHotEncoder(
    cols=bin_cols,
    use_cat_names=False,
    handle_unknown='value',
)

# Alternative binary encoder for the same columns
# NOTE(review): `be` is not used in the cells shown here - confirm whether it is still needed
be = ce.BinaryEncoder(cols=bin_cols)


### Encode the labels

In [9]:
# Standardize the sale price; the scaler is fit on the training labels only
# and then reused for the validation labels.
std_label = StandardScaler().fit(ytrain)
ytrain_encoded = pd.DataFrame(std_label.transform(ytrain), columns=ytrain.columns)
yval_encoded = pd.DataFrame(std_label.transform(yval), columns=yval.columns)

### Encode the features of train, test and validation sets

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Every transformer is fit on the training split only, then applied to
# the train, validation and test splits.

# 1) one-hot encode the binary categorical features
ohe.fit(Xtrain)
Xtrain_encoded, Xval_encoded, Xtest_encoded = (
    ohe.transform(part) for part in (Xtrain, Xval, Xtest)
)

# 2) count-encode the remaining categorical features
cte.fit(Xtrain_encoded)
Xtrain_encoded, Xval_encoded, Xtest_encoded = (
    cte.transform(part) for part in (Xtrain_encoded, Xval_encoded, Xtest_encoded)
)

# 3) standardize the numerical features
std.fit(Xtrain_encoded[num_cols])
Xtrain_scaled, Xval_scaled, Xtest_scaled = (
    std.transform(part[num_cols]) for part in (Xtrain_encoded, Xval_encoded, Xtest_encoded)
)
Xtrain_encoded[num_cols] = Xtrain_scaled
Xval_encoded[num_cols] = Xval_scaled
Xtest_encoded[num_cols] = Xtest_scaled

# Build Model

## Machine Learning Approach
Let's use the gradient boosting regression model to predict the price of a house.

In [11]:
# model setup
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting regressor used as the classical ML baseline.
# max_features=None makes every split consider all features, which is what
# the former 'auto' option meant for this estimator; 'auto' itself is no
# longer accepted by recent scikit-learn releases.
gbr_model = GradientBoostingRegressor(learning_rate=0.1,
                                      n_estimators=100,
                                      subsample=0.8,
                                      max_depth=10,
                                      random_state=42,
                                      max_features=None)

## Training the model

In [14]:
# Replace the remaining missing values with 0 before fitting
# (rebinding instead of mutating in place keeps the cell re-runnable).
Xtrain_encoded = Xtrain_encoded.fillna(0)
# Pass the target as a 1-D array: a single-column DataFrame makes
# scikit-learn emit the column_or_1d DataConversionWarning.
gbr_model.fit(Xtrain_encoded, ytrain_encoded.to_numpy().ravel())

  y = column_or_1d(y, warn=True)


## Model evaluation

In [15]:
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: ground-truth target values.
        y_pred: predicted target values.
        precision: number of decimal places kept in the result.

    Returns:
        The square root of the mean squared log error, rounded to `precision`.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(msle), precision)

### Machine Learning model evaluation

In [17]:
# Fill missing values the same way as for the training features
Xtest_encoded = Xtest_encoded.replace(np.nan, 0)

# The label scaler expects a 2-D column, so reshape the 1-D predictions
y_pred = gbr_model.predict(Xtest_encoded).reshape(-1, 1)

# Map predictions back to the original price scale and flatten both vectors
y_hat = std_label.inverse_transform(y_pred).ravel()
ytest_true = ytest['SalePrice'].to_numpy()

In [18]:
# NOTE: despite the variable name, this is the RMSLE (root mean squared
# logarithmic error) returned by compute_rmsle, not a plain RMSE.
rmse = compute_rmsle(ytest_true, y_hat)
rmse

0.14

The machine learning approach with gradient boosting is a simpler and better solution than the neural network approach for this task. This is likely because the dataset is very small: a neural network has many more parameters to tune and therefore usually needs far more samples to learn well.

# Refactor the Data Engineering Process

In [21]:
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# --- Paths -----------------------------------------------------------------
ROOT = '../'
# ROOT already ends with '/', so no extra separator is added here
MODELS_DIR = ROOT + 'models'
SCALER_PATH = MODELS_DIR + '/scaler.joblib'

# --- Columns used by the reduced model -------------------------------------
LABEL_COLUMN = 'SalePrice'
NUMERICAL_COLUMNS = ['TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']
CATEGORICAL_COLUMNS = ['Foundation', 'KitchenQual']
# Derived from the two lists above so the three can never drift apart
FEATURE_LIST = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS

# Filter on used columns
df = df[FEATURE_LIST + [LABEL_COLUMN]]

# The fitted transformers are persisted below; make sure the target exists
os.makedirs(MODELS_DIR, exist_ok=True)

# NOTE(review): the scaler and the encoder are fit on the full frame before
# the train/test split further down, so test rows contribute to the fitted
# statistics. Fit on the training split only if that matters here.

# scale data
scaler = StandardScaler()
scaler.fit(df[NUMERICAL_COLUMNS])
joblib.dump(scaler, SCALER_PATH)
numerical_features_scaled = scaler.transform(df[NUMERICAL_COLUMNS])
numerical_features_scaled_df = pd.DataFrame(data=numerical_features_scaled, columns=NUMERICAL_COLUMNS)
# encode data
ohe = OneHotEncoder()
ohe.fit(df[CATEGORICAL_COLUMNS])
joblib.dump(ohe, MODELS_DIR + '/ohe.joblib')
categorical_features_encoded = ohe.transform(df[CATEGORICAL_COLUMNS])
categorical_features_encoded_df = pd.DataFrame.sparse.from_spmatrix(
    data=categorical_features_encoded, columns=ohe.get_feature_names_out()
)
# join dataframe; the transformed frames carry a fresh 0..n-1 index, so the
# label is re-indexed the same way to keep the join strictly positional
final_df = (
    numerical_features_scaled_df
    .join(categorical_features_encoded_df)
    .join(df[LABEL_COLUMN].reset_index(drop=True))
)
# Split data
X, y = final_df.drop(columns=[LABEL_COLUMN]), final_df[LABEL_COLUMN]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

def get_scaler(training_data: pd.DataFrame = None, training_mode: bool = False):
    """Return a StandardScaler, either freshly fitted or loaded from disk.

    Args:
        training_data: data to fit the scaler on; required when
            `training_mode` is True and ignored otherwise.
        training_mode: when True, fit a new scaler on `training_data` and
            persist it to SCALER_PATH; when False, load the scaler previously
            saved at SCALER_PATH.

    Returns:
        The fitted (or loaded) scaler.

    Raises:
        ValueError: if `training_mode` is True but no `training_data` is given.
    """
    if not training_mode:
        return joblib.load(SCALER_PATH)

    # Fail early with a clear message instead of an obscure error from fit()
    if training_data is None:
        raise ValueError('training_data is required when training_mode=True')

    scaler = StandardScaler()
    scaler.fit(training_data)
    joblib.dump(scaler, SCALER_PATH)
    return scaler
# Inference mode: reload the scaler persisted at SCALER_PATH instead of
# fitting a new one (this rebinds the module-level `scaler`).
scaler = get_scaler(training_mode=False)
# scaler.transform(Xtest_encoded)

In [51]:
# Sanity check: selecting with a list of column names yields a DataFrame
type(df[NUMERICAL_COLUMNS])

pandas.core.frame.DataFrame

In [48]:
import os
import joblib
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Locations of the persisted artifacts, all relative to the project root
ROOT = '../'
MODELS_DIR = f'{ROOT}models'
SCALER_PATH = f'{MODELS_DIR}/scaler.joblib'
OHE_PATH = f'{MODELS_DIR}/ohe.joblib'

# Raw training data
df_train = pd.read_csv('../data/house-prices-advanced-regression-techniques/train.csv')

def preprocessing(data_df: pd.DataFrame, training_mode=True):
    """Scale numerical features and one-hot encode categorical features.

    Training mode fits a StandardScaler and a OneHotEncoder on `data_df`,
    persists them to SCALER_PATH / OHE_PATH and returns a train/test split.
    Inference mode loads the persisted transformers and applies them to the
    columns they were fitted on; nothing is refit or overwritten.

    The input frame is never modified.

    Args:
        data_df: raw data. Must contain 'SalePrice' in training mode. In
            inference mode it must contain every column the persisted
            transformers were fitted on; extra columns are ignored.
        training_mode: True to fit and persist the transformers and split
            the data; False to only transform with the persisted ones.

    Returns:
        Training mode: (Xtrain, Xtest, ytrain, ytest) from an 80/20 split
        with random_state=42.
        Inference mode: the encoded feature frame X.
    """
    label_column = ['SalePrice']

    # New frame with a 0..n-1 index: protects the caller's data and keeps the
    # positional joins below aligned with the transformed arrays.
    data_df = data_df.reset_index(drop=True)

    if training_mode:
        # remove columns that only have one unique value
        col_drop = data_df.columns[data_df.nunique() == 1]
        data_df = data_df.drop(columns=col_drop)

        features_df = data_df.drop(columns=label_column)
        categorical_columns = features_df.select_dtypes(include=['object']).columns.tolist()
        numerical_columns = features_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

        scaler = StandardScaler().fit(data_df[numerical_columns])
        ohe = OneHotEncoder(handle_unknown='ignore').fit(data_df[categorical_columns])

        os.makedirs(MODELS_DIR, exist_ok=True)
        joblib.dump(scaler, SCALER_PATH)
        joblib.dump(ohe, OHE_PATH)
    else:
        # Reuse the transformers fitted during training. Refitting here would
        # overwrite the saved artifacts and produce features that do not
        # match what the model was trained on.
        scaler = joblib.load(SCALER_PATH)
        ohe = joblib.load(OHE_PATH)
        numerical_columns = list(scaler.feature_names_in_)
        categorical_columns = list(ohe.feature_names_in_)

    numerical_features_scaled_df = pd.DataFrame(
        scaler.transform(data_df[numerical_columns]), columns=numerical_columns
    )
    categorical_features_encoded_df = pd.DataFrame.sparse.from_spmatrix(
        data=ohe.transform(data_df[categorical_columns]), columns=ohe.get_feature_names_out()
    )

    # Fill missing values BEFORE any split so that every returned frame is
    # NaN-free (the split returns copies, so filling afterwards has no effect).
    X = numerical_features_scaled_df.join(categorical_features_encoded_df).replace(np.nan, 0)

    if not training_mode:
        return X

    y = data_df[label_column]
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)
    return Xtrain, Xtest, ytrain, ytest



# Model Building

## Model Training

In [38]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# class model_training():
#     def __init__(self, Xtrain, ytrain, learning_rate=0.1, n_estimaters = 100, subsample = 0.8, max_depth = 10, max_features = 'auto'):
#         self.Xtrain = Xtrain
#         self.ytrain = ytrain
#         self.learning_rate = learning_rate
#         self.n_estimaters = n_estimaters
#         self.subsample = subsample
#         self.max_depth = max_depth
#         self.max_features = max_features

# def model_training(file_name, learning_rate, n_estimaters, subsample, max_depth, max_features, random_state):
    
# model setup

def build_model(data_df: pd.DataFrame):
    """Train a gradient boosting regressor and score it on a hold-out split.

    Args:
        data_df: raw training data, passed to `preprocessing` in training mode.

    Returns:
        A tuple (model, mse, rmse): the fitted GradientBoostingRegressor and
        its mean squared error / root mean squared error on the hold-out
        split produced by `preprocessing`.
    """
    Xtrain, Xtest, ytrain, ytest = preprocessing(data_df, training_mode=True)

    # max_features=None uses all features per split; it replaces the former
    # 'auto' option, which recent scikit-learn releases no longer accept.
    model = GradientBoostingRegressor(learning_rate=0.01,
                                      n_estimators=100,
                                      subsample=0.8,
                                      max_depth=10,
                                      max_features=None,
                                      random_state=42)
    # train model; a 1-D target avoids the column_or_1d DataConversionWarning
    model.fit(Xtrain, ytrain.to_numpy().ravel())

    y_pred = model.predict(Xtest)
    mse = mean_squared_error(ytest, y_pred)
    # Derive RMSE from MSE directly rather than through the `squared`
    # keyword, which newer scikit-learn versions deprecate and remove.
    rmse = np.sqrt(mse)
    # return trained model
    return model, mse, rmse


In [39]:
# Train on the training CSV; build_model returns the fitted model plus its hold-out MSE and RMSE
model, mse, rmse = build_model(df_train)

  y = column_or_1d(y, warn=True)


In [40]:
# Persist the fitted model in MODELS_DIR, next to the scaler and encoder artifacts
joblib.dump(model, MODELS_DIR + '/model.joblib')

['../models/model.joblib']

## Model Evaluation

In [41]:
import numpy as np
from sklearn.metrics import mean_squared_log_error

# class model_evaluation():
#     def __init__(self, model, file_name, std_label):
#         self.model = model
#         self.file_name = file_name
#         self.std_label = std_label

def compute_rmsle(ytest: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        ytest: ground-truth target values.
        y_pred: predicted target values.
        precision: number of decimal places kept in the result.

    Returns:
        The square root of the mean squared log error, rounded to `precision`.
    """
    msle = mean_squared_log_error(ytest, y_pred)
    return round(np.sqrt(msle), precision)

def evaluation(test_data: pd.DataFrame, model):
    """Score a fitted model on labelled data using log-error metrics.

    Args:
        test_data: raw labelled data; must contain a 'SalePrice' column.
        model: fitted estimator exposing `predict`.

    Returns:
        A tuple (msle, rmsle): the mean squared logarithmic error and the
        rounded root mean squared logarithmic error of the predictions.

    Raises:
        ValueError: if `test_data` has no 'SalePrice' column.
    """
    label = 'SalePrice'
    if label not in test_data.columns:
        raise ValueError("test_data must contain a 'SalePrice' column to be evaluated")

    # In inference mode `preprocessing` returns only the feature frame, so
    # the labels are taken from the raw data here. The label is removed
    # before preprocessing so it can never be treated as a feature.
    ytest_true = test_data[label].to_numpy().ravel()
    Xtest = preprocessing(test_data.drop(columns=[label]), training_mode=False)

    y_pred = model.predict(Xtest).flatten()

    msle = mean_squared_log_error(ytest_true, y_pred)
    rmsle = compute_rmsle(ytest_true, y_pred)

    return msle, rmsle


# Model Inference

In [42]:

import os
import pandas as pd
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# class model_inference():
#     def __init__(self, file_name, model_name):
#         self.file_name = file_name
#         self.model_name = model_name



def make_prediction(test_df: pd.DataFrame):
    """Predict sale prices for raw, unlabelled data using the saved artifacts.

    Loads the scaler, the one-hot encoder and the model from MODELS_DIR,
    applies the transformers to `test_df` and returns the model output.
    `test_df` itself is not modified.

    Args:
        test_df: raw feature data containing every column the persisted
            transformers were fitted on.

    Returns:
        The array returned by `model.predict` for the encoded features.

    Raises:
        ValueError: if `test_df` lacks a column the transformers expect.
    """
    # Load encoders and scaler
    scaler = joblib.load(SCALER_PATH)
    ohe = joblib.load(OHE_PATH)
    model = joblib.load(MODELS_DIR + '/model.joblib')

    # Prefer the column names recorded at fit time, so selection and order
    # match training regardless of the dtypes inferred for this frame.
    # Fall back to dtype-based selection for transformers that were fitted
    # without column names.
    num_cols = getattr(scaler, 'feature_names_in_', None)
    if num_cols is None:
        num_cols = test_df.select_dtypes(include=['int64', 'float64']).columns
    cat_cols = getattr(ohe, 'feature_names_in_', None)
    if cat_cols is None:
        cat_cols = test_df.select_dtypes(include=['object']).columns
    num_cols, cat_cols = list(num_cols), list(cat_cols)

    missing = [name for name in num_cols + cat_cols if name not in test_df.columns]
    if missing:
        raise ValueError(f'test_df is missing expected columns: {missing}')

    # Data preprocessing
    X_num_encoded_df = pd.DataFrame(scaler.transform(test_df[num_cols]), columns=num_cols)
    X_cat_encoded_df = pd.DataFrame.sparse.from_spmatrix(
        data=ohe.transform(test_df[cat_cols]), columns=ohe.get_feature_names_out()
    )
    X_encoded = X_num_encoded_df.join(X_cat_encoded_df).replace(np.nan, 0)

    # Make prediction
    prediction = model.predict(X_encoded)

    return prediction


In [43]:
# Load the test CSV used for inference
test_df = pd.read_csv('../data/house-prices-advanced-regression-techniques/test.csv')

In [44]:
# Run the persisted scaler, encoder and model over the test frame
prediction = make_prediction(test_df)



In [45]:
# Display the array returned by make_prediction
prediction

array([147835.11112461, 163623.94951657, 180060.42701806, ...,
       164271.39527764, 140981.22324917, 208172.45352428])

In [46]:
# Preview the first rows of the raw test data
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
