In [1]:
import os
import joblib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from datetime import date
from scipy.stats import norm, skew
from dython import nominal, data_utils
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_log_error

# Change display limit to prevent truncating
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> dict[str, str]:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return {'rmsle': f"{round(rmsle, precision)}"}

def save_object(obj, filename):
    current_working_dir = os.path.abspath(os.getcwd())
    file_path = os.path.join(os.path.dirname(current_working_dir), 'models', filename)
    joblib.dump(obj, file_path)
    
def load_object(filename):
    current_working_dir = os.path.abspath(os.getcwd())
    file_path = os.path.join(os.path.dirname(current_working_dir), 'models', filename)
    obj = joblib.load(file_path)
    return obj

## Data Loading

In [3]:
colums_for_ordinal_encoding = ['BsmtQual', 'CentralAir', 'ExterQual', 'KitchenQual', 'GarageFinish', 'Functional', 'FireplaceQu', 'BsmtExposure']
columns_for_one_hot_encoding = ['HouseStyle', 'GarageType', 'Neighborhood', 'Heating', 'Electrical', 'BldgType']

selected_continuous_features = ['GrLivArea', 'GarageArea', 'TotalBsmtSF', 'YearsSinceBuilt', 'YearsSinceRemodAdd']
selected_categorical_features = ['HouseStyle', 'CentralAir', 'GarageType', 'GarageFinish', 'Neighborhood', 'OverallQual', 'ExterQual', 'KitchenQual', 'Functional', 'FireplaceQu', 'Heating', 'BsmtExposure', 'HalfBath', 'YearsSinceGarageYrBlt', 'Electrical', 'BsmtFullBath', 'BldgType', 'KitchenAbvGr', 'BsmtQual']

target_feature = 'SalePrice'

model_filename = 'model.joblib'
ordinal_encoder_filename = 'ordinal_encoder.joblib'
one_hot_encoder_filename = 'one_hot_encoder.joblib'
standard_scaler_filename = 'standard_scaler.joblib'
continuous_features_means_filename = 'continuous_features_means.joblib'

In [4]:
# Loading of train and test data
def read_raw_data(data_path):
    return pd.read_csv(data_path)

def fix_data_type(data):
    fixed_data = data.astype({
        "MSSubClass": object,
        "OverallQual": object,
        "OverallCond": object,
        "MoSold": object,
        "YrSold": object,
        "YearBuilt": object,
        "YearRemodAdd": object,
        "BsmtFullBath": object,
        "BsmtHalfBath": object,
        "FullBath": object,
        "HalfBath": object,
        "BedroomAbvGr": object,
        "KitchenAbvGr": object,
        "TotRmsAbvGrd": object,
        "Fireplaces": object,
        "GarageYrBlt": object,
        "GarageCars": object
    })
    return fixed_data

def update_data_features(data_raw):
    selected_features = ['GrLivArea', 'GarageArea', 'TotalBsmtSF', 'HouseStyle', 'Neighborhood', 'OverallQual', 'ExterQual', 'KitchenQual', 'Functional', 'FireplaceQu', 'YearsSinceBuilt', 'YearsSinceRemodAdd', 'BsmtExposure', 'HalfBath', 'YearsSinceGarageYrBlt', 'Electrical', 'BsmtFullBath', 'BldgType', 'KitchenAbvGr', 'Heating', 'CentralAir', 'GarageType', 'GarageFinish', 'BsmtQual', 'SalePrice']

    # Create additional features for better predictions
    todays_date = date.today()
    
    data_raw['YearsSinceBuilt'] = todays_date.year - data_raw['YearBuilt']
    data_raw['YearsSinceRemodAdd'] = todays_date.year - data_raw['YearRemodAdd']
    data_raw['YearsSinceGarageYrBlt'] = todays_date.year - data_raw['GarageYrBlt']
    return data_raw[selected_features]

def clean_data(data_raw):
    fixed_type_data_raw = fix_data_type(data_raw)
    updated_data_raw = update_data_features(fixed_type_data_raw)
    return updated_data_raw

In [5]:
data_raw = read_raw_data('../data/train.csv')
data_raw.sample(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
773,774,20,RL,70.0,10150,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,5,1958,1958,Gable,CompShg,Wd Sdng,Wd Sdng,,1.0,TA,TA,CBlock,TA,TA,No,Rec,456,Unf,0,456,912,GasA,Ex,Y,FuseA,912,0,0,912,0,0,1,0,2,1,TA,5,Typ,0,,Attchd,1958.0,RFn,1,275,TA,TA,Y,0,0,0,0,0,0,,,,0,7,2007,COD,Normal,114500
46,47,50,RL,48.0,12822,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Mitchel,Norm,Norm,1Fam,1.5Fin,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,,0.0,Gd,TA,PConc,Ex,TA,No,GLQ,1351,Unf,0,83,1434,GasA,Ex,Y,SBrkr,1518,631,0,2149,1,0,1,1,1,1,Gd,6,Typ,1,Ex,Attchd,2003.0,RFn,2,670,TA,TA,Y,168,43,0,0,198,0,,,,0,8,2009,WD,Abnorml,239686
860,861,50,RL,55.0,7642,Pave,,Reg,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,1.5Fin,7,8,1918,1998,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,Gd,TA,BrkTil,TA,TA,No,Unf,0,Unf,0,912,912,GasA,Gd,Y,SBrkr,912,514,0,1426,0,0,1,1,3,1,Gd,7,Typ,1,Gd,Detchd,1925.0,Unf,1,216,TA,TA,Y,0,240,0,0,0,0,,GdPrv,,0,6,2007,WD,Normal,189950
790,791,120,RL,43.0,3182,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Blmngtn,Norm,Norm,TwnhsE,1Story,7,5,2005,2006,Gable,CompShg,VinylSd,VinylSd,BrkFace,11.0,Gd,TA,PConc,Gd,TA,No,GLQ,16,Unf,0,1129,1145,GasA,Ex,Y,SBrkr,1145,0,0,1145,0,0,2,0,2,1,Gd,5,Typ,1,Gd,Attchd,2005.0,Fin,2,397,TA,TA,Y,100,16,0,0,0,0,,,,0,9,2009,WD,Normal,160200
1292,1293,70,RM,60.0,6600,Pave,,Reg,Lvl,AllPub,Corner,Gtl,OldTown,Norm,Norm,1Fam,2Story,5,4,1892,1965,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,TA,TA,Stone,TA,TA,No,Unf,0,Unf,0,994,994,GasA,TA,N,SBrkr,1378,994,0,2372,0,0,2,0,4,2,TA,11,Min2,0,,Attchd,1985.0,RFn,1,432,TA,TA,Y,0,287,0,0,0,0,,,,0,12,2009,WD,Normal,107500


In [6]:
def split_train_data_raw(train_data_raw, target_feature):
    y = train_data_raw["SalePrice"]
    X = train_data_raw.drop("SalePrice", axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

    train_data = pd.concat([X_train, y_train], axis=1)
    test_data = pd.concat([X_test, y_test], axis=1)
    
    return train_data, test_data

## Model training

In [7]:
def handling_missing_data(data):
    continuous_features_means = load_object(continuous_features_means_filename)
    
    data.loc[:, 'GrLivArea'] = data.loc[:, 'GrLivArea'].fillna(continuous_features_means['GrLivArea'])
    data.loc[:, 'GarageArea'] = data.loc[:, 'GarageArea'].fillna(continuous_features_means['GarageArea'])
    data.loc[:, 'TotalBsmtSF'] = data.loc[:, 'TotalBsmtSF'].fillna(continuous_features_means['TotalBsmtSF'])
    
    data.loc[:, 'HouseStyle'] = data.loc[:, 'HouseStyle'].fillna('No')
    data.loc[:, 'Neighborhood'] = data.loc[:, 'Neighborhood'].fillna('No')
    data.loc[:, 'OverallQual'] = data.loc[:, 'OverallQual'].fillna(0).astype(object)
    data.loc[:, 'ExterQual'] = data.loc[:, 'ExterQual'].fillna('No')
    data.loc[:, 'KitchenQual'] = data.loc[:, 'KitchenQual'].fillna('No')
    data.loc[:, 'Functional'] = data.loc[:, 'Functional'].fillna('No')
    data.loc[:, 'FireplaceQu'] = data.loc[:, 'FireplaceQu'].fillna('No')
    data.loc[:, 'YearsSinceRemodAdd'] = data.loc[:, 'YearsSinceRemodAdd'].fillna(0)
    data.loc[:, 'YearsSinceBuilt'] = data.loc[:, 'YearsSinceBuilt'].fillna(0)
    data.loc[:, 'BsmtExposure'] = data.loc[:, 'BsmtExposure'].fillna('No')
    data.loc[:, 'HalfBath'] = data.loc[:, 'HalfBath'].fillna(0).astype(object)
    data.loc[:, 'YearsSinceGarageYrBlt'] = data.loc[:, 'YearsSinceGarageYrBlt'].fillna(0)
    data.loc[:, 'Electrical'] = data.loc[:, 'Electrical'].fillna('No')
    data.loc[:, 'BsmtFullBath'] = data.loc[:, 'BsmtFullBath'].fillna(0).astype(object)
    data.loc[:, 'BldgType'] = data.loc[:, 'BldgType'].fillna('No')
    data.loc[:, 'KitchenAbvGr'] = data.loc[:, 'KitchenAbvGr'].fillna(0).astype(object)
    data.loc[:, 'Heating'] = data.loc[:, 'Heating'].fillna('No')
    data.loc[:, 'CentralAir'] = data.loc[:, 'CentralAir'].fillna('No')
    data.loc[:, 'GarageType'] = data.loc[:, 'GarageType'].fillna('No')
    data.loc[:, 'GarageFinish'] = data.loc[:, 'GarageFinish'].fillna('No')
    data.loc[:, 'BsmtQual'] = data.loc[:, 'BsmtQual'].fillna('No')
    return data

### Data Transformation

In [8]:
def fix_skewness(continuous_data, continuous_features):
    # Transform the continuous features with the skewness of more than 0.5
    skewness = continuous_data.apply(lambda x: skew(x))
    skewness = skewness[abs(skewness) > 0.5]
    skewed_features = skewness.index
    continuous_data[skewed_features] = np.log1p(continuous_data[skewed_features].astype(float))
    return continuous_data

def fit_ordinal_encoding_features(categorical_data, colums_for_ordinal_encoding):
    ordinal_encoder = OrdinalEncoder(categories=[
        ["No", "Po", "Fa", "TA", "Gd", "Ex"],
        ["N", "Y"],
        ["No", "Po", "Fa", "TA", "Gd", "Ex"],
        ["No", "Po", "Fa", "TA", "Gd", "Ex"],
        ["No", "Unf", "RFn", "Fin"],
        ["No", "Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"],
        ["No", "Po", "Fa", "TA", "Gd", "Ex"],
        ["No", "Mn", "Av", "Gd"]
    ])
    ordinal_encoder.fit(categorical_data[colums_for_ordinal_encoding])
    save_object(ordinal_encoder, ordinal_encoder_filename)

def transform_ordinal_encoding_features(categorical_data, colums_for_ordinal_encoding):
    ordinal_encoder = load_object(ordinal_encoder_filename)
    categorical_data[colums_for_ordinal_encoding] = ordinal_encoder.transform(categorical_data[colums_for_ordinal_encoding])
    return categorical_data

def fit_one_hot_encoding_features(categorical_data, columns_for_one_hot_encoding):
    one_hot_encoder = OneHotEncoder()
    one_hot_encoder.fit(categorical_data[columns_for_one_hot_encoding])
    save_object(one_hot_encoder, one_hot_encoder_filename)

def transform_one_hot_encoding_features(categorical_data, columns_for_one_hot_encoding):
    one_hot_encoder = load_object(one_hot_encoder_filename)
    one_hot_encoder_columns = one_hot_encoder.get_feature_names_out()
    categorical_data.loc[:, one_hot_encoder_columns] = one_hot_encoder.transform(categorical_data[columns_for_one_hot_encoding]).toarray()
    categorical_data.drop(columns_for_one_hot_encoding, axis=1, inplace=True)
    return categorical_data

def fit_scaling_features(data, continuous_features):
    # Scaling
    standard_scaler = StandardScaler()
    standard_scaler.fit(data.loc[:, selected_continuous_features])
    save_object(standard_scaler, standard_scaler_filename)

def transform_scaling_features(data, continuous_features):
    standard_scaler = load_object(standard_scaler_filename)
    data.loc[:, continuous_features] = standard_scaler.transform(data.loc[:, continuous_features])
    return data

def fit_features(data, categorical_features, continuous_features, colums_for_ordinal_encoding, columns_for_one_hot_encoding):
    data_categorical = data[categorical_features]
    data_continuous = data[continuous_features]
    fit_ordinal_encoding_features(data_categorical, colums_for_ordinal_encoding)
    fit_one_hot_encoding_features(data_categorical, columns_for_one_hot_encoding)
    fit_scaling_features(data_continuous, continuous_features)
    
def transform_features(data, target_feature, categorical_features, continuous_features, colums_for_ordinal_encoding, columns_for_one_hot_encoding):
    y = np.log1p(data[target_feature]) if target_feature else None
    data_categorical = data[categorical_features]
    data_continuous = data[continuous_features]
    data_continuous = fix_skewness(data_continuous, continuous_features)
    data_categorical = transform_ordinal_encoding_features(data_categorical, colums_for_ordinal_encoding)
    data_categorical = transform_one_hot_encoding_features(data_categorical, columns_for_one_hot_encoding)
    data_continuous = transform_scaling_features(data_continuous, continuous_features)
    X = pd.concat([data_continuous, data_categorical], axis = 1)
    return X, y

def train_model(X, y):
    # Train
    model = LinearRegression()
    model.fit(X, y)
    save_object(model, model_filename)
    return model

def predict(data, model_filename):
    model = load_object(model_filename)
    y_pred = model.predict(data)
    return y_pred

### Model training

In [9]:
def build_model(data_raw: pd.DataFrame) -> dict[str, str]:
    # Returns a dictionary with the model performances (for example {'rmse': 0.18})
    cleaned_data = clean_data(data_raw)
    train_data, test_data = split_train_data_raw(cleaned_data, target_feature)
    # Calculate the mean of continuous features and saving them
    continuous_features_means = {
        'GrLivArea': np.mean(train_data['GrLivArea']), 
        'GarageArea': np.mean(train_data['GarageArea']), 
        'TotalBsmtSF': np.mean(train_data['TotalBsmtSF'])
    }
    save_object(continuous_features_means, continuous_features_means_filename)
    # Handle missing data
    train_data = handling_missing_data(train_data)
    test_data = handling_missing_data(test_data)
    # Fit train data
    fit_features(train_data, selected_categorical_features, selected_continuous_features, colums_for_ordinal_encoding, columns_for_one_hot_encoding)
    # Transform train data and test data
    X_train, y_train = transform_features(train_data, target_feature, selected_categorical_features, selected_continuous_features, colums_for_ordinal_encoding, columns_for_one_hot_encoding)
    X_test, y_test = transform_features(test_data, target_feature, selected_categorical_features, selected_continuous_features, colums_for_ordinal_encoding, columns_for_one_hot_encoding)
    # Train model and predict
    model = train_model(X_train, y_train)
    y_pred = predict(X_test, model_filename)
    result = compute_rmsle(y_test, y_pred)
    return result

In [10]:
%%capture
model = build_model(data_raw)

In [11]:
model

{'rmsle': '0.01'}

### Model inference

In [12]:
%%capture
def make_predictions(input_data_raw: pd.DataFrame) -> np.ndarray:
    # the model and all the data preparation objects (encoder, etc) should be loaded from the models folder
    data = clean_data(input_data_raw)
    data = handling_missing_data(data)
    X, y = transform_features(data, None, selected_categorical_features, selected_continuous_features, colums_for_ordinal_encoding, columns_for_one_hot_encoding)
    y_pred = predict(X, model_filename)
    return np.expm1(y_pred)

data_raw = read_raw_data('../data/test.csv')
y_pred = make_predictions(data_raw)

In [13]:
y_pred

array([223083.1508802 , 158696.89612023, 143835.55883217, 229732.14379321,
       103838.38667389,  96353.82504599, 262008.92128838, 127642.53478439,
       563982.37134699, 154447.26635463, 197813.49333133, 158898.04922464,
       223715.26304986, 135891.52483205, 140451.46681198, 155040.01554255,
       264412.01966468, 120728.6042961 , 147775.55473449, 174489.53826223,
       145966.96025844, 151321.36916141,  82279.40168746, 166871.16881056,
       199791.91807019, 184746.15188151, 166557.65862858,  77052.07058983,
       282458.34429384, 127817.02323318, 178369.29515536, 210549.88340002,
       153851.64888968, 329556.47660227, 342624.0086541 , 184073.72363256,
       288007.25473187, 128151.38594607, 246741.39924585, 358931.58175635,
       229552.01855485, 143229.43410595, 201840.0459751 , 326112.85441191,
       370652.09607794, 153654.25333821, 119023.16955423, 132099.93159599,
       173161.37065119, 100775.41159676, 392670.20142912, 147882.21929109,
       176190.21696273,  