## Build Model

Libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import make_scorer
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor



Global Variables

In [2]:
#path variables
path_to_file = '../data/house-prices-advanced-regression-techniques/train.csv' 
model_store = '../models/model.joblib'    
model_encoder_1 = '../models/encoder.joblib'    
model_scaler_1 = '../models/scaler.joblib'    

target = 'SalePrice'
feature_list = ['Id', 'LotArea', 'YearBuilt', 'BsmtFinSF1', 'BedroomAbvGr',
                 'KitchenAbvGr', 'GarageArea',
                 '1stFlrSF', 'MSZoning', 'Heating']

# identifying and splitting features into continuous and categorical
numeric_features = ['Id', 'LotArea', 'YearBuilt',
                 'BsmtFinSF1', 'BedroomAbvGr',
                    'KitchenAbvGr', 'GarageArea','1stFlrSF']

categorical_features = ['MSZoning', 'Heating']

# Read the CSV file into a DataFrame
processed_df = pd.read_csv(path_to_file)

# Save the DataFrame to a parquet file
processed_df.to_parquet(r'../data/house-prices-advanced-regression-techniques/data_processed.parquet', index=False)


Processing functions

In [3]:
#This function sieves the dataset for only useful features and target
def sieve_data(df):
  sieved_data = df[feature_list].join(df[target])
  return sieved_data



def drop_missing_rows(df):
  complete_rows = df.dropna()
  return complete_rows



def scale_numeric(df):
  scaler = StandardScaler()
  scaler.fit(df[numeric_features])
  scaled_data = scaler.transform(df[numeric_features])
  scaled_df = pd.DataFrame(data=scaled_data, columns=numeric_features)
  joblib.dump(scaler, open(model_scaler_1, 'wb'))
  return scaled_df




def encode_categorical(df):
  encoder = OrdinalEncoder()
  encoder.fit(df[categorical_features])
  encoded_data = encoder.transform(df[categorical_features])
  encoded_df = pd.DataFrame(data=encoded_data, columns=categorical_features)
  joblib.dump(encoder, open(model_encoder_1, 'wb'))
  return encoded_df



def hold_on_target(df):
  held_target = df[target]
  held_df = pd.DataFrame(held_target)
  return held_df



def comb_scal_enco(df1, df2, df3):
  merge_coders = df1.join(df2).join(df3)
  return merge_coders




def preprocessing_step(df):
  #sieving
  output_sieve = sieve_data(df)
  #drop nan
  output_drop = drop_missing_rows(output_sieve)
  #scaling
  output_scale = scale_numeric(output_drop)
  #encode cat
  output_encode = encode_categorical(output_drop)
  #holding taget in place
  output_hold = hold_on_target(output_drop)
  #Feature engineering by combining
  preprocessed_output = comb_scal_enco(output_scale, output_encode, output_hold)
  return preprocessed_output


Model Building

In [4]:
model_store = '../models/model.joblib'   
model_encoder_1 = '../models/encoder.joblib'   
model_scaler_1 = '../models/scaler.joblib'   

In [5]:
def training_data(preprocessed_df):
    # Splitting the data
    X, y = preprocessed_df.drop([target], axis=1), preprocessed_df[target]
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Training the model
    hgb_regressor = HistGradientBoostingRegressor()
    trained_model = hgb_regressor.fit(X_train, y_train)
    #joblib.dump(hgb_regressor, model_store)
    return X_val, y_val, trained_model




def evaluation_trained_model(X_val, y_val, trained_model):
    # Predicting values for evaluation
    y_pred = trained_model.predict(X_val)

    # Evaluation metrics score
    rmsle = np.sqrt(mean_squared_log_error(y_val, y_pred))

    return {'RMSLE': rmsle}




def build_model(data: pd.DataFrame) -> dict[str, str]:
    # Preprocessing
    preprocessed_df = preprocessing_step(data)

    # Training
    X_val, y_val, trained_model = training_data(preprocessed_df)

    # Evaluation and result
    performance = evaluation_trained_model(X_val, y_val, trained_model)

    #save joblib
    joblib.dump(trained_model,open(model_store, 'wb'))

    return performance




In [6]:
# Call the build_model function with numeric and categorical features
result = build_model(processed_df)
result

{'RMSLE': 0.20361223177145288}

## Inference

In [7]:
test_raw = pd.read_csv('../data/house-prices-advanced-regression-techniques/test.csv')
test = test_raw.copy()

test = test[['Id', 'LotArea', 'YearBuilt', 'BsmtFinSF1', 'BedroomAbvGr',
                 'KitchenAbvGr', 'GarageArea',
                 '1stFlrSF', 'MSZoning', 'Heating']]




In [8]:
numeric_features = ['Id', 'LotArea', 'YearBuilt',
                 'BsmtFinSF1', 'BedroomAbvGr',
                    'KitchenAbvGr', 'GarageArea','1stFlrSF']

categorical_features = ['MSZoning', 'Heating']

In [9]:
#This function sieves the dataset for only useful features and target
def sieve_data(df):
  sieved_data = df[feature_list]
  return sieved_data



def drop_missing_rows(df):
  complete_rows = df.dropna()
  return complete_rows



def scale_numeric(df):
  scaler = joblib.load(model_scaler_1)
  scaler.fit(df[numeric_features])
  scaled_data = scaler.transform(df[numeric_features])
  scaled_df = pd.DataFrame(data=scaled_data, columns=numeric_features)
  return scaled_df



def encode_categorical(df):
  encoder = joblib.load(model_encoder_1)
  encoder.fit(df[categorical_features])
  encoded_data = encoder.transform(df[categorical_features])
  encoded_df = pd.DataFrame(data=encoded_data, columns=categorical_features)
  return encoded_df



def comb_scal_enco(df1, df2):
  merge_coders = df1.join(df2)
  return merge_coders



def preprocessing_step(df):
  #sieving
  output_sieve = sieve_data(df)
  #drop nan
  output_drop = drop_missing_rows(output_sieve)
  #scaling
  output_scale = scale_numeric(output_drop)
  #encode cat
  output_encode = encode_categorical(output_drop)
  #Feature engineering by combining
  preprocessed_output = comb_scal_enco(output_scale, output_encode)
  return preprocessed_output



def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
  #preprocessing dataframe
  preprocessed_data = preprocessing_step(input_data)


  # #Loading the model
  model = joblib.load(model_store)

  prediction = model.predict(preprocessed_data)

  return prediction



In [10]:
result11 = make_predictions(test)
result11

array([144918.89683241, 188388.12939049, 212743.11347236, ...,
       229467.84496425, 150117.15363956, 231869.11236182])