<a href="https://colab.research.google.com/github/MingzheHu-Duke/Note_to_product_HousePricePrediction/blob/main/07_CleanUp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# pipeline.py

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso

from config import config
import processing.preprocessors as pp

# End-to-end pipeline: the custom transformers from processing.preprocessors
# are applied first, then scaling, then the Lasso regressor. Steps execute in
# the order listed, and each step name is the key under which scikit-learn
# exposes that step, so the names must stay exactly as written.
price_pipe = Pipeline(
    steps=[
        (
            'Numerical Imputer',
            pp.NumericalImputer(variables=config.NUMERICAL_FEATURES),
        ),
        (
            'Categorical Imputer',
            pp.CategoricalImputer(variables=config.CATEGORICAL_FEATURES),
        ),
        (
            'Temporal Features',
            pp.TemporalVariableEstimator(
                variables=config.TEMPORAL_FEATURES,
                reference_variable=config.TEMPORAL_COMPARISON,
            ),
        ),
        (
            'Rare Label Encoder',
            pp.RareLabelCategoricalImputer(variables=config.FEATURES_TO_ENCODE),
        ),
        (
            'Categorical Encoder',
            pp.CategoricalEncoder(variables=config.FEATURES_TO_ENCODE),
        ),
        (
            'Log Transform',
            pp.LogTransformation(variables=config.LOG_FEATURES),
        ),
        (
            'Drop Features',
            pp.DropFeatures(variables_to_drop=config.DROP_FEATURES),
        ),
        ('Scaler Transform', MinMaxScaler()),
        ('Linear Model', Lasso(alpha=0.005, random_state=42)),
    ]
)

# train_pipeline.py

In [None]:
import pandas as pd
import numpy as np


from config import config
from processing.data_management import load_dataset, save_pipeline
import processing.preprocessors as pp
import pipeline
from predict import make_prediction


def run_training():
  """Fit the price pipeline on the training set and persist it.

  Reads config.TRAIN_FILE through load_dataset, takes the natural log of the
  config.TARGET column as the regression target, fits pipeline.price_pipe on
  the config.KEEP columns, and passes the fitted pipeline to save_pipeline.
  """
  frame = load_dataset(config.TRAIN_FILE)
  target_name = config.TARGET

  # The model is fitted against log(target), not the raw target values.
  log_target = np.log(frame[target_name])

  # Remove the target column in place so it cannot be used as a feature.
  frame.drop(columns=[target_name], inplace=True)

  model = pipeline.price_pipe
  model.fit(frame[config.KEEP], log_target)
  save_pipeline(pipeline_to_save=model)


# Run training only when this module is executed as a script.
if __name__ == "__main__":
  run_training()

# predict.py

In [None]:
import numpy as np
import pandas as pd

from config import config
from processing.data_management import load_pipeline

# File name of the serialized pipeline to load; make_prediction also reports
# it as "model_name" in its result dict.
pipeline_file_name = "lasso_regression_v1.pkl"

# NOTE: this runs at import time, so importing this module requires the
# pipeline file to be loadable already.
_price_pipe = load_pipeline(pipeline_file_name)

def make_prediction(input_data):
  """Predict with the loaded pipeline and return the result with metadata.

  Args:
    input_data: Anything accepted by the pandas.DataFrame constructor. The
      resulting frame must contain every column listed in config.KEEP.

  Returns:
    dict with three keys:
      "prediction": np.exp of the pipeline's raw predictions,
      "model_name": the pipeline file name that was loaded,
      "version": the literal string "version1".

  Raises:
    KeyError: if a column from config.KEEP is missing from the input.
  """
  data = pd.DataFrame(input_data)
  prediction = _price_pipe.predict(data[config.KEEP])

  # Training fits the model on log(target), so exponentiate to get back to
  # the original scale of the target.
  output = np.exp(prediction)

  results = {
      "prediction": output,
      "model_name": pipeline_file_name,
      "version": "version1"
  }
  return results

# config.py

In [None]:
import pathlib
import os


# --- Locations -------------------------------------------------------------
# NOTE: both paths are relative and must be updated for the final package.
DATAPATH = "../data/HousingPrediction/"
SAVED_MODEL_PATH = "../HousingPriceAllEvoution/07_CleanUp/trained_models/"

# --- Data files (resolved relative to DATAPATH) -----------------------------
TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'

# --- Target column ----------------------------------------------------------
TARGET = 'SalePrice'

# --- Columns kept as model input --------------------------------------------
KEEP = [
    'MSSubClass',
    'MSZoning',
    'Neighborhood',
    'OverallQual',
    'OverallCond',
    'YearRemodAdd',
    'RoofStyle',
    'MasVnrType',
    'BsmtQual',
    'BsmtExposure',
    'HeatingQC',
    'CentralAir',
    '1stFlrSF',
    'GrLivArea',
    'BsmtFullBath',
    'KitchenQual',
    'Fireplaces',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageCars',
    'PavedDrive',
    'LotFrontage',
    'YrSold',
]

# --- Columns handled by the numerical imputer -------------------------------
NUMERICAL_FEATURES = ['LotFrontage']

# --- Columns handled by the categorical imputer -----------------------------
CATEGORICAL_FEATURES = [
    'MasVnrType',
    'BsmtQual',
    'BsmtExposure',
    'FireplaceQu',
    'GarageCars',
    'GarageType',
    'GarageFinish',
    'MSZoning',
    'BsmtFullBath',
    'KitchenQual',
]

# --- Columns passed to the rare-label and categorical encoders --------------
FEATURES_TO_ENCODE = [
    'MSZoning',
    'Neighborhood',
    'RoofStyle',
    'MasVnrType',
    'BsmtQual',
    'BsmtExposure',
    'HeatingQC',
    'CentralAir',
    'KitchenQual',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'PavedDrive',
]

# --- Temporal columns and the column they are compared against --------------
TEMPORAL_FEATURES = ['YearRemodAdd']
TEMPORAL_COMPARISON = 'YrSold'

# --- Columns that receive a log transform -----------------------------------
LOG_FEATURES = ['LotFrontage', '1stFlrSF', 'GrLivArea']

# --- Columns dropped before scaling and modelling ---------------------------
DROP_FEATURES = ['YrSold']

# data_management.py

In [None]:
from config import config
import pandas as pd
import joblib

def load_dataset(file_name):
  """Read a CSV file from the configured data directory.

  Args:
    file_name: Name of the CSV file; it is appended to config.DATAPATH.

  Returns:
    The pandas DataFrame produced by pd.read_csv.
  """
  # Use the parameter name here: the previous body referenced an undefined
  # name, so every call raised NameError.
  _data = pd.read_csv(config.DATAPATH + file_name)
  return _data

def save_pipeline(pipeline_to_save):
  """Serialize a pipeline into the configured model directory.

  Args:
    pipeline_to_save: The object to persist with joblib.dump.

  The file is written to config.SAVED_MODEL_PATH + the file name below, and
  the file name is printed once the dump has completed.
  """
  # Must match the file name that predict.py loads
  # ("lasso_regression_v1.pkl"); otherwise a freshly trained pipeline can
  # never be found at prediction time.
  save_file_name = "lasso_regression_v1.pkl"
  save_path = config.SAVED_MODEL_PATH + save_file_name
  joblib.dump(pipeline_to_save, save_path)
  print("Saved Pipeline: ", save_file_name)

def load_pipeline(pipeline_to_load):
  """Load a previously saved pipeline from the configured model directory.

  Args:
    pipeline_to_load: File name of the saved pipeline; it is appended to
      config.SAVED_MODEL_PATH.

  Returns:
    The object returned by joblib.load for that file.
  """
  save_path = config.SAVED_MODEL_PATH
  # NOTE: joblib.load unpickles the file, so only load files from a trusted
  # source.
  trained_model = joblib.load(save_path + pipeline_to_load)
  return trained_model


# Backward-compatible alias for the original, misspelled function name.
load_ppipeline = load_pipeline