<a href="https://colab.research.google.com/github/MingzheHu-Duke/Note_to_product_HousePricePrediction/blob/main/05_functional_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# config.py

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 17 12:47:19 2020
@author: ashutosh.k
"""


# Where the raw CSV files live, and the names of the two input files.
DATAPATH = "../data/HousingPrediction/"
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"

# Column holding the value to predict.
TARGET = "SalePrice"

# Final set of features kept in the data; every other column is discarded.
KEEP = [
    "MSSubClass", "MSZoning", "Neighborhood",
    "OverallQual", "OverallCond", "YearRemodAdd",
    "RoofStyle", "MasVnrType", "BsmtQual", "BsmtExposure",
    "HeatingQC", "CentralAir", "1stFlrSF", "GrLivArea",
    "BsmtFullBath", "KitchenQual", "Fireplaces", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageCars", "PavedDrive",
    "LotFrontage", "YrSold",
]

# Columns handed to the numerical imputation step.
NUMERICAL_FEATURES = ["LotFrontage"]

# Columns handed to the categorical imputation step.
CATEGORICAL_FEATURES = [
    "MasVnrType", "BsmtQual", "BsmtExposure", "FireplaceQu",
    "GarageCars", "GarageType", "GarageFinish", "MSZoning", "BsmtFullBath",
    "KitchenQual",
]

# Columns that go through rare-label grouping and then integer encoding.
FEATURES_TO_ENCODE = [
    "MSZoning", "Neighborhood", "RoofStyle", "MasVnrType", "BsmtQual",
    "BsmtExposure", "HeatingQC", "CentralAir", "KitchenQual", "FireplaceQu",
    "GarageType", "GarageFinish", "PavedDrive",
]

# Year columns, and the year column they are compared against.
TEMPORAL_FEATURES = ["YearRemodAdd"]
TEMPORAL_COMPARISON = "YrSold"

# Columns that receive a natural-log transform.
LOG_FEATURES = ["LotFrontage", "1stFlrSF", "GrLivArea"]

# Columns removed once preprocessing no longer needs them.
DROP_FEATURES = ["YrSold"]

# preprocessor.py

In [1]:
"""
Created on Fri Jul 17 13:04:51 2020

@author: ashutosh.k
"""
import pandas as pd
import numpy as np

import config


def _fill_with_mode(_data, features):
  """Fill missing values of each listed column with that column's mode.

  Columns that contain no non-missing value have no mode and are left
  untouched instead of raising IndexError.
  """
  for var in features:
    modes = _data[var].mode()
    if modes.empty:
      continue
    # Assign back rather than fillna(inplace=True) on the selected column:
    # the in-place form may act on a temporary copy and leave _data unchanged.
    _data[var] = _data[var].fillna(modes[0])
  return _data


# Numerical Imputer
def numerical_imputer(_data, NUMERICAL_FEATURES):
  """Impute missing values in the numerical columns with the column mode.

  Args:
    _data: DataFrame to modify; it is updated in place.
    NUMERICAL_FEATURES: names of the columns to impute.

  Returns:
    The same DataFrame object, with the listed columns imputed.
  """
  return _fill_with_mode(_data, NUMERICAL_FEATURES)


# Categorical Imputer
def categorical_imputer(_data, CATEGORICAL_FEATURES):
  """Impute missing values in the categorical columns with the column mode.

  Args:
    _data: DataFrame to modify; it is updated in place.
    CATEGORICAL_FEATURES: names of the columns to impute.

  Returns:
    The same DataFrame object, with the listed columns imputed.
  """
  return _fill_with_mode(_data, CATEGORICAL_FEATURES)

# Rare label Categorical Encoder
def rare_label_cat_imputer(_data, FEATURES_TO_ENCODE):
  """Replace infrequent labels in the given columns with the label "Rare".

  A label is kept when its share of all rows (missing values included in
  the denominator) is at least 5%; every other value becomes "Rare".

  Args:
    _data: DataFrame to modify; it is updated in place.
    FEATURES_TO_ENCODE: names of the columns to process.

  Returns:
    The same DataFrame object, with rare labels replaced.
  """
  encoder_dict_ = {}
  tol = 0.05
  n_rows = float(len(_data))  # built-in float: the np.float alias no longer exists

  for var in FEATURES_TO_ENCODE:
    # Learn the frequent categories of each column.
    t = _data[var].value_counts() / n_rows
    encoder_dict_[var] = list(t[t >= tol].index)

  for var in FEATURES_TO_ENCODE:
    # Keep frequent labels as they are; everything else is grouped as "Rare".
    _data[var] = np.where(_data[var].isin(encoder_dict_[var]),
                          _data[var], "Rare")

  return _data

# Categorical Encoder
def categorical_encoder(_data, FEATURES_TO_ENCODE):
  """Encode each listed column as integers ranked by label frequency.

  Within a column the least frequent label maps to 0 and the most frequent
  to the highest integer. Values that are not among the counted labels
  (for example missing values) come out as NaN from the mapping.

  Args:
    _data: DataFrame to modify; it is updated in place.
    FEATURES_TO_ENCODE: names of the columns to encode.

  Returns:
    The same DataFrame object, with the listed columns encoded.
  """
  def _rank_labels(column):
    # Labels ordered from rarest to most common, numbered from zero.
    labels = _data[column].value_counts().sort_values(ascending=True).index
    return dict(zip(labels, range(len(labels))))

  # Learn every mapping first, then apply them in a second pass.
  mappings = {column: _rank_labels(column) for column in FEATURES_TO_ENCODE}

  for column in FEATURES_TO_ENCODE:
    _data[column] = _data[column].map(mappings[column])

  return _data


# Temporal Variables
def temporal_transform(_data, TEMPORAL_FEATURES, TEMPORAL_COMPARISON):
  """Replace each temporal column with its offset from the comparison column.

  Every column in TEMPORAL_FEATURES is overwritten with
  (column - comparison column), so a year earlier than the comparison year
  yields a negative number.
  NOTE(review): if "years elapsed" was intended the operands are reversed —
  confirm before changing.

  Args:
    _data: DataFrame to modify; it is updated in place.
    TEMPORAL_FEATURES: names of the columns to transform.
    TEMPORAL_COMPARISON: name of the column that is subtracted.

  Returns:
    The same DataFrame object, with the listed columns transformed.
  """
  for column in TEMPORAL_FEATURES:
    offset = _data[column].sub(_data[TEMPORAL_COMPARISON])
    _data[column] = offset
  return _data


# Log transformation
def log_transformation(_data, LOG_FEATURES):
  """Overwrite each listed column with its natural logarithm.

  Args:
    _data: DataFrame to modify; it is updated in place.
    LOG_FEATURES: names of the columns to transform.

  Returns:
    The same DataFrame object, with the listed columns log-transformed.
  """
  for column in LOG_FEATURES:
    logged = np.log(_data[column])
    _data[column] = logged
  return _data


def drop_features(_data, DROP_FEATURES):
  """Remove the listed columns from the DataFrame.

  Args:
    _data: DataFrame to modify; it is updated in place.
    DROP_FEATURES: names of the columns to remove.

  Returns:
    The same DataFrame object, without the listed columns.

  Raises:
    KeyError: if a listed column is not present in _data.
  """
  # columns= is required: a bare drop(labels) targets row labels, not columns.
  _data.drop(columns=DROP_FEATURES, inplace=True)
  return _data

SyntaxError: ignored

# data_management.py

In [None]:
"""
Created on Fri Jul 17 12:59:47 2020
@author: ashutosh.k
Data Management: Read Data and Save Data
"""

import numpy as np
import pandas as pd

import config

def load_data(file_name):
  """Read a CSV file from the configured data directory.

  Args:
    file_name: name of the CSV file, appended to config.DATAPATH.

  Returns:
    The file contents as a pandas DataFrame.
  """
  # The parameter is file_name; the earlier spelling "filed_name" was undefined.
  _data = pd.read_csv(config.DATAPATH + file_name)
  return _data

# MainCode.py

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 17 12:57:42 2020
@author: ashutosh.k
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso
from sklearn import metrics


# Import other files/modules
import config
from data_management import load_data
import preprocessor as pp

train = load_data(config.TRAIN_FILE)
test = load_data(config.TEST_FILE)


# Separate SalePrice into y
y = train[config.TARGET]
train.drop([config.TARGET], axis=1, inplace=True)

# Combine train and test data so both go through identical preprocessing.
# ignore_index gives the stacked frame a unique row index.
data = pd.concat([train, test], axis=0, ignore_index=True)

# Keep only the configured features; copy() so later in-place edits act on
# an independent frame.
data = data[config.KEEP].copy()


# Data preprocessing functions from preprocessor.py
data = pp.numerical_imputer(data, config.NUMERICAL_FEATURES)
data = pp.categorical_imputer(data, config.CATEGORICAL_FEATURES)
data = pp.rare_label_cat_imputer(data, config.FEATURES_TO_ENCODE)
data = pp.categorical_encoder(data, config.FEATURES_TO_ENCODE)
data = pp.temporal_transform(data, config.TEMPORAL_FEATURES,
                             config.TEMPORAL_COMPARISON)
data = pp.log_transformation(data, config.LOG_FEATURES)
data = pp.drop_features(data, config.DROP_FEATURES)

##############################################################################


## Split the combined frame back into its train and test rows

train_clean = data.iloc[:train.shape[0], :]
test_clean = data.iloc[train.shape[0]:, :]

# Hold out 15% of the training rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(train_clean, y,
                                                    random_state=42, test_size=0.15)
# The model is fitted on the log of the target
y_train = np.log(y_train)
y_test = np.log(y_test)

# Run Model
model = Lasso(alpha=0.005, random_state=0)
model.fit(X_train, y_train)
pred = model.predict(X_test)

# Model Evaluation (scikit-learn metrics take y_true first, then y_pred;
# the order matters for R2)
#MSE
print("MSE : ",metrics.mean_squared_error(y_test, pred))
#MAE
print("MAE : ",metrics.mean_absolute_error(y_test, pred))
#RMSE
print("RMSE : ",np.sqrt(metrics.mean_squared_error(y_test, pred)))
#R2
print("R-sq : ",metrics.r2_score(y_test, pred))


# Prediction on actual Test Data
# test_clean is the transformed original test data; X_test is the 15% split
# held out from the training data. np.exp undoes the log applied to the target.
pred_test = np.exp(model.predict(test_clean))

# Slice from the start so that exactly the first ten predictions are shown.
print("Top 10 predictions: ",pred_test[:10])