<a href="https://colab.research.google.com/github/MingzheHu-Duke/Note_to_product_HousePricePrediction/blob/main/02_housing_price_prediction_cleaned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
 import pandas as pd
 import numpy as np
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.linear_model import Lasso
 from sklearn import metrics

**Config Variables**

In [None]:
# --- Data location ---------------------------------------------------------
# Directory holding the CSVs; it is concatenated directly with the file names,
# so the trailing slash is required.
DATAPATH = "data/HousingPrediction/"
# Backward-compatible alias for the original (misspelled) constant name.
DATAPTH = DATAPATH
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"

# --- Target and feature selection -------------------------------------------
TARGET = 'SalePrice'

# Final set of columns kept in the combined train/test frame.
KEEP = [
    'MSSubClass', 'MSZoning', 'Neighborhood',
    'OverallQual', 'OverallCond', 'YearRemodAdd',
    'RoofStyle', 'MasVnrType', 'BsmtQual', 'BsmtExposure',
    'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea',
    'BsmtFullBath', 'KitchenQual', 'Fireplaces', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageCars', 'PavedDrive',
    'LotFrontage', 'YrSold',
]

# --- Imputation --------------------------------------------------------------
# Columns handled by the "numerical imputer" cell.
NUMERICAL_FEATURES = ['LotFrontage']
# Columns handled by the "categorical imputer" cell.
CATEGORICAL_FEATURES = [
    'MasVnrType', 'BsmtQual', 'BsmtExposure', 'FireplaceQu',
    'GarageCars', 'GarageType', 'GarageFinish', 'MSZoning', 'BsmtFullBath',
    'KitchenQual',
]

# --- Encoding ----------------------------------------------------------------
# Columns that go through rare-label grouping and then integer encoding.
FEATURES_TO_ENCODE = [
    'MSZoning', 'Neighborhood', 'RoofStyle', 'MasVnrType', 'BsmtQual',
    'BsmtExposure', 'HeatingQC', 'CentralAir', 'KitchenQual', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'PavedDrive',
]

# --- Temporal features -------------------------------------------------------
# Year columns that are replaced by their difference from TEMPORAL_COMPARISON.
TEMPORAL_FEATURES = ['YearRemodAdd']
TEMPORAL_COMPARISON = 'YrSold'

# --- Transformations ---------------------------------------------------------
# Columns that are replaced by their natural logarithm.
LOG_FEATURES = ['LotFrontage', '1stFlrSF', 'GrLivArea']

# Columns removed once every derived feature has been computed.
DROP_FEATURES = ['YrSold']

**Read Data**

In [None]:
# Load the raw train and test CSVs from the configured data directory.
train = pd.read_csv(DATAPATH+TRAIN_FILE)
test = pd.read_csv(DATAPATH+TEST_FILE)

# Split the target off the training frame: pop() returns the column and
# removes it from `train` in place.
y = train.pop(TARGET)

# Stack train on top of test (original row labels are kept, so the index
# is not unique) and keep an independent copy of only the selected features.
data = pd.concat([train, test], axis=0)[KEEP].copy()

**Numerical Imputer**

In [None]:
# Fill missing values in each numerical feature with that column's most
# frequent value (the first mode when several values tie).
for var in NUMERICAL_FEATURES:
    # Assign the filled column back explicitly: calling fillna(inplace=True)
    # on the `data[var]` selection is a chained in-place update, which newer
    # pandas warns about and which does not modify `data` under Copy-on-Write.
    data[var] = data[var].fillna(data[var].mode()[0])

**Categorical Imputer**

In [None]:
 # Fill missing values in each categorical feature with that column's most
 # frequent value (the first mode when several values tie).
 for var in CATEGORICAL_FEATURES:
     # Assign the filled column back explicitly: calling fillna(inplace=True)
     # on the `data[var]` selection is a chained in-place update, which newer
     # pandas warns about and which does not modify `data` under Copy-on-Write.
     data[var] = data[var].fillna(data[var].mode()[0])

**Rare Label Categorical Encoder** 

In [None]:
# Rare-label encoding: for every feature in FEATURES_TO_ENCODE, labels whose
# share of rows is below `tol` are collapsed into the single label 'Rare'.
encoder_dict_ = {}  # feature name -> list of labels considered frequent
tol = 0.05          # minimum share of rows a label needs to be kept as-is

for var in FEATURES_TO_ENCODE:
    # Share of all rows taken by each label. Plain division already yields
    # floats; the former `np.float(...)` wrapper was removed from NumPy and
    # raises AttributeError on current releases. The denominator stays
    # len(data), so rows with a missing value still count towards the total.
    label_share = data[var].value_counts() / len(data)
    # Frequent labels:
    encoder_dict_[var] = list(label_share[label_share >= tol].index)

for var in FEATURES_TO_ENCODE:
    # Keep frequent labels; anything else (including missing values, which
    # never match isin) becomes 'Rare'.
    data[var] = np.where(data[var].isin(encoder_dict_[var]), data[var], 'Rare')

**Categorical Encoder**

In [None]:
# Build an integer code per label for every feature to encode. Labels are
# ranked by frequency, least frequent first, so the rarest label gets 0.
# (Ranking by the target would be more informative; frequency is used here
# as a shortcut.)
encoder_dict_ = {}
for var in FEATURES_TO_ENCODE:
    labels_by_freq = data[var].value_counts().sort_values(ascending=True).index
    encoder_dict_[var] = dict(zip(labels_by_freq, range(len(labels_by_freq))))

In [None]:
# Replace every label with its integer code from the encoder dictionary.
for var in FEATURES_TO_ENCODE:
    label_to_code = encoder_dict_[var]
    encoded_column = data[var].map(label_to_code)
    data[var] = encoded_column

**Temporal Variables**

In [None]:
# Turn each year column into an offset from the comparison year. The result
# is `feature - comparison`, so a year earlier than the comparison year
# yields a negative value.
for var in TEMPORAL_FEATURES:
    feature_year = data[var]
    comparison_year = data[TEMPORAL_COMPARISON]
    data[var] = feature_year - comparison_year

**Log Transformation of Numerical Features**


In [None]:
# Replace each listed feature with its natural logarithm.
for var in LOG_FEATURES:
    raw_values = data[var]
    data[var] = np.log(raw_values)

**Drop Features**

In [None]:
# Remove, in place, the helper columns that are no longer needed.
data.drop(columns=DROP_FEATURES, inplace=True)