<a href="https://colab.research.google.com/github/MingzheHu-Duke/Note_to_product_HousePricePrediction/blob/main/01_HousingPricePreiction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Housing price prediction

In [None]:
# Relative directory that holds the train.csv and test.csv files read below.
data_folder = "data/HousingPrediction/"

In [None]:
#Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso
from sklearn import metrics

**Read Data**

In [None]:
# Load the labelled training set and the unlabelled test set from data_folder.
train_path = f"{data_folder}train.csv"
test_path = f"{data_folder}test.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

**SalePrice is the target column**

In [None]:
# Separate the target (SalePrice) into y and remove it from the feature frame.
# Guarded so that re-running this cell after the column has already been removed
# is a no-op instead of raising KeyError.
if "SalePrice" in train.columns:
    y = train["SalePrice"].copy()
    train = train.drop(columns=["SalePrice"])

In [None]:
# Stack train on top of test (row-wise) so that every preprocessing step
# below is applied to both sets at once.
data = pd.concat((train, test), axis="index")

In [None]:
# Columns retained for modelling; every other column is discarded.
keep = [
    'MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual',
    'OverallCond', 'YearRemodAdd', 'RoofStyle', 'MasVnrType',
    'BsmtQual', 'BsmtExposure', 'HeatingQC', 'CentralAir',
    '1stFlrSF', 'GrLivArea', 'BsmtFullBath', 'KitchenQual',
    'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageCars', 'PavedDrive', 'LotFrontage', 'YrSold',
]

In [None]:
# Restrict the combined frame to the selected features; copy so later
# column assignments operate on an independent frame.
data = data.loc[:, keep].copy()

**Numerical Imputer**

In [None]:
# Impute missing values in the numerical column(s) with the column's most frequent value (mode).
num_features = ["LotFrontage"]

for var in num_features:
    # Assign the result back instead of calling fillna(inplace=True) on data[var]:
    # the chained in-place form acts on a temporary Series, which pandas warns about
    # and which leaves `data` unchanged when Copy-on-Write is enabled.
    data[var] = data[var].fillna(data[var].mode()[0])

**Categorical Imputer**

In [None]:
# Columns whose missing values are filled with the most frequent value.
cat_features = [
    'MasVnrType', 'BsmtQual', 'BsmtExposure', 'FireplaceQu', 'GarageCars',
    'GarageType', 'GarageFinish', 'MSZoning', 'BsmtFullBath', 'KitchenQual',
]

In [None]:
# Count the missing entries per categorical column before imputation.
data[cat_features].isna().sum()

MasVnrType        24
BsmtQual          81
BsmtExposure      82
FireplaceQu     1420
GarageCars         1
GarageType       157
GarageFinish     159
MSZoning           4
BsmtFullBath       2
KitchenQual        1
dtype: int64

In [None]:
# Impute each column in cat_features with its most frequent value (mode).
for var in cat_features:
    # Assign the result back instead of calling fillna(inplace=True) on data[var]:
    # the chained in-place form acts on a temporary Series, which pandas warns about
    # and which leaves `data` unchanged when Copy-on-Write is enabled.
    data[var] = data[var].fillna(data[var].mode()[0])

In [None]:
# Re-count missing entries to confirm the imputation left none behind.
data[cat_features].isna().sum()

MasVnrType      0
BsmtQual        0
BsmtExposure    0
FireplaceQu     0
GarageCars      0
GarageType      0
GarageFinish    0
MSZoning        0
BsmtFullBath    0
KitchenQual     0
dtype: int64

In [None]:
#Encoding for rare values : Abover a certain threshold percentage
features_to_encode = ['MSZoning', 'Neighborhood', 'RoofStyle', 'MasVnrType','BsmtQual', 
                      'BsmtExposure', 'HeatingQC', 'CentralAir','KitchenQual', 'FireplaceQu', 
                      'GarageType', 'GarageFinish','PavedDrive']

In [None]:
# Rare-label grouping: for every feature in features_to_encode, keep the
# categories whose share of rows is at least `tol` and collapse all other
# categories into the single label "Rare".
encoder_dict_ = {}
tol = 0.05

for var in features_to_encode:
    # Share of all rows held by each category. Plain true division replaces the
    # former np.float(...) cast, an alias that no longer exists in current NumPy.
    t = data[var].value_counts() / len(data)
    # Frequent labels: those at or above the tolerance
    encoder_dict_[var] = list(t[t >= tol].index)

for var in features_to_encode:
    # Look up the dict that was actually populated above (encoder_dict_, with the
    # trailing underscore); the previous name `encoder_dict` was never defined.
    data[var] = np.where(data[var].isin(encoder_dict_[var]), data[var], "Rare")

**Categorical Encoder**

In [None]:
# Categorical columns that are converted to integer codes below.
features_to_encode = [
    'MSZoning', 'Neighborhood', 'RoofStyle', 'MasVnrType', 'BsmtQual',
    'BsmtExposure', 'HeatingQC', 'CentralAir', 'KitchenQual', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'PavedDrive',
]

In [None]:
# Inspect the category counts of MSZoning at this point in the pipeline.
data["MSZoning"].value_counts()

RL      2269
RM       460
Rare     190
Name: MSZoning, dtype: int64

In [None]:
# Build, per feature, a label -> integer code mapping where codes follow the
# ascending order of category frequency (least frequent label gets 0).
encoder_dict_ = {}
for var in features_to_encode:
    labels_by_frequency = data[var].value_counts().sort_values(ascending=True).index
    encoder_dict_[var] = dict(zip(labels_by_frequency, range(len(labels_by_frequency))))

In [None]:
# Display the label -> integer code mapping built for each feature.
encoder_dict_

{'BsmtExposure': {'Av': 2, 'Gd': 1, 'Mn': 0, 'No': 3},
 'BsmtQual': {'Ex': 1, 'Gd': 2, 'Rare': 0, 'TA': 3},
 'CentralAir': {'N': 0, 'Y': 1},
 'FireplaceQu': {'Gd': 2, 'Rare': 0, 'TA': 1},
 'GarageFinish': {'Fin': 0, 'RFn': 1, 'Unf': 2},
 'GarageType': {'Attchd': 3, 'BuiltIn': 1, 'Detchd': 2, 'Rare': 0},
 'HeatingQC': {'Ex': 3, 'Gd': 1, 'Rare': 0, 'TA': 2},
 'KitchenQual': {'Ex': 1, 'Gd': 2, 'Rare': 0, 'TA': 3},
 'MSZoning': {'RL': 2, 'RM': 1, 'Rare': 0},
 'MasVnrType': {'BrkFace': 2, 'None': 3, 'Rare': 0, 'Stone': 1},
 'Neighborhood': {'CollgCr': 6,
  'Edwards': 4,
  'Gilbert': 1,
  'NAmes': 7,
  'NridgHt': 2,
  'OldTown': 5,
  'Rare': 8,
  'Sawyer': 0,
  'Somerst': 3},
 'PavedDrive': {'N': 1, 'Rare': 0, 'Y': 2},
 'RoofStyle': {'Gable': 2, 'Hip': 1, 'Rare': 0}}

In [None]:
# Mapping using the encoder dictionary
for var in features_to_encode:
  data[var] = data[var].map(encoder_dict_[var])

In [None]:
# List the columns currently present in the combined frame.
data.columns

Index(['MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual', 'OverallCond',
       'YearRemodAdd', 'RoofStyle', 'MasVnrType', 'BsmtQual', 'BsmtExposure',
       'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath',
       'KitchenQual', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageCars', 'PavedDrive', 'LotFrontage', 'YrSold'],
      dtype='object')

**Temporal Variables**

In [None]:
# Express each temporal feature relative to the sale year
# (feature value minus the comparison column).
temporal_features = ['YearRemodAdd']
comparison = 'YrSold'

for temporal_col in temporal_features:
    data[temporal_col] = data[temporal_col] - data[comparison]

**Drop Features**

In [None]:
# Remove columns that are no longer needed once the temporal feature is derived.
drop_features = ["YrSold"]
# Rebind instead of dropping in place, and ignore already-missing columns, so
# re-running this cell does not raise KeyError.
data = data.drop(columns=drop_features, errors="ignore")

**Log Transformations of Numerical Variable**

In [None]:
# Natural-log transform of the listed numerical columns.
log_features = ["LotFrontage", "1stFlrSF", "GrLivArea"]
for column in log_features:
    logged_values = np.log(data[column])
    data[column] = logged_values

**Split back to train and test**

In [None]:
# train was concatenated first, so the leading len(train) rows of `data`
# are the processed training rows.
n_train_rows = train.shape[0]
train_clean = data.iloc[:n_train_rows]
train_clean.shape

(1460, 23)

In [None]:
# Everything after the first len(train) rows of `data` is the processed test set.
n_train_rows = train.shape[0]
test_clean = data.iloc[n_train_rows:]
test_clean.shape

(1459, 23)

**Split Train Data**

In [None]:
# Hold out 15% of the labelled rows for evaluation; the fixed seed keeps the
# shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_clean,
    y,
    test_size=0.15,
    shuffle=True,
    random_state=2012,
)

In [None]:
# Put both target vectors on a natural-log scale.
y_train, y_test = np.log(y_train), np.log(y_test)

**Run Model Lasso**

In [None]:
# Lasso regression with regularisation strength alpha=0.005 and a fixed random_state.
# NOTE(review): no feature scaling is applied in the visible cells (MinMaxScaler is
# imported but not used) — confirm this is intended.
model = Lasso(alpha=0.005, random_state=0)

In [None]:
# Fit on the training portion of the split; y_train has been log-transformed above.
model.fit(X_train, y_train)

Lasso(alpha=0.005, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=0,
      selection='cyclic', tol=0.0001, warm_start=False)

In [None]:
# Predictions for the hold-out split (same log scale as y_train / y_test).
pred = model.predict(X_test)

**Evaluation**

In [None]:
# Evaluate on the hold-out split. y_test and pred are both on the log scale.
# scikit-learn metrics take (y_true, y_pred) in that order. MSE and MAE are
# symmetric, but R^2 is not: passing the predictions first reports a wrong score.
mse = metrics.mean_squared_error(y_test, pred)
#MSE
print("MSE : ", mse)
#MAE
print("MAE : ", metrics.mean_absolute_error(y_test, pred))
#RMSE
print("RMSE : ", np.sqrt(mse))
#R2
print("R-sq : ", metrics.r2_score(y_test, pred))

MSE :  0.024092368764334775
MAE :  0.11439919885435745
RMSE :  0.15521716646149283
R-sq :  0.7933542052441106


**Prediction on the actual Test Data**

In [None]:
# test_clean is the transformed original test data; X_test is the 15% hold-out
# split taken from the training data (note the similar names).
# y_train was log-transformed before fitting, so np.exp maps the predictions back.
pred_test = np.exp(model.predict(test_clean))