# Housing Sale Prediction

In [226]:
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

## 1. Data Import and Basic Preprocessing

### 1.1 Data Loading and Check for NAN

In [227]:
train_df = pd.read_csv('../data/housing/train.csv', keep_default_na=False, index_col='Id')
train_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [228]:
train_df.dtypes.unique()

array([dtype('int64'), dtype('O')], dtype=object)

In [229]:
train_df.isna().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 80, dtype: int64

In [230]:
test_df = pd.read_csv('../data/housing/test.csv', keep_default_na=False, index_col='Id')
test_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1462,20,RL,81,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
1464,60,RL,78,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1465,120,RL,43,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal


In [231]:
len(test_df)

1459

In [232]:
test_df.isna().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
Length: 79, dtype: int64

In [233]:
test_df.dtypes

MSSubClass        int64
MSZoning         object
LotFrontage      object
LotArea           int64
Street           object
                  ...  
MiscVal           int64
MoSold            int64
YrSold            int64
SaleType         object
SaleCondition    object
Length: 79, dtype: object

### 1.3 Sense Checks of Data

First step is to make sure that numerical columns are all integers and dont contain NA values.

In [283]:
actual_numerical_cols = ['LotFrontage', 'LotArea', 'Fireplaces', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
                         '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
                         'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
                         'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']

In [288]:
incorrect_numerical_types = [cname for cname in actual_numerical_cols if train_df[cname].dtype == object or test_df[cname].dtype == object]
incorrect_numerical_types

NameError: name 'cname' is not defined

## 2. Initial Model Training

Intend to use a simple pipeline with numerical and categorical encoder to get initial MAE, then look to improve model results.

In [234]:
def y_and_train_test_split(df: pd.DataFrame, y_column: str, train_size: float):
    """_summary_

    Args:
        df (pd.DataFrame): _description_
        train_size (float): _description_
    """
    test_size = 1.0 - train_size
    X_final = df.drop(y_column, axis=1)
    y = df[y_column]
    return train_test_split(X_final, y,
                            train_size=train_size, test_size=test_size,
                            random_state=0)

### 2.1 Create test_train split

In [235]:
y_column = 'SalePrice'
X_train, X_valid, y_train, y_valid = y_and_train_test_split(train_df, y_column, 0.8)

### 2.2 Generating Categorical and Numerical col_names

These will be passed to the Pipeline encoders.

In [236]:
categorical_cols = [cname for cname in X_train.columns
                    if X_train[cname].nunique() < 10 and
                    X_train[cname].dtype == "object" and
                    test_df[cname].nunique() < 10 and
                    test_df[cname].dtype == "object"]

In [237]:
numerical_cols = [cname for cname in X_train.columns
                  if X_train[cname].dtype in ['int64', 'float64'] and
                  test_df[cname].dtype in ['int64', 'float64']]

In [238]:
full_cols = categorical_cols + numerical_cols
X_train = X_train[full_cols].copy()
X_valid = X_valid[full_cols].copy()
X_test = test_df[full_cols].copy()

### 2.3 Preprocess and Train Model

Define the numerical and categorical encoders.

In [239]:
numerical_transformer = SimpleImputer(strategy='constant')


In [240]:
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

In [241]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

In [242]:
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(
            n_estimators=100,
            random_state=0))
    ]
)

In [243]:
clf.fit(X_train, y_train)

In [244]:
preds = clf.predict(X_valid)

In [245]:
mea = mean_absolute_error(y_valid, preds)
mea

18033.45421232877

In [246]:
rmse = root_mean_squared_error(y_valid, preds)
rmse

33924.902596776206

### 2.4 Generate Initial Predictions

In [247]:
def output_preds(X_test: pd.DataFrame, preds_test: pd.DataFrame, version: str):
    output = pd.DataFrame({
        'Id': X_test.index,
        'SalePrice': preds_test
    })
    output.to_csv(f"submission_{version}.csv", index=False)

In [248]:
preds_test = clf.predict(X_test)

In [249]:
output_preds(X_test, preds_test, 'v1')

## 3. Hyperparameter Tuning

In [250]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

Will start using GridSearchCV to determine the best hyperparameters.

In [251]:
param_grid = {
    'model__n_estimators': [50, 60, 70, 80, 90, 100, 125]
}

In [252]:
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(random_state=0))
    ]
)

In [253]:
grid_search = GridSearchCV(clf,
                           param_grid=param_grid,
                           n_jobs=2)

In [254]:
#grid_search.fit(X_train, y_train)
#grid_search.best_params_

In [255]:

clf_optimized = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(
            random_state=0,
            n_estimators=100
        ))
    ]
)

In [256]:
clf_optimized.fit(X_train, y_train)

In [257]:
preds_2 = clf_optimized.predict(X_valid)

In [258]:
mea_2 = mean_absolute_error(y_valid, preds_2)
mea_2

18033.45421232877

In [259]:
rmse_2 = root_mean_squared_error(y_valid, preds_2)
rmse_2

33924.902596776206

## 3. Improved Data Preprocessing

In [260]:
def create_categorical_cols(train: pd.DataFrame,
                            test: pd.DataFrame,
                            cols_to_ignore: list[str]) -> list[str]:
    """_summary_

    Args:
        df (pd.Dataframe): _description_
        cols_to_ignore (list[str]): _description_

    Returns:
        list[str]: _description_
    """
    return [cname for cname in train.columns
            if cname not in cols_to_ignore and
            train[cname].nunique() < 10 and
            train[cname].dtype == "object" and
            test[cname].nunique() < 10 and
            test[cname].dtype == "object"]

def create_numerical_cols(train: pd.DataFrame,
                          test: pd.DataFrame,
                          cols_to_ignore: list[str]) -> list[str]:
    """_summary_

    Args:
        train (pd.DataFrame): _description_
        test (pd.DataFrame): _description_
    
    Returns:
        list[str]: _description_
    """
    return [cname for cname in train.columns
            if cname not in cols_to_ignore and
            train[cname].dtype in ['int64', 'float64'] and
            test[cname].dtype in ['int64', 'float64']]

### 3.1 Ordinal Encoding

There appears to be quite alot of ordinal data which isn't being encoded properly. I will not add a preprocessor stage for these columns in an attempt to improve model accuracy. 

There are some columns which have Ordinal Rankings which are dependant on other factors, such as BsmtQual, which ranks Po to Ex if there is a basement, or NA if a basement doesn't exist. I intend to include these in ordinal_columns for this regression and will review the results.

I am also going to have to 

In [261]:
y_column = 'SalePrice'
X_train, X_valid, y_train, y_valid = y_and_train_test_split(train_df, y_column, 0.8)

In [262]:
ordinal_cols = ['LandSlope', 'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond']
#potential_ordinal_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure',
         #       'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
         #       'PoolQC', 'Fence']

In [263]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_transformer = Pipeline(
    steps=[
        ('ordinal', OrdinalEncoder())
    ]
)

### 3.2 Recreate categorical_cols and numerical_cols

In [264]:
categorical_cols = create_categorical_cols(X_train, test_df, ordinal_cols)

In [265]:
numerical_cols = create_numerical_cols(X_train, test_df, ordinal_cols)

In [266]:
full_cols = ordinal_cols + categorical_cols + numerical_cols
X_train = X_train[full_cols].copy()
X_valid = X_valid[full_cols].copy()
X_test = test_df[full_cols].copy()

## 4. Updated Model Training

### 4.1 New Preprocessor and Model Definition

In [267]:
preprocessor_updated = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        ('ord', ordinal_transformer, ordinal_cols)
    ]
)

In [268]:
clf_updated = Pipeline(
    steps=[
        ('preprocessor', preprocessor_updated),
        ('model', RandomForestRegressor(
            n_estimators=100,
            random_state=0))
    ]
)

### 4.2 Training and Evaluation

In [269]:
clf_updated.fit(X_train, y_train)

In [270]:
pred_updated = clf_updated.predict(X_valid)

In [271]:
mea_updated = mean_absolute_error(y_valid, pred_updated)
mea_updated

18003.78462328767

In [272]:
rmse_updated = root_mean_squared_error(y_valid, pred_updated)
rmse_updated

33744.95664947618

### 4.3 Model Comparison

In [273]:
model_comparison = pd.DataFrame([['v1', mea, rmse], ['v2', mea_updated, rmse_updated]], columns=['model_version', 'mea', 'rmse'])
model_comparison

Unnamed: 0,model_version,mea,rmse
0,v1,18033.454212,33924.902597
1,v2,18003.784623,33744.956649


### 4.4 Model Evaluation

In [275]:
missing_cols = [cname for cname in test_df.columns if cname not in full_cols]
missing_cols

['LotFrontage',
 'Neighborhood',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'SaleType']

## Conclusion



### Evaluation Points

* 1. Binary Encoding: accuracy of model may improve by adding binary encoding of certain values, containsAir, containsPool etc.