# House Price prediction

## Building the model

### Import Dataset

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Read the data
df = pd.read_csv('train.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
df.dropna(axis=0, subset=['SalePrice'], inplace=True)

X_full =df.copy()
y = X_full.pop('SalePrice')

### Data preprocessing

In [10]:
def clean_data(df):
    # Replace missing values with "Nan" in columns: 'PoolQC', 'FireplaceQu'
    df = df.fillna({'PoolQC': "Nan", 'FireplaceQu': "Nan"})
    # Drop columns: 'MiscVal', 'MiscFeature' and 2 other columns
    df = df.drop(columns=['MiscVal', 'MiscFeature', 'Alley', 'Fence'])
    # Change column type to object for columns: 'YrSold', 'BsmtFullBath' and 6 other columns
    df = df.astype({'YrSold': 'object', 'BsmtFullBath': 'object', 'BsmtHalfBath': 'object', 'FullBath': 'object', 'HalfBath': 'object', 'KitchenAbvGr': 'object', 'GarageCars': 'object', 'Fireplaces': 'object'})
    return df

X = clean_data(X_full)
X.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,Nan,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,Nan,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,Nan,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,Nan,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,Nan,12,2008,WD,Normal


### Feature Engineering

In [48]:
from sklearn.preprocessing import FunctionTransformer

def feature_engineering(df):
  df['HouseAge'] = df['YrSold'] - df['YearBuilt']
  df['RemodAge'] = df['YrSold'] - df['YearRemodAdd']
  df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
  

  return df

feature_eng = FunctionTransformer(feature_engineering)
X_fe = feature_eng.fit_transform(X)
X_fe.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,ScreenPorch,PoolArea,PoolQC,MoSold,YrSold,SaleType,SaleCondition,HouseAge,RemodAge,TotalSF
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,Nan,2,2008,WD,Normal,5,5,2566
2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,Nan,5,2007,WD,Normal,31,31,2524
3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,Nan,9,2008,WD,Normal,7,6,2706
4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,Nan,2,2006,WD,Abnorml,91,36,2473
5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,Nan,12,2008,WD,Normal,8,8,3343


### Define Pipelines

In [129]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, mutual_info_regression, RFE

# Base Model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100,
                              random_state=1
                              )

mi_selector = SelectKBest(score_func=mutual_info_regression, k=300)

# Transformers

num_cols = X_fe.select_dtypes(exclude='object').columns.tolist()
cat_cols = X_fe.select_dtypes(include='object').columns.tolist()

numerical_transformer = Pipeline(steps= [
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', MinMaxScaler())
    ])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

column_transformer = ColumnTransformer(transformers=[
    ('num', numerical_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
    ],sparse_threshold=0, remainder='passthrough')


#Preprosessor
preprosessor = Pipeline(steps=[
    ('feature_eng', feature_eng),
    ('transform', column_transformer,)
])

#pipeline
ModelPipeline = Pipeline(steps=[
    ('preprosessor', preprosessor),
    ('feateSelection', mi_selector),
    ('model', model)
])


In [106]:
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.2, random_state=1)

X_train_preprocessed = preprosessor.fit_transform(X_train)
X_valid_preprocessed = preprosessor.transform(X_valid)

### Train model

In [141]:
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.model_selection import cross_val_score

ModelPipeline.fit(X_train,y_train)
preds = ModelPipeline.predict(X_valid)
mae = mean_absolute_error(preds, y_valid)
r2 = r2_score(preds, y_valid)
print(f'mae: {mae}')
print(f'r2: {r2}')

mae: 16375.162876712331
r2: 0.8564633588132365


## Tune the model

### Finding optimal parameters

In [79]:

from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [20, 50, 100],        # Number of trees
    'max_depth': [None, 10, 20, 30],      # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],      # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],        # Minimum samples required in a leaf node
    'bootstrap': [True, False]            # Whether bootstrap samples are used
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train_preprocessed,y_train)

print(grid_search.best_params_)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_valid_preprocessed)

accuracy = mean_absolute_error(y_valid, y_pred)
print(f'mae: {accuracy}')

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
{'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
mae: 16856.645685450963


### Test differend models