# HOUSE PRICE PREDICTION

## Description
- Objective: to develop a machine learning model to predict house prices.
- Data: use a housing prices dataset from Kaggle.

### 1. Import the required modules

In [1]:
import pandas as pd
import numpy as  np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

### 2. Import raw data

In [2]:
# Load the raw Kaggle housing data from the working directory and preview
# the first rows to confirm the columns were parsed as expected.
housing_data = pd.read_csv(filepath_or_buffer='Housing.csv')
housing_data.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


### 3. Defining the variables

In [3]:
# Separate the prediction target (price) from the explanatory features.
y = housing_data.loc[:, 'price']
X = housing_data.drop(columns=['price'])

### 4. Data preprocessing

#### 4.1 Checking for missing values

In [4]:
# starting with features that have a small range of values
# (the first column is skipped here and checked separately in the next cell)
low_cardinality_features = X.columns[1:]
for feature in low_cardinality_features:
    distinct_values = sorted(X[feature].unique())
    print(feature, distinct_values)

bedrooms [1, 2, 3, 4, 5, 6]
bathrooms [1, 2, 3, 4]
stories [1, 2, 3, 4]
mainroad ['no', 'yes']
guestroom ['no', 'yes']
basement ['no', 'yes']
hotwaterheating ['no', 'yes']
airconditioning ['no', 'yes']
parking [0, 1, 2, 3]
prefarea ['no', 'yes']
furnishingstatus ['furnished', 'semi-furnished', 'unfurnished']


In [5]:
# then the rest of the features with a wide range of values
# With pandas' default dtypes a missing entry shows up as NaN and turns the
# column into a float column, so two vectorised checks per column are enough:
# no NaN present, and the column is still integer-typed. This avoids the
# element-by-element loop and does not rely on the index labels being 0..n-1.
for name, values in (('area', X['area']), ('price', y)):
    assert values.notna().all(), f'{name} has missing values'
    assert pd.api.types.is_integer_dtype(values), f'{name} is not integer-typed'

- Conclusion: No Missing Values

#### 4.2 Feature transformation
- Encode/binarize nominal variables.

In [6]:
# Columns held as object dtype (the yes/no and furnishing-status text columns)
# need a numeric encoding before they can be fed to the models.
variables_to_transform = X.columns[X.dtypes == 'object'].tolist()
variables_to_transform

['mainroad',
 'guestroom',
 'basement',
 'hotwaterheating',
 'airconditioning',
 'prefarea',
 'furnishingstatus']

In [7]:
# Fit one LabelEncoder per categorical column and keep it, so every
# category -> integer mapping stays available afterwards (for example via
# label_encoders[col].classes_ or .inverse_transform). A single shared encoder
# that is refit column by column only remembers the classes of the last
# column it saw. The encoded values themselves are the same as before.
label_encoders = {col: LabelEncoder().fit(X[col]) for col in variables_to_transform}
X = X.assign(**{col: enc.transform(X[col]) for col, enc in label_encoders.items()})
X.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,7420,4,2,3,1,0,0,0,1,2,1,0
1,8960,4,4,4,1,0,0,0,1,3,0,0
2,9960,3,2,2,1,0,1,0,0,2,1,1
3,7500,4,2,2,1,0,1,0,1,3,1,0
4,7420,4,1,2,1,1,1,0,1,2,0,0


### 5. Model fitting and testing
#### 5.1 Split the data into training and test sets

In [8]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
split = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split

#### 5.2 Fitting three models for comparison

In [9]:
# Linear regression model
# Fit on the training split and report R^2 on the held-out test split.
# score() runs the prediction itself, so no separate predict() call is needed.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

0.7235015223200351

In [10]:
# Gradient boosting model using default parameters
# Fit on the training split and report R^2 on the held-out test split.
# score() runs the prediction itself, so no separate predict() call is needed.
gbr = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)
gbr.score(X_test, y_test)

0.565107788496124

In [11]:
# Random forest model using default parameters
# Fit on the training split and report R^2 on the held-out test split.
# score() runs the prediction itself, so no separate predict() call is needed.
rfr = RandomForestRegressor(random_state=0).fit(X_train, y_train)
rfr.score(X_test, y_test)

0.63287304762683

In [12]:
# Inspect the hyperparameters the random forest is currently using.
rfr.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

Notes
- The linear regression model performs the best when tested against test data.
- However, we will first try to improve the other two models by using cross-validated grid search to select better hyperparameters.
- Then we'll select the best performing model

### 6. Model Selection
- Using CV
- Accuracy metric: R² (`scoring='r2'`)

In [13]:
# Inspect the gradient boosting model's current hyperparameters before
# choosing which ones to search over.
gbr.get_params(deep=True)

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 0,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [14]:
# Exhaustive grid search over three gradient boosting hyperparameters,
# ranking candidates by cross-validated R^2 on the training split.
param_grid1 = {
    'max_depth': [1, 2, 3, 4, 5],
    'min_samples_split': [18, 19, 20, 21, 22],
    'n_estimators': [68, 69, 70, 71, 72],
}
gscv1 = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid1,
    scoring='r2')
gscv1.fit(X_train, y_train)
# best_score_ is the mean cross-validated score of the best candidate, not a
# score on the full training set, so it is labelled accordingly.
print('Mean CV score:', gscv1.best_score_)
# score() evaluates the refitted best estimator on the held-out test split.
print('Test score:', gscv1.score(X_test, y_test))
print('Params:', gscv1.best_params_)

Train score: 0.6147410687326449
Test score: 0.6155678654501233
Params: {'max_depth': 3, 'min_samples_split': 19, 'n_estimators': 70}


In [15]:
# Show the random forest's hyperparameters again before choosing which ones
# to search over.
rfr.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [16]:
# Exhaustive grid search over three random forest hyperparameters,
# ranking candidates by cross-validated R^2 on the training split.
param_grid2 = {
    'max_features': [2, 3, 4, 5, 6],
    'min_samples_split': [2, 3, 4, 5, 6],
    'n_estimators': [80, 81, 82, 83, 84],
}
gscv2 = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid2,
    scoring='r2')
gscv2.fit(X_train, y_train)
# best_score_ is the mean cross-validated score of the best candidate, not a
# score on the full training set, so it is labelled accordingly.
print('Mean CV score:', gscv2.best_score_)
# score() evaluates the refitted best estimator on the held-out test split.
print('Test score:', gscv2.score(X_test, y_test))
print('Params:', gscv2.best_params_)

Train Score : 0.6061776648298477
Test score: 0.6895092729343817
Params: {'max_features': 4, 'min_samples_split': 4, 'n_estimators': 82}


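Conclusion: Use the linear regression model, as it has the highest test-set R² (about 0.72, versus 0.69 for the tuned random forest and 0.62 for the tuned gradient boosting model).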