In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

In [3]:
# Load the housing dataset from the local CSV file and preview the first rows
housing_multi = pd.read_csv('data/Housing.csv')
housing_multi.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
# Preview the last rows to check how the file ends
housing_multi.tail()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished
544,1750000,3850,3,1,2,yes,no,no,no,no,0,no,unfurnished


In [5]:
# Summarise column dtypes and non-null counts to spot text columns and missing values
housing_multi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


### Data preparation
* Convert the yes/no columns into binary values (1/0)
* One-hot encode the `furnishingstatus` column into dummy variables

In [6]:
# Series.map replaces each value using the dictionary below; values that are
# not keys of the dictionary become NaN.
binary_columns = ['mainroad', 'guestroom', 'basement',
                  'hotwaterheating', 'airconditioning', 'prefarea']

# 1 and 0 map to themselves so that re-running this cell keeps the already
# converted values instead of turning the whole column into NaN.
yes_no_to_int = {'yes': 1, 'no': 0, 1: 1, 0: 0}

for column in binary_columns:
    housing_multi[column] = housing_multi[column].map(yes_no_to_int)

In [7]:
# One-hot encode furnishingstatus to inspect the full set of dummy columns;
# `status` is rebuilt with drop_first=True in the next cell
status = pd.get_dummies(housing_multi['furnishingstatus'])
status.head()


Unnamed: 0,furnished,semi-furnished,unfurnished
0,1,0,0
1,1,0,0
2,0,1,0
3,1,0,0
4,1,0,0


In [8]:
# Three categories only need two dummy columns: drop_first=True drops the first
# category ('furnished'), so a row with semi-furnished == 0 and unfurnished == 0
# is a furnished house.
# dtype=int keeps the dummies numeric: pandas >= 2.0 returns booleans by
# default, and the min-max rescaling further down cannot subtract booleans.

status = pd.get_dummies(housing_multi['furnishingstatus'], drop_first=True, dtype=int)
status.head()


Unnamed: 0,semi-furnished,unfurnished
0,0,0
1,0,0
2,1,0
3,0,0
4,0,0


In [9]:
# Append the dummy columns from `status` to the main frame, side by side (axis=1)
housing_multi = pd.concat([housing_multi,status],axis=1)

In [10]:
# Check that the dummy columns were appended after furnishingstatus
housing_multi.head()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished,0,0


In [11]:
# furnishingstatus is now represented by the concatenated dummy columns,
# so the original text column can be removed
housing_multi = housing_multi.drop(columns=['furnishingstatus'])

In [12]:
# Confirm that furnishingstatus is gone and only numeric columns remain
housing_multi.head()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0


#### Adding extra variable columns that can be useful

In [13]:
# Derived ratio features: area available per bedroom and bathrooms per bedroom
bedroom_count = housing_multi['bedrooms']
housing_multi = housing_multi.assign(
    areaPerBedroom=housing_multi['area'] / bedroom_count,
    bbratio=housing_multi['bathrooms'] / bedroom_count,
)
housing_multi.head()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished,areaPerBedroom,bbratio
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0,1855.0,0.5
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0,2240.0,1.0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0,3320.0,0.666667
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0,1875.0,0.5
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0,1855.0,0.25


### Rescaling the Features
Without rescaling, the variables are on very different scales, so the fitted coefficients can be very large numbers that are hard to compare with each other.

for Example:

$m_1$$x_1$ + $m_2$$x_2$ + $m_3$$x_3$ + ...

If $m_1$ is a very large number, its term dominates the linear equation and the other terms are overshadowed.

$\text{normalize}(x) = \dfrac{x - \min}{\max - \min}$

$x\in[min,max]$

In [18]:
def normalize(x):
    """Min-max scale ``x`` to the range [0, 1].

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        Numeric values to rescale (one column when used with
        ``DataFrame.apply``).

    Returns
    -------
    Same container type as ``x``
        ``(x - min) / (max - min)``. A constant input (``max == min``) is
        returned as all zeros instead of the NaN produced by 0/0.
    """
    # .min()/.max() are vectorized and, for pandas objects, skip NaN; the
    # builtin min()/max() iterate in Python and give order-dependent results
    # as soon as a NaN is present.
    minVal = x.min()
    valueRange = x.max() - minVal
    if valueRange == 0:
        # Constant column: every value sits at the minimum, so scale to 0.0
        return (x - minVal).astype(float)
    return (x - minVal) / valueRange

1750000
1750000
1650
1650
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
381.0
381.0
0.16666666666666666
0.16666666666666666


price              None
area               None
bedrooms           None
bathrooms          None
stories            None
mainroad           None
guestroom          None
basement           None
hotwaterheating    None
airconditioning    None
parking            None
prefarea           None
semi-furnished     None
unfurnished        None
areaPerBedroom     None
bbratio            None
dtype: object

In [19]:
# Rescale every column (including the target, price) by applying normalize column by column.
# NOTE(review): this runs before the train/test split below, so the min/max
# used for scaling also come from rows that end up in the test set —
# consider computing them on the training rows only.
housing_multi = housing_multi.apply(normalize)

In [27]:
# Every column except the target becomes a predictor
all_list = np.array(housing_multi.columns)
x_list = all_list[all_list!='price'] # boolean mask keeps all names except 'price'

X = housing_multi[x_list]  # predictor matrix
y = housing_multi['price']  # target


In [28]:
from sklearn.model_selection import train_test_split

# 70% of the rows go to training; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=100,
)

### Building a linear Model

In [31]:
# Add a constant column so the fitted model has an intercept term
# (it appears as `const` in the regression summary)
X_train = sm.add_constant(X_train)

# First model: ordinary least squares on all predictors
lm_1 = sm.OLS(endog=y_train, exog=X_train).fit()

In [32]:
# Print the regression summary table for the first model
print(lm_1.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.686
Model:                            OLS   Adj. R-squared:                  0.673
Method:                 Least Squares   F-statistic:                     53.12
Date:                Sun, 07 Feb 2021   Prob (F-statistic):           4.56e-82
Time:                        13:28:47   Log-Likelihood:                 384.40
No. Observations:                 381   AIC:                            -736.8
Df Residuals:                     365   BIC:                            -673.7
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.0603      0.059     

### Checking VIF
