In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

#Import data from csv

In [2]:
housing = pd.read_csv('Housing.csv')
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [3]:
# converting all Yes/no to 0s and 1s
varlist =  ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']

# Defining the map function
def binary_map(x):
    return x.map({'yes': 1, "no": 0})

# Applying the function to the housing list
housing[varlist] = housing[varlist].apply(binary_map)

In [4]:
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished


In [6]:
furnishing = pd.get_dummies(housing['furnishingstatus'], drop_first=True)
furnishing

Unnamed: 0,semi-furnished,unfurnished
0,0,0
1,0,0
2,1,0
3,0,0
4,0,0
...,...,...
540,0,1
541,1,0
542,0,1
543,0,0


In [7]:
# concating the dummies with original
housing = pd.concat([housing, furnishing], axis=1)
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished,0,0


In [11]:
#dropping furnishing status
housing.drop('furnishingstatus', inplace=True, axis=1)
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0


In [16]:
# split the model for traning and test
from sklearn.model_selection import train_test_split

In [24]:
train, test = train_test_split(housing, train_size=0.7, test_size=0.3, random_state=100)

In [25]:
# sacling train set with minmax scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# train[:] is used instead of 'train' because to avoid getting converted to numpy array
train[:] = scaler.fit_transform(train[:])

In [26]:
train.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
359,0.169697,0.155227,0.4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,1.0
19,0.615152,0.403379,0.4,0.5,0.333333,1.0,0.0,0.0,0.0,1.0,0.333333,1.0,1.0,0.0
159,0.321212,0.115628,0.4,0.5,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
35,0.548133,0.454417,0.4,0.5,1.0,1.0,0.0,0.0,0.0,1.0,0.666667,0.0,0.0,0.0
28,0.575758,0.538015,0.8,0.5,0.333333,1.0,0.0,1.0,1.0,0.0,0.666667,0.0,0.0,1.0


In [28]:
y_train = train.pop('price')
x_train = train

In [29]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [33]:
model = LinearRegression()
#help(model.fit)
model = model.fit(x_train, y_train)

#RFE
help(RFE.fit)
rfe = RFE(model, 10)
rfe = rfe.fit(x_train, y_train)

Help on function fit in module sklearn.feature_selection._rfe:

fit(self, X, y)
    Fit the RFE model and then the underlying estimator on the selected
       features.
    
    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples.
    
    y : array-like of shape (n_samples,)
        The target values.



In [37]:
list(zip(x_train.columns,rfe.support_,rfe.ranking_))

[('area', True, 1),
 ('bedrooms', True, 1),
 ('bathrooms', True, 1),
 ('stories', True, 1),
 ('mainroad', True, 1),
 ('guestroom', True, 1),
 ('basement', False, 3),
 ('hotwaterheating', True, 1),
 ('airconditioning', True, 1),
 ('parking', True, 1),
 ('prefarea', True, 1),
 ('semi-furnished', False, 4),
 ('unfurnished', False, 2)]

In [38]:
col = x_train.columns[rfe.support_]
col

Index(['area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom',
       'hotwaterheating', 'airconditioning', 'parking', 'prefarea'],
      dtype='object')

In [40]:
x_train_rfe = x_train[col]

In [41]:
import statsmodels.api as sm 

In [44]:
help(sm.add_constant)
x_train_rfe = sm.add_constant(x_train_rfe)
x_train_rfe

Help on function add_constant in module statsmodels.tools.tools:

add_constant(data, prepend=True, has_constant='skip')
    Add a column of ones to an array.
    
    Parameters
    ----------
    data : array_like
        A column-ordered design matrix.
    prepend : bool
        If true, the constant is in the first column.  Else the constant is
        appended (last column).
    has_constant : str {'raise', 'add', 'skip'}
        Behavior if ``data`` already has a constant. The default will return
        data without adding another constant. If 'raise', will raise an
        error if any column has a constant value. Using 'add' will add a
        column of 1s if a constant column is present.
    
    Returns
    -------
    array_like
        The original values with a constant (column of ones) as the first or
        last column. Returned value type depends on input type.
    
    Notes
    -----
    When the input is recarray or a pandas Series or DataFrame, the added
    column

Unnamed: 0,const,area,bedrooms,bathrooms,stories,mainroad,guestroom,hotwaterheating,airconditioning,parking,prefarea
359,1.0,0.155227,0.4,0.0,0.000000,1.0,0.0,0.0,0.0,0.333333,0.0
19,1.0,0.403379,0.4,0.5,0.333333,1.0,0.0,0.0,1.0,0.333333,1.0
159,1.0,0.115628,0.4,0.5,0.000000,1.0,1.0,0.0,1.0,0.000000,0.0
35,1.0,0.454417,0.4,0.5,1.000000,1.0,0.0,0.0,1.0,0.666667,0.0
28,1.0,0.538015,0.8,0.5,0.333333,1.0,0.0,1.0,0.0,0.666667,0.0
...,...,...,...,...,...,...,...,...,...,...,...
526,1.0,0.118268,0.2,0.0,0.000000,1.0,0.0,0.0,0.0,0.000000,0.0
53,1.0,0.291623,0.4,0.5,1.000000,1.0,0.0,0.0,1.0,0.666667,0.0
350,1.0,0.139388,0.2,0.0,0.333333,1.0,0.0,1.0,0.0,0.333333,0.0
79,1.0,0.366420,0.4,0.5,0.666667,1.0,1.0,0.0,1.0,0.000000,0.0


In [50]:
statmodel = sm.OLS(y_train,x_train_rfe).fit()

In [51]:
statmodel.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.669
Model:,OLS,Adj. R-squared:,0.66
Method:,Least Squares,F-statistic:,74.89
Date:,"Fri, 22 Oct 2021",Prob (F-statistic):,1.2799999999999999e-82
Time:,16:32:59,Log-Likelihood:,374.65
No. Observations:,381,AIC:,-727.3
Df Residuals:,370,BIC:,-683.9
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0027,0.018,0.151,0.880,-0.033,0.038
area,0.2363,0.030,7.787,0.000,0.177,0.296
bedrooms,0.0661,0.037,1.794,0.074,-0.006,0.139
bathrooms,0.1982,0.022,8.927,0.000,0.155,0.242
stories,0.0977,0.019,5.251,0.000,0.061,0.134
mainroad,0.0556,0.014,3.848,0.000,0.027,0.084
guestroom,0.0381,0.013,2.934,0.004,0.013,0.064
hotwaterheating,0.0897,0.022,4.104,0.000,0.047,0.133
airconditioning,0.0711,0.011,6.235,0.000,0.049,0.093

0,1,2,3
Omnibus:,86.105,Durbin-Watson:,2.098
Prob(Omnibus):,0.0,Jarque-Bera (JB):,286.069
Skew:,0.992,Prob(JB):,7.6e-63
Kurtosis:,6.753,Cond. No.,13.2


In [48]:
#dropping bedrooms
#Bedrooms is insignificant in presence of other variables; can be dropped


ValueError: shapes (381,11) and (381,11) not aligned: 11 (dim 1) != 381 (dim 0)

In [52]:
x_train_rfe.pop('bedrooms')

359    0.4
19     0.4
159    0.4
35     0.4
28     0.8
      ... 
526    0.2
53     0.4
350    0.2
79     0.4
520    0.2
Name: bedrooms, Length: 381, dtype: float64

In [53]:
x_train_rfe

Unnamed: 0,const,area,bathrooms,stories,mainroad,guestroom,hotwaterheating,airconditioning,parking,prefarea
359,1.0,0.155227,0.0,0.000000,1.0,0.0,0.0,0.0,0.333333,0.0
19,1.0,0.403379,0.5,0.333333,1.0,0.0,0.0,1.0,0.333333,1.0
159,1.0,0.115628,0.5,0.000000,1.0,1.0,0.0,1.0,0.000000,0.0
35,1.0,0.454417,0.5,1.000000,1.0,0.0,0.0,1.0,0.666667,0.0
28,1.0,0.538015,0.5,0.333333,1.0,0.0,1.0,0.0,0.666667,0.0
...,...,...,...,...,...,...,...,...,...,...
526,1.0,0.118268,0.0,0.000000,1.0,0.0,0.0,0.0,0.000000,0.0
53,1.0,0.291623,0.5,1.000000,1.0,0.0,0.0,1.0,0.666667,0.0
350,1.0,0.139388,0.0,0.333333,1.0,0.0,1.0,0.0,0.333333,0.0
79,1.0,0.366420,0.5,0.666667,1.0,1.0,0.0,1.0,0.000000,0.0


In [54]:
statmodel = sm.OLS(y_train,x_train_rfe).fit()
statmodel.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.666
Model:,OLS,Adj. R-squared:,0.658
Method:,Least Squares,F-statistic:,82.37
Date:,"Fri, 22 Oct 2021",Prob (F-statistic):,6.67e-83
Time:,16:35:32,Log-Likelihood:,373.0
No. Observations:,381,AIC:,-726.0
Df Residuals:,371,BIC:,-686.6
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0242,0.013,1.794,0.074,-0.002,0.051
area,0.2367,0.030,7.779,0.000,0.177,0.297
bathrooms,0.2070,0.022,9.537,0.000,0.164,0.250
stories,0.1096,0.017,6.280,0.000,0.075,0.144
mainroad,0.0536,0.014,3.710,0.000,0.025,0.082
guestroom,0.0390,0.013,2.991,0.003,0.013,0.065
hotwaterheating,0.0921,0.022,4.213,0.000,0.049,0.135
airconditioning,0.0710,0.011,6.212,0.000,0.049,0.094
parking,0.0669,0.018,3.665,0.000,0.031,0.103

0,1,2,3
Omnibus:,91.542,Durbin-Watson:,2.107
Prob(Omnibus):,0.0,Jarque-Bera (JB):,315.402
Skew:,1.044,Prob(JB):,3.25e-69
Kurtosis:,6.938,Cond. No.,10.0


In [55]:
#calculating VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
x_train_rfe.pop('const')

359    1.0
19     1.0
159    1.0
35     1.0
28     1.0
      ... 
526    1.0
53     1.0
350    1.0
79     1.0
520    1.0
Name: const, Length: 381, dtype: float64

In [61]:
x_train_rfe
vif = pd.DataFrame()

vif['features'] = x_train_rfe.columns

#help(variance_inflation_factor)
for i in range(x_train_rfe.shape[1]):
    print(i)
    
x_train_rfe.values

0
1
2
3
4
5
6
7
8


array([[0.15522703, 0.        , 0.        , ..., 0.        , 0.33333333,
        0.        ],
       [0.40337909, 0.5       , 0.33333333, ..., 1.        , 0.33333333,
        1.        ],
       [0.1156283 , 0.5       , 0.        , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.13938754, 0.        , 0.33333333, ..., 0.        , 0.33333333,
        0.        ],
       [0.36642027, 0.5       , 0.66666667, ..., 1.        , 0.        ,
        0.        ],
       [0.51601549, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])