In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [3]:
# Location of the raw housing CSV.
# NOTE(review): this is an absolute path on one machine; the notebook will not run
# elsewhere until the location is made configurable.
housing_csv_path = "/home/indra/Documents/Python program/Scaler Notes/Module-6/Modue64_RFE/Housing.csv"
housing = pd.read_csv(housing_csv_path)

In [4]:
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [5]:
# Encode the yes/no columns as 1/0 so they can be used as numeric features.
# Values other than 'yes'/'no' are not in the mapping and therefore become NaN.
yes_no_to_int = {'yes':1, 'no':0}
for binary_col in ('mainroad', 'guestroom', 'basement',
                   'hotwaterheating', 'airconditioning', 'prefarea'):
    housing[binary_col] = housing[binary_col].map(yes_no_to_int)

In [6]:
# One-hot encode furnishingstatus; drop_first=True omits the first level so the
# remaining dummy columns are not perfectly collinear.
# dtype=int is requested explicitly: pandas >= 2.0 returns boolean dummies by
# default, and the min-max scaling applied in a later cell subtracts the column
# minimum, which raises TypeError on boolean data.
status=pd.get_dummies(housing['furnishingstatus'],drop_first=True,dtype=int)


In [7]:
status.head()

Unnamed: 0,semi-furnished,unfurnished
0,0,0
1,0,0
2,1,0
3,0,0
4,0,0


In [8]:
# Append the furnishing-status dummy columns to the main frame (column-wise,
# aligned on the row index).
housing = pd.concat(objs=[housing, status], axis="columns")

In [9]:
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished,0,0


In [10]:
# The categorical column is now represented by its dummy columns; remove it in place.
housing.drop(columns=['furnishingstatus'], inplace=True)

In [11]:
# Derived ratio features, both expressed per bedroom.
bedroom_count = housing['bedrooms']
housing['areaperbedroom'] = housing['area'] / bedroom_count
housing['bbratio'] = housing['bathrooms'] / bedroom_count


In [12]:
def normalize(x):
    """Min-max scale ``x`` so its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        Numeric values to rescale.

    Returns
    -------
    Same container type as ``x``
        ``(x - min) / (max - min)``. When every value is equal the range is
        zero, so zeros are returned instead of the NaN produced by 0/0.
    """
    # Use the NumPy reductions for both bounds. The builtin max()/min() iterate
    # element by element in Python and give order-dependent results when NaN is
    # present, which could disagree with the np.min used in the numerator
    # (for a pandas Series, np.min/np.max skip NaN).
    lowest = np.min(x)
    highest = np.max(x)
    value_range = highest - lowest
    if value_range == 0:
        # Constant input: the numerator is all zeros, so any non-zero divisor
        # yields zeros and avoids a 0/0 division.
        value_range = 1
    return (x - lowest) / value_range

# Rescale every column (the price target included), one column at a time.
# NOTE(review): the min/max are taken over all rows, before the train/test split
# made in a later cell, so test rows influence the scaling — confirm this is acceptable.
housing = housing.apply(func=normalize, axis=0)


In [13]:
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished,areaperbedroom,bbratio
0,1.0,0.396564,0.6,0.333333,0.666667,1.0,0.0,0.0,0.0,1.0,0.666667,1.0,0.0,0.0,0.237016,0.4
1,0.909091,0.502405,0.6,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.298923,1.0
2,0.909091,0.571134,0.4,0.333333,0.333333,1.0,0.0,1.0,0.0,0.0,0.666667,1.0,1.0,0.0,0.472584,0.6
3,0.906061,0.402062,0.6,0.333333,0.333333,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.240232,0.4
4,0.836364,0.396564,0.6,0.0,0.333333,1.0,1.0,1.0,0.0,1.0,0.666667,0.0,0.0,0.0,0.237016,0.1


In [14]:
housing.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'semi-furnished', 'unfurnished',
       'areaperbedroom', 'bbratio'],
      dtype='object')

In [15]:
# Predictor columns: everything listed here, i.e. all columns except 'price'.
feature_columns = [
    'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
    'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
    'parking', 'prefarea', 'semi-furnished', 'unfurnished',
    'areaperbedroom', 'bbratio',
]
x = housing[feature_columns]

# Target column.
y = housing['price']

In [16]:
# 70/30 train/test split; random_state fixes the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.700,
    test_size=0.3,
    random_state=100,
)

In [17]:
import statsmodels.api as sm
# Start the statsmodels design matrix from the training features.
# This binds a second name to the same DataFrame object; no copy is made here.
x_train_sm=x_train


In [18]:
# Add a constant (intercept) column to the statsmodels design matrix.
x_train_sm = sm.add_constant(data=x_train_sm)


In [19]:
x_train_sm.head()

Unnamed: 0,const,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished,areaperbedroom,bbratio
359,1.0,0.134021,0.4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,1.0,0.131693,0.2
19,1.0,0.327835,0.4,0.333333,0.333333,1.0,0.0,0.0,0.0,1.0,0.333333,1.0,1.0,0.0,0.282843,0.6
159,1.0,0.103093,0.4,0.333333,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.107574,0.6
35,1.0,0.367698,0.4,0.333333,1.0,1.0,0.0,0.0,0.0,1.0,0.666667,0.0,0.0,0.0,0.31393,0.6
28,1.0,0.43299,0.8,0.333333,0.333333,1.0,0.0,1.0,1.0,0.0,0.666667,0.0,0.0,1.0,0.194404,0.28


In [20]:
from sklearn.feature_selection import RFE


In [21]:
# Base estimator that is handed to RFE in a later cell.
lm=LinearRegression()
# Display the (rows, columns) shape of the training feature matrix.
x_train.shape

(381, 15)

In [22]:
# Recursive feature elimination wrapped around the linear model, keeping 9 features.
rfe = RFE(estimator=lm, n_features_to_select=9)

In [23]:
# Fit the selector on the training data, then print the boolean mask of kept
# features followed by each feature's ranking (1 marks a selected feature).
rfe = rfe.fit(X=x_train, y=y_train)
for fitted_attr in (rfe.support_, rfe.ranking_):
    print(fitted_attr)


[ True False  True  True  True False False  True  True False  True False
 False  True  True]
[1 3 1 1 1 4 6 1 1 2 1 7 5 1 1]


In [24]:
# Names of the training columns that RFE retained.
selected_mask = rfe.support_
col = x_train.columns[selected_mask]
col

Index(['area', 'bathrooms', 'stories', 'mainroad', 'hotwaterheating',
       'airconditioning', 'prefarea', 'areaperbedroom', 'bbratio'],
      dtype='object')