# Housing dataset

In [111]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression , ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [112]:
housing = pd.read_csv('Housing.csv')

In [113]:
housing.head()

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,42000.0,5850,3,1,2,yes,no,yes,no,no,1,no
1,38500.0,4000,2,1,1,yes,no,no,no,no,0,no
2,49500.0,3060,3,1,1,yes,no,no,no,no,0,no
3,60500.0,6650,3,1,2,yes,yes,no,no,no,0,no
4,61000.0,6360,2,1,1,yes,no,no,no,no,0,no


In [114]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 546 entries, 0 to 545
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   price     546 non-null    float64
 1   lotsize   546 non-null    int64  
 2   bedrooms  546 non-null    int64  
 3   bathrms   546 non-null    int64  
 4   stories   546 non-null    int64  
 5   driveway  546 non-null    object 
 6   recroom   546 non-null    object 
 7   fullbase  546 non-null    object 
 8   gashw     546 non-null    object 
 9   airco     546 non-null    object 
 10  garagepl  546 non-null    int64  
 11  prefarea  546 non-null    object 
dtypes: float64(1), int64(5), object(6)
memory usage: 51.3+ KB


In [115]:
X = housing[['driveway']]  
y = housing['price']

### One Hot Encoding

.get_dummies() from *pandas*

#### one - way of doing one hot encoding

In [116]:
X['driveway'].values

array(['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes',
       'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'no', 'yes', 'yes',
       'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no',
       'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes',
       'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'no', 'no',
       'no', 'yes', 'yes', 'no', 'yes', 'yes', 'no', 'yes', 'no', 'yes',
       'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes',
       'yes', 'no', 'no', 'yes', 'yes', 'no', 'yes', 'yes', 'no', 'no',
       'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'yes',
       'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes',
       'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes',
       'yes', 'no', 'yes', 'yes', 'yes', 'no', 'no', 'yes', 'yes', 'no',
       'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes',
       'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no',

In [117]:
dum_X = pd.get_dummies(X)
# dum_X = pd.get_dummies(X,drop_first=True) # Drop one column because we don't need two here
print(dum_X['driveway_no'].value_counts())
print(dum_X['driveway_yes'].value_counts())
# here it seems yo be boolen but we can tak eit as it is for the calculations 

driveway_no
False    469
True      77
Name: count, dtype: int64
driveway_yes
True     469
False     77
Name: count, dtype: int64


In [118]:
dum_X = pd.get_dummies(X,drop_first=True)
lr = LinearRegression()
lr.fit(dum_X,y)
print(lr.intercept_,lr.coef_)  # price = 48555.77922 + 22778.116 * driveway_yes
# If driveway = yes , price = 48555.77922 + 22778.1163016 = 71333.895522
# If driveway = no , price = 48555.77922 + 0 = 48555.77922


48555.77922077922 [22778.11630161]


In [119]:
housing.groupby('driveway')['price'].mean()

driveway
no     48555.779221
yes    71333.895522
Name: price, dtype: float64

In [120]:
dum_X = pd.get_dummies(X)
print(dum_X.head())
dum_X.drop('driveway_yes',axis=1,inplace=True)
print(dum_X.head())
lr = LinearRegression()
lr.fit(dum_X,y)
print(lr.intercept_,lr.coef_)
# If driveway = no , price = 48555.779220 - 22778.11608*1 = 71333.895522
# If driveway = yes , price = 48555.779220 - 22778.11608*0 = 48555.77922077

   driveway_no  driveway_yes
0        False          True
1        False          True
2        False          True
3        False          True
4        False          True
   driveway_no
0        False
1        False
2        False
3        False
4        False
71333.89552238806 [-22778.11630161]


### OneHotEncoder class from scikit

In [121]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output=False,drop='first').set_output(transform='pandas')

In [122]:
dum_X = ohe.fit_transform(X)
lr = LinearRegression()
lr.fit(dum_X,y)
print(lr.intercept_,lr.coef_)

48555.77922077922 [22778.11630161]


In [123]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 546 entries, 0 to 545
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   price     546 non-null    float64
 1   lotsize   546 non-null    int64  
 2   bedrooms  546 non-null    int64  
 3   bathrms   546 non-null    int64  
 4   stories   546 non-null    int64  
 5   driveway  546 non-null    object 
 6   recroom   546 non-null    object 
 7   fullbase  546 non-null    object 
 8   gashw     546 non-null    object 
 9   airco     546 non-null    object 
 10  garagepl  546 non-null    int64  
 11  prefarea  546 non-null    object 
dtypes: float64(1), int64(5), object(6)
memory usage: 51.3+ KB


Two Columns *driveway* and *airco*

In [124]:
X = housing[['driveway','airco']]
y = housing['price']

In [125]:
ohe = OneHotEncoder(sparse_output=False,drop='first').set_output(transform='pandas')
dum_X = ohe.fit_transform(X)
lr = LinearRegression()
lr.fit(dum_X,y)
print(lr.intercept_,lr.coef_)

43790.67032031879 [19302.6687155 24460.8923557]


In [126]:
dum_X.columns

Index(['driveway_yes', 'airco_yes'], dtype='object')

### With numeric column

In [127]:
X = housing[['driveway','airco','bedrooms']]
y = housing['price']

In [128]:
housing['bedrooms'].unique()

array([3, 2, 4, 1, 5, 6], dtype=int64)

'pd.get_dummies()' does the hot encoding of only 'object' type variables

In [129]:
ohe = OneHotEncoder(sparse_output=False,drop='first').set_output(transform='pandas')
dum_X = ohe.fit_transform(X)


In [130]:
dum_X.columns

Index(['driveway_yes', 'airco_yes', 'bedrooms_2', 'bedrooms_3', 'bedrooms_4',
       'bedrooms_5', 'bedrooms_6'],
      dtype='object')

In [131]:
X = housing[['driveway','airco','lotsize']]
y = housing['price']

In [132]:
ohe = OneHotEncoder(sparse_output=False,drop='first').set_output(transform='pandas')
dum_X = ohe.fit_transform(X)


In [133]:
dum_X

Unnamed: 0,driveway_yes,airco_yes,lotsize_1700,lotsize_1836,lotsize_1905,lotsize_1950,lotsize_2000,lotsize_2015,lotsize_2135,lotsize_2145,...,lotsize_11175,lotsize_11410,lotsize_11440,lotsize_11460,lotsize_12090,lotsize_12900,lotsize_12944,lotsize_13200,lotsize_15600,lotsize_16200
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
542,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
543,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
544,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Column Transformers

In [134]:
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

In [135]:
str_cols = list(housing.columns[housing.dtypes==object])
num_cols = list(housing.columns[housing.dtypes!=object])
ohe = OneHotEncoder(sparse_output=False,drop='first').set_output(transform='pandas')

In [136]:
ct = make_column_transformer(('passthrough',num_cols),(ohe,str_cols),
                             verbose_feature_names_out=False).set_output(transform='pandas')

In [137]:
ct.fit_transform(housing)

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,garagepl,driveway_yes,recroom_yes,fullbase_yes,gashw_yes,airco_yes,prefarea_yes
0,42000.0,5850,3,1,2,1,1.0,0.0,1.0,0.0,0.0,0.0
1,38500.0,4000,2,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
2,49500.0,3060,3,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
3,60500.0,6650,3,1,2,0,1.0,1.0,0.0,0.0,0.0,0.0
4,61000.0,6360,2,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
541,91500.0,4800,3,2,4,0,1.0,1.0,0.0,0.0,1.0,0.0
542,94000.0,6000,3,2,4,0,1.0,0.0,0.0,0.0,1.0,0.0
543,103000.0,6000,3,2,4,1,1.0,1.0,0.0,0.0,1.0,0.0
544,105000.0,6000,3,2,2,1,1.0,1.0,0.0,0.0,1.0,0.0


Using make_column_selector()

In [138]:
ohe = OneHotEncoder(sparse_output=False,drop='first').set_output(transform='pandas')
ct = make_column_transformer(('passthrough',make_column_selector(dtype_exclude=object) ),
                             (ohe,make_column_selector(dtype_include=object)),
                             verbose_feature_names_out=False).set_output(transform='pandas')
ct.fit_transform(housing)

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,garagepl,driveway_yes,recroom_yes,fullbase_yes,gashw_yes,airco_yes,prefarea_yes
0,42000.0,5850,3,1,2,1,1.0,0.0,1.0,0.0,0.0,0.0
1,38500.0,4000,2,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
2,49500.0,3060,3,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
3,60500.0,6650,3,1,2,0,1.0,1.0,0.0,0.0,0.0,0.0
4,61000.0,6360,2,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
541,91500.0,4800,3,2,4,0,1.0,1.0,0.0,0.0,1.0,0.0
542,94000.0,6000,3,2,4,0,1.0,0.0,0.0,0.0,1.0,0.0
543,103000.0,6000,3,2,4,1,1.0,1.0,0.0,0.0,1.0,0.0
544,105000.0,6000,3,2,2,1,1.0,1.0,0.0,0.0,1.0,0.0


Using hot encoding with train test split

In [139]:
ohe = OneHotEncoder(sparse_output=False,drop='first').set_output(transform='pandas')
ct = make_column_transformer(('passthrough',make_column_selector(dtype_exclude=object) ),
                            (ohe,make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out = False).set_output(transform='pandas')
X = housing.drop('price',axis=1)
y = housing['price']
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=24,test_size=0.3)

In [140]:
X_ohe_trn = ct.fit_transform(X_train)
X_ohe_tst = ct.transform(X_test)
lr.fit(X_ohe_trn,y_train)
y_pred = lr.predict(X_ohe_tst)
r2_score(y_test,y_pred)

0.6246856191453717

using hot encoding with pipeling 

In [141]:
poly = PolynomialFeatures(degree = 3, include_bias=False).set_output(transform='pandas')
lr = LinearRegression()
pipe = Pipeline([('CT',ct),('LR',lr)])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
r2_score(y_test,y_pred)

0.6246856191453717

**One Hot Encoding & Polynomial Transformation**

In [142]:
# here we will do One hot encoding first then Polynomial transformation and then Linear regression
poly = PolynomialFeatures(degree = 2, include_bias=False).set_output(transform='pandas')
lr = LinearRegression()
pipe = Pipeline([('CT',ct),('POLY',poly),('LR',lr)])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
r2_score(y_test,y_pred)

0.5558314095911471

#### Elastic Regresssion

In [143]:
ohe = OneHotEncoder(sparse_output=False,drop='first').set_output(transform='pandas')
ct = make_column_transformer(('passthrough',make_column_selector(dtype_exclude=object) ),
                            (ohe,make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out = False).set_output(transform='pandas')
X = housing.drop('price',axis=1)
y = housing['price']
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=24,test_size=0.3)

In [144]:
alphas = np.linspace(0.0001,10,20)
l1 = np.linspace(0.001,1,10)
scores = []
for a in alphas:
    for i in l1:
        elastic = ElasticNet(alpha=a,l1_ratio=i)
        pipe = Pipeline([('CT',ct),('POLY',poly),('ER',elastic)])
        elastic.fit(X_train,y_train)
        y_pred = elastic.predict(X_test)
        scores.append([a,i,r2_score(y_test,y_pred)])
        
# df_scores = pd.DataFrame(scores,columns=['alpha','l1_ratio','score'])
# df_scores.sort_values('score',ascending=False,inplace=True)
# best_a = df_scores['alpha'].iloc[0]
# best_l1 = df_scores['score'].iloc[0]
# best_sc = df_scores['l1_ratio'].iloc[0]
# print("Best Alpha ",best_a)
# print("Best l1 ratio ",best_l1)
# print("Best Score ",best_sc)

ValueError: could not convert string to float: 'no'