In [214]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [215]:
dataset = pd.read_csv('Housing.csv')

for i, ele in enumerate(dataset):
    print(f'{i}: {ele}')
X = dataset.drop(columns=['prefarea','guestroom','hotwaterheating','basement']).iloc[:, 1:]
y = dataset.iloc[:, 0].values

y = y.reshape(len(y),1)

0: price
1: area
2: bedrooms
3: bathrooms
4: stories
5: mainroad
6: guestroom
7: basement
8: hotwaterheating
9: airconditioning
10: parking
11: prefarea
12: furnishingstatus


In [216]:
y

array([[13300000],
       [12250000],
       [12250000],
       [12215000],
       [11410000],
       [10850000],
       [10150000],
       [10150000],
       [ 9870000],
       [ 9800000],
       [ 9800000],
       [ 9681000],
       [ 9310000],
       [ 9240000],
       [ 9240000],
       [ 9100000],
       [ 9100000],
       [ 8960000],
       [ 8890000],
       [ 8855000],
       [ 8750000],
       [ 8680000],
       [ 8645000],
       [ 8645000],
       [ 8575000],
       [ 8540000],
       [ 8463000],
       [ 8400000],
       [ 8400000],
       [ 8400000],
       [ 8400000],
       [ 8400000],
       [ 8295000],
       [ 8190000],
       [ 8120000],
       [ 8080940],
       [ 8043000],
       [ 7980000],
       [ 7962500],
       [ 7910000],
       [ 7875000],
       [ 7840000],
       [ 7700000],
       [ 7700000],
       [ 7560000],
       [ 7560000],
       [ 7525000],
       [ 7490000],
       [ 7455000],
       [ 7420000],
       [ 7420000],
       [ 7420000],
       [ 735

In [217]:
X

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,airconditioning,parking,furnishingstatus
0,7420,4,2,3,yes,yes,2,furnished
1,8960,4,4,4,yes,yes,3,furnished
2,9960,3,2,2,yes,no,2,semi-furnished
3,7500,4,2,2,yes,yes,3,furnished
4,7420,4,1,2,yes,yes,2,furnished
...,...,...,...,...,...,...,...,...
540,3000,2,1,1,yes,no,2,unfurnished
541,2400,3,1,1,no,no,0,semi-furnished
542,3620,2,1,1,yes,no,0,unfurnished
543,2910,3,1,1,no,no,0,furnished


In [218]:
# import preprocessing library
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder 
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), ['mainroad','airconditioning','furnishingstatus'])], remainder='passthrough')

# apply encoding
X = np.array(ct.fit_transform(X))

In [219]:
print(X)
# Encoded hotwaterheating and airconditioning column

[[0. 1. 0. ... 2. 3. 2.]
 [0. 1. 0. ... 4. 4. 3.]
 [0. 1. 1. ... 2. 2. 2.]
 ...
 [0. 1. 1. ... 1. 1. 0.]
 [1. 0. 1. ... 1. 1. 0.]
 [0. 1. 1. ... 1. 2. 0.]]


In [220]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [221]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X_train = sc_X.fit_transform(X_train)
y_train = sc_y.fit_transform(y_train)

In [222]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [223]:
y_pred = sc_y.inverse_transform(regressor.predict(sc_X.transform(X_test)).reshape(-1,1))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 7076701.7   7070000.  ]
 [ 6436574.38  4550000.  ]
 [ 7261627.37  5600000.  ]
 [ 4512636.16  6125000.  ]
 [ 4694005.57  2940000.  ]
 [ 3342625.67  3780000.  ]
 [ 2410884.8   2450000.  ]
 [ 7204727.17  7490000.  ]
 [ 2659823.2   2940000.  ]
 [ 5022959.89  6440000.  ]
 [ 3427975.98  2800000.  ]
 [ 3876065.11  3325000.  ]
 [ 4857593.66  5810000.  ]
 [ 5028294.28  5950000.  ]
 [ 5732434.33  6895000.  ]
 [ 8072455.31 13300000.  ]
 [ 4820252.9   3640000.  ]
 [ 3961415.42  2870000.  ]
 [ 5981372.74  5460000.  ]
 [ 2830523.82  1890000.  ]
 [ 3968527.94  3780000.  ]
 [ 5803559.59  5600000.  ]
 [ 5049631.86  2660000.  ]
 [ 3214600.21  3640000.  ]
 [ 5284345.21  2590000.  ]
 [ 5938697.58  5250000.  ]
 [ 2944324.23  2450000.  ]
 [ 7574578.51  6930000.  ]
 [ 6877550.98  9800000.  ]
 [ 4078772.09  3430000.  ]
 [ 7432327.99  7840000.  ]
 [ 7005576.45  5950000.  ]
 [ 4092997.14  3430000.  ]
 [ 4253028.97  2940000.  ]
 [ 6336999.02  6090000.  ]
 [ 3556001.45  4620000.  ]
 [ 5718209.28  4543000.  ]
 

In [224]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.6224930188023717