In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [3]:
dataset = pd.read_csv('house_price_data.csv')

In [4]:
dataset

Unnamed: 0,Square_Feet,Bedrooms,Location,Price
0,1200,3,Urban,250000
1,1500,4,Suburban,300000
2,1800,3,Urban,320000
3,2000,4,Urban,400000
4,2500,5,Suburban,500000
5,3000,4,Urban,550000
6,3500,6,Suburban,600000


In [5]:
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [6]:
print(x)   #separate independent and dependent variables
print(y)

[[1200 3 'Urban']
 [1500 4 'Suburban']
 [1800 3 'Urban']
 [2000 4 'Urban']
 [2500 5 'Suburban']
 [3000 4 'Urban']
 [3500 6 'Suburban']]
[250000 300000 320000 400000 500000 550000 600000]


In [7]:
# I have to encode the data further, before that I need to get the categorical data
# Location is the categorical data
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [2])], remainder='passthrough')
x = np.array(ct.fit_transform(x))
# LabelEncoder is needed to encode dependent variable.In this project, we don't need to encode y.

In [8]:
print(x)
print(y)

[[0.0 1.0 1200 3]
 [1.0 0.0 1500 4]
 [0.0 1.0 1800 3]
 [0.0 1.0 2000 4]
 [1.0 0.0 2500 5]
 [0.0 1.0 3000 4]
 [1.0 0.0 3500 6]]
[250000 300000 320000 400000 500000 550000 600000]


In [9]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state=42)

In [10]:
print(y_train)
print(y_test)

[550000 320000 500000 400000 600000]
[250000 300000]


In [11]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:, 2:] = sc.fit_transform(x_train[:, 2:])
x_test[:, 2:] = sc.transform(x_test[:, 2:])

In [12]:
print(x_train)

[[0.0 1.0 0.7006227308283302 -0.3922322702763685]
 [0.0 1.0 -1.2101665350671158 -1.3728129459672889]
 [1.0 0.0 -0.0955394632947723 0.5883484054145518]
 [0.0 1.0 -0.8917016574178748 -0.3922322702763685]
 [1.0 0.0 1.4967849249514327 1.5689290811054721]]


In [13]:
print(x_test)

[[0.0 1.0 -2.1655611680148388 -1.3728129459672889]
 [1.0 0.0 -1.6878638515409772 -0.3922322702763685]]


In [15]:
# Training the model
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

In [22]:
y_pred = regressor.predict(x_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)),1))

[[266956.52 250000.  ]
 [318478.26 300000.  ]]


In [25]:
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_pred, y_test)
r2 = r2_score(y_pred, y_test)
print(f"r2 score: {r2}")
print(f"mse score: {mse}")

r2 score: 0.5261087076501365
mse score: 314484877.1266483


In [26]:
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

# Calculate variance of house prices
house_prices = np.array([250000, 300000, 320000, 400000, 500000, 550000, 600000])
variance = np.var(house_prices)
print(f"Variance of House Prices: {variance}")

Root Mean Squared Error: 17733.721468621534
Variance of House Prices: 15620408163.265306
