In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
sns.set()

#Load the data
data = pd.read_csv("real_estate_price_size_year.csv")
data.head()

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009


In [2]:
#5 number summary
data.describe()

Unnamed: 0,price,size,year
count,100.0,100.0,100.0
mean,292289.47016,853.0242,2012.6
std,77051.727525,297.941951,4.729021
min,154282.128,479.75,2006.0
25%,234280.148,643.33,2009.0
50%,280590.716,696.405,2015.0
75%,335723.696,1029.3225,2018.0
max,500681.128,1842.51,2018.0


In [3]:
#Creating the multi-linear regression model
x = data[["size", "year"]]
y = data["price"]
reg = LinearRegression()
reg.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [4]:
#Coefficients
reg.coef_

array([ 227.70085401, 2916.78532684])

In [5]:
#Intercept
reg.intercept_

-5772267.017463277

In [6]:
#Finding the R-Squared
reg.score(x,y)

0.7764803683276794

In [7]:
x.shape

(100, 2)

In [8]:
#Calculating Adjusted R-Squared
r2 = reg.score(x,y)
n = x.shape[0] #Number of observations from x.shape
p = x.shape[1] #Number of predictors from x.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1)/(n - p -1)
adjusted_r2

0.7718717161282501

In [9]:
#Feature selection through p-values (F-regression)
from sklearn.feature_selection import f_regression
f_regression(x, y)

(array([285.92105192,   0.85525799]), array([8.12763222e-31, 3.57340758e-01]))

In [10]:
#p-values
p_values = f_regression(x, y)[1]
p_values

array([8.12763222e-31, 3.57340758e-01])

In [11]:
#Rounding p-values by 3 decimal points
p_values.round(3)

array([0.   , 0.357])

In [12]:
#Creating a summary table
reg_summary = pd.DataFrame(data = x.columns.values, columns = ["Features"])
reg_summary

Unnamed: 0,Features
0,size
1,year


In [13]:
#Coefficents
reg_summary["Coefficients"] = reg.coef_
reg_summary["p-values"] = p_values.round(3)
reg_summary

Unnamed: 0,Features,Coefficients,p-values
0,size,227.700854,0.0
1,year,2916.785327,0.357


In [14]:
#Standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [15]:
#Transforming x 
x_scaled = scaler.transform(x)
x_scaled

array([[-0.70816415,  0.51006137],
       [-0.66387316, -0.76509206],
       [-1.23371919,  1.14763808],
       [ 2.19844528,  0.51006137],
       [ 1.42498884, -0.76509206],
       [-0.937209  , -1.40266877],
       [-0.95171405,  0.51006137],
       [-0.78328682, -1.40266877],
       [-0.57603328,  1.14763808],
       [-0.53467702, -0.76509206],
       [ 0.69939906, -0.76509206],
       [ 3.33780001, -0.76509206],
       [-0.53467702,  0.51006137],
       [ 0.52699137,  1.14763808],
       [ 1.51100715, -1.40266877],
       [ 1.77668568, -1.40266877],
       [-0.54810263,  1.14763808],
       [-0.77276222, -1.40266877],
       [-0.58004747, -1.40266877],
       [ 0.58943055,  1.14763808],
       [-0.78365788,  0.51006137],
       [-1.02322731,  0.51006137],
       [ 1.19557293,  0.51006137],
       [-1.12884431,  0.51006137],
       [-1.10378093, -0.76509206],
       [ 0.84424715,  1.14763808],
       [-0.95171405,  1.14763808],
       [ 1.62279723,  0.51006137],
       [-0.58004747,

In [16]:
#Regression with scaled features
reg = LinearRegression()
reg.fit(x_scaled,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
#Coefficents
reg.coef_

array([67501.57614152, 13724.39708231])

In [18]:
#Intercept
reg.intercept_

292289.4701599997

In [19]:
#Creating summary table for Regression with scaled features
reg_summary = pd.DataFrame([["Bias"],["size"],["year"]], columns = ["Features"])
reg_summary["Weights"] = reg.intercept_, reg.coef_[0], reg.coef_[1]
reg_summary

Unnamed: 0,Features,Weights
0,Bias,292289.47016
1,size,67501.576142
2,year,13724.397082


In [24]:
#Making predictions with the standardized coefficients (i.e. weights)
new_data = pd.DataFrame(data = [[650,2009], [700,2011]], columns = ["size", "year"])
new_data

Unnamed: 0,size,year
0,650,2009
1,700,2011


In [25]:
reg.predict(new_data)

array([71740627.70050645, 75143155.30174717])

In [26]:
#Transforming standardized data
new_data_scaled = scaler.transform(new_data)
new_data_scaled

array([[-0.68485488, -0.76509206],
       [-0.51619152, -0.34004091]])

In [27]:
reg.predict(new_data_scaled)

array([235560.2592592 , 252778.87261325])