# Linear regression - sklearn

Multiple linear regression using sklearn to predict housing price based on size and year of the house.

## Import libraries

In [122]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

sns.set()

## Load data

In [123]:
# Read the housing dataset from a CSV file expected in the working directory
data = pd.read_csv('real_estate_price_size_year.csv')
# Summary statistics as a quick sanity check of counts and value ranges
data.describe()

Unnamed: 0,price,size,year
count,100.0,100.0,100.0
mean,292289.47016,853.0242,2012.6
std,77051.727525,297.941951,4.729021
min,154282.128,479.75,2006.0
25%,234280.148,643.33,2009.0
50%,280590.716,696.405,2015.0
75%,335723.696,1029.3225,2018.0
max,500681.128,1842.51,2018.0


## Regression

In [124]:
# Inputs (features): size and year of the house
x = data.loc[:, ['size', 'year']]
# Target (dependent variable): price of the house
y = data.loc[:, 'price']

In [125]:
# Scale inputs
# StandardScaler is imported in the imports cell at the top of the notebook.
# Standardizing puts both features on a comparable scale, so the fitted
# coefficients can be compared with each other directly.
scaler = StandardScaler()
# Learn the per-feature scaling parameters from x and apply them in one step;
# the fitted scaler is kept so new observations can be transformed the same way.
x_scaled = scaler.fit_transform(x)

In [126]:
# Fit a linear regression of price on the standardized inputs
reg = LinearRegression()
# The fitted estimator is the last expression, so it is shown as the cell output
reg.fit(X=x_scaled, y=y)

LinearRegression()

### Explore parameters of regression

In [127]:
# Fitted coefficients, one per input column in the order of x (size, year).
# The model was fit on the scaled inputs, so these are in scaled-feature units.
reg.coef_

array([67501.57614152, 13724.39708231])

In [128]:
# Intercept (bias) of the fitted model; here it matches the mean price shown
# by data.describe(), as expected when the inputs have been centered.
reg.intercept_

292289.4701599997

In [129]:
# Create a table with these values
# p_values was never defined anywhere in the notebook, so this cell failed with
# a NameError on a fresh kernel. Compute it here with f_regression (imported in
# the imports cell), which returns a pair (F-statistics, p-values).
# NOTE: f_regression tests each feature against y on its own (univariate),
# so these are not the p-values of the joint multiple-regression model.
_, p_values = f_regression(x_scaled, y)

reg_summary = pd.DataFrame({
    'Features': x.columns.values,
    'Coefficients': reg.coef_,
    'p-values': p_values.round(3),
})
reg_summary

Unnamed: 0,Features,Coefficients,p-values
0,size,67501.576142,0.0
1,year,13724.397082,0.357


### Calculate R-squared and Adjusted R-squared

In [130]:
# R-squared of the regression analysis, evaluated on the same data the model
# was fit on (there is no train/test split in this notebook)
reg.score(x_scaled,y)

0.7764803683276793

##### Formula for adjusted $R^2$
$R^2_{adj.} = 1 - (1-R^2)\cdot\frac{n-1}{n-p-1}$, where $n$ is the number of observations and $p$ is the number of predictors.

In [131]:
# Adjusted R-squared: penalizes R-squared for the number of predictors used
n, p = x.shape  # n = number of observations, p = number of predictors
r2 = reg.score(x_scaled, y)

adjusted_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)
adjusted_r2

0.77187171612825

## Predictions

In [132]:
# Example: predict the price of an apartment with 750 square feet from 2009
# Build the new observation as a DataFrame with the same column names the
# scaler was fitted on; a bare list makes sklearn warn about missing feature names.
data_new = pd.DataFrame([[750, 2009]], columns=x.columns)
# Apply the scaling learned from the training inputs before predicting
data_new_scaled = scaler.transform(data_new)
reg.predict(data_new_scaled)



array([258330.34465995])