In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression

## Load Data

In [None]:
# Read the dataset from a CSV file given by a relative path (resolved against
# the working directory), then preview the first rows as a sanity check.
df = pd.read_csv('real_estate_price_size_year.csv')
df.head()

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for each column.
df.describe()

Unnamed: 0,price,size,year
count,100.0,100.0,100.0
mean,292289.47016,853.0242,2012.6
std,77051.727525,297.941951,4.729021
min,154282.128,479.75,2006.0
25%,234280.148,643.33,2009.0
50%,280590.716,696.405,2015.0
75%,335723.696,1029.3225,2018.0
max,500681.128,1842.51,2018.0


## INDEPENDENT AND DEPENDENT VARIABLES

In [None]:
# Feature matrix (the two predictor columns) and target vector for the regression.
x = df.loc[:, ['size', 'year']]
y = df.loc[:, 'price']

## Regression

#### EXPLORE DATA

In [None]:
# Create the linear regression estimator and fit it on the two predictors
# in x against the target y. `reg` is reused by the cells below.
reg = LinearRegression()
reg.fit(x,y)

In [None]:
# Fitted slope coefficients, one per column of x in column order.
reg.coef_

array([ 227.70085401, 2916.78532684])

In [None]:
# Fitted intercept (constant term) of the model.
reg.intercept_

-5772267.017463279

#### CALCULATING R-SQUARED

In [12]:
# R-squared of the fitted model, evaluated on the same x and y it was fitted on.
reg.score(x,y)

0.7764803683276795

##### FORMULA FOR ADJUSTED R^2

$R^2_{adj.} = 1-(1-R^2)*\frac{n-1}{n-p-1}$

In [13]:
# (number of observations, number of predictors) -- the n and p used in the formula.
x.shape

(100, 2)

In [15]:
# Plain R-squared of the fitted model; the adjustment below starts from it.
r2 = reg.score(x, y)

# Read both sizes straight from the feature frame instead of hard-coding them:
#   n -> number of observations (rows, 100 here)
#   p -> number of predictors   (columns, 2 here)
n, p = x.shape

# Adjusted R-squared: penalise R-squared for the number of predictors used.
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
adjusted_r2

0.7718717161282502

In [16]:
# There are different ways to solve this problem
# To make it as easy and interpretable as possible, we have preserved the original code
def adj_r2(x, y, model=None):
    """Return the adjusted R-squared of a fitted regression on ``(x, y)``.

    Implements ``1 - (1 - R^2) * (n - 1) / (n - p - 1)`` where ``n`` is the
    number of observations (rows of ``x``) and ``p`` is the number of
    predictors (columns of ``x``).

    Parameters
    ----------
    x : 2-D object exposing ``.shape`` (e.g. a DataFrame or ndarray)
        Feature matrix passed to ``model.score``.
    y : array-like
        Target values passed to ``model.score``.
    model : object with a ``score(x, y)`` method, optional
        Fitted estimator whose ``score`` supplies R-squared. When omitted,
        the notebook-level ``reg`` is used, which keeps the original
        two-argument call ``adj_r2(x, y)`` working unchanged.

    Returns
    -------
    float
        The adjusted R-squared.

    Raises
    ------
    ValueError
        If ``n - p - 1 <= 0``; the adjustment is undefined without at least
        one residual degree of freedom.
    """
    if model is None:
        # Fall back to the regression fitted earlier in the notebook.
        model = reg

    n = x.shape[0]
    p = x.shape[1]
    dof = n - p - 1
    if dof <= 0:
        # Guard the denominator instead of dividing by zero or returning a
        # meaningless value for a negative denominator.
        raise ValueError(
            "adjusted R-squared is undefined: need n > p + 1 "
            "(got n=%d, p=%d)" % (n, p)
        )

    r2 = model.score(x, y)
    return 1 - (1 - r2) * (n - 1) / dof

In [17]:
# Same x and y as the inline calculation above, so the value should match adjusted_r2.
adj_r2(x,y)

0.7718717161282502

#### FEATURE SELECTION

In [18]:
from sklearn.feature_selection import f_regression

In [20]:
# Returns a pair of arrays, (F-statistics, p-values), with one entry per column of x.
f_regression(x,y)

(array([285.92105192,   0.85525799]), array([8.12763222e-31, 3.57340758e-01]))

In [23]:
# Keep only the second element of the result: the per-feature p-values.
p_value = f_regression(x,y)[1]
p_value

array([8.12763222e-31, 3.57340758e-01])

In [24]:
# Round to 3 decimals for readability; very small p-values display as 0.
p_value.round(3)

array([0.   , 0.357])

## Creating a Summary Table

In [27]:
# Start the summary table with one row per predictor. Names are taken from
# x's columns rather than typed by hand, so the table follows the feature set.
reg_summary = pd.DataFrame({'Features': x.columns.values})
reg_summary

Unnamed: 0,Features
0,size
1,year


In [28]:
# Add the fitted coefficients and the rounded p-values, row-aligned with 'Features'.
# NOTE(review): p_value comes from f_regression, which appears to test each feature
# on its own, while the coefficients come from the joint two-predictor fit --
# confirm that pairing them in one table is intended.
reg_summary['Coefficients'] = reg.coef_
reg_summary['p-values'] = p_value.round(3)

In [29]:
# Show the completed summary table (features, coefficients, p-values).
reg_summary

Unnamed: 0,Features,Coefficients,p-values
0,size,227.700854,0.0
1,year,2916.785327,0.357
