# Linear Regression
This notebook fits the linear regression model for stage one.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.formula.api import ols, glm
from statsmodels.api import families
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [None]:
# Load the curated Domain listings and turn the suburb statistics, which are
# read in as text, into numeric columns.
df = pd.read_csv("../data/curated/domain_final.csv")

# Columns that must be numeric before they can be averaged per postcode.
SUBURB_NUMERIC_COLS = ['suburb_population', 'suburb_avg_day', 'suburb_sold']

# regex=False forces a plain literal replacement on every pandas version
# (the default for `regex` has changed between pandas releases).
# Strip the thousands separators from the population figures.
df['suburb_population'] = df['suburb_population'].str.replace(",", "", regex=False)
# NOTE(review): presumably "-" marks a missing value. This replaces *every*
# hyphen with "0", so confirm no value contains an embedded "-".
df['suburb_sold'] = df['suburb_sold'].str.replace("-", "0", regex=False)
df['suburb_avg_day'] = df['suburb_avg_day'].str.replace("-", "0", regex=False)

# pd.to_numeric raises on any value that still is not parseable as a number.
df[SUBURB_NUMERIC_COLS] = df[SUBURB_NUMERIC_COLS].apply(pd.to_numeric)

In [None]:
# Collapse the listing-level rows to one row per postcode by taking the mean
# of every column listed below.
POSTCODE_MEAN_COLS = [
    'cost',
    'Median_rent_weekly',
    'school_distance',
    'dist_nearest_busstop',
    'dist_nearest_trainstation',
    'dist_nearest_primary_school',
    'dist_nearest_secondary_school',
    'Beds',
    'Bath',
    'Parking',
    'avg_income',
    'Tot_P_P',
    'Median_age_persons',
    'suburb_sold',
    'suburb_avg_day',
    'suburb_population',
    'suburb_owner',
    'suburb_renter',
    'suburb_family',
    'suburb_single',
]

# dict.fromkeys keeps the list order, so the output columns appear in the
# order given above.
postcode_groups = df.groupby('postcode')
df_postcode = postcode_groups.agg(dict.fromkeys(POSTCODE_MEAN_COLS, 'mean'))
df_postcode.head()

In [None]:
# Visual check for a linear relationship between a feature and the response
# variable: postcode-level cost against suburb population.
sns.lmplot(data=df_postcode, x="suburb_population", y="cost")

In [None]:
# Same linearity check: cost against distance to the nearest bus stop.
sns.lmplot(data=df_postcode, x="dist_nearest_busstop", y="cost")

In [None]:
# Same linearity check: cost against average income.
sns.lmplot(data=df_postcode, x="avg_income", y="cost")

In [None]:
# Correlation heatmap of the response (cost) and the candidate features,
# used to spot features that are strongly correlated with each other.
CORR_COLS = [
    'cost',
    'school_distance',
    'suburb_population',
    'suburb_renter',
    'suburb_single',
    'dist_nearest_secondary_school',
    'dist_nearest_busstop',
    'dist_nearest_trainstation',
    'dist_nearest_primary_school',
    'avg_income',
    'Tot_P_P',
    'Median_age_persons',
    'Beds',
    'Bath',
    'Parking',
]

fig, ax = plt.subplots(figsize=(17, 5))
corr_matrix = df_postcode[CORR_COLS].corr()
# annot=True prints each coefficient inside its cell.
sns.heatmap(corr_matrix, annot=True, cmap=plt.cm.RdYlBu_r, ax=ax)

plt.title('Pearson Correlation Metric')
# Uncomment to write the figure to disk:
# plt.savefig('../plots/corr.png')

In [None]:
# Correlation heatmap for a reduced feature set: suburb_single,
# dist_nearest_busstop and dist_nearest_primary_school are no longer included.
CORR_COLS = [
    'cost',
    'school_distance',
    'suburb_population',
    'suburb_renter',
    'dist_nearest_secondary_school',
    'dist_nearest_trainstation',
    'avg_income',
    'Tot_P_P',
    'Median_age_persons',
    'Beds',
    'Bath',
    'Parking',
]

fig, ax = plt.subplots(figsize=(20, 5))
corr_matrix = df_postcode[CORR_COLS].corr()
# annot=True prints each coefficient inside its cell.
sns.heatmap(corr_matrix, annot=True, cmap=plt.cm.RdYlBu_r, ax=ax)

plt.title('Pearson Correlation Metric')
# Uncomment to write the figure to disk:
# plt.savefig('../plots/corr.png')

In [None]:
# Min-max scale the response and the candidate predictors so that all columns
# share a common range before fitting the regression.
# The column list is defined once so that the values handed to the scaler and
# the labels of the resulting frame cannot drift apart.
SCALE_COLS = [
    'cost',
    'suburb_population',
    'suburb_renter',
    'suburb_family',
    'dist_nearest_secondary_school',
    'dist_nearest_trainstation',
    'avg_income',
    'Tot_P_P',
    'Median_age_persons',
    'Beds',
    'Bath',
    'Parking',
]

x = df_postcode[SCALE_COLS].values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

# Built from a bare array, so df1 carries a fresh default index rather than
# the postcode index of df_postcode.
df1 = pd.DataFrame(x_scaled, columns=SCALE_COLS)

In [None]:
# Display the min-max scaled frame to inspect it before modelling.
df1

In [None]:
# Fit an ordinary least squares model of the scaled cost on the full set of
# candidate predictors and print the regression summary.
# The formula is split across adjacent literals; the resulting string is
# identical to a single-literal formula with a backslash continuation.
FULL_FORMULA = (
    "cost ~ suburb_population + suburb_renter + suburb_family + dist_nearest_secondary_school + avg_income "
    "    + Tot_P_P + dist_nearest_trainstation + Median_age_persons + Bath + Parking"
)

full_model = ols(formula=FULL_FORMULA, data=df1)
fit = full_model.fit()
print(fit.summary())

In [None]:
# Refit with a reduced predictor set (the suburb_* terms and
# dist_nearest_secondary_school are dropped) and print the summary.
# `fit` is rebound to this reduced model.
REDUCED_FORMULA = "cost ~ avg_income + Tot_P_P + dist_nearest_trainstation + Median_age_persons + Bath + Parking"

reduced_model = ols(formula=REDUCED_FORMULA, data=df1)
fit = reduced_model.fit()
print(fit.summary())