In [None]:
%cd ../scripts
from backwardselim import back_ward_elim
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols, glm
import statsmodels.api as sm
import re

# Preparing data

Reading + Merging

In [None]:
# Load the three curated CSV inputs, in order: listings, school distances, neighbourhood stats.
data, school, neighbour = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/curated/listings_with_distances.csv",
        "../data/curated/Closest_and_average_dist_school.csv",
        "../data/curated/neigh_stat.csv",
    )
)

In [None]:
# Attach the school table (keyed by `listing_id`) and the neighbourhood table
# (keyed by `id`) to the listings; both merges use pandas' default join type.
data = data.merge(school, left_on="id", right_on="listing_id").merge(
    neighbour, left_on="id", right_on="id"
)

Extracting suburb from the address

In [None]:
def get_Suburb(x:str):
    """
    Extract the suburb name from a listing address.

    The suburb is the run of letters and spaces found between a ", "
    separator and the " VIC" state code, for example
    "1 Example St, Fitzroy North VIC 3068" -> "Fitzroy North".

    Parameters
    ----------
    x : str
        Address text. Non-string values (e.g. NaN for a missing address)
        are treated as "no suburb found".

    Returns
    -------
    str or None
        The suburb name without the leading ", " or trailing " VIC",
        or None when the address does not contain the expected pattern.
    """
    # Missing addresses arrive as float NaN; there is nothing to search in them.
    if not isinstance(x, str):
        return None
    # A capture group isolates the suburb directly. The previous clean-up pattern
    # ", || VIC" contained an empty alternative that always matched before " VIC"
    # could be tried, so the state suffix was never removed.
    match = re.search(r", ([A-Za-z ]+) VIC", x)
    return match.group(1) if match else None
# Derive the suburb of every listing from its address string.
data["Suburb"] = data["address"].apply(get_Suburb)

In [None]:
# Persist the merged listings (now including the "Suburb" column) without the index.
# NOTE(review): this is the same path `data` was originally read from, so the input
# file is overwritten with the merged frame. Re-running the notebook from the top
# would then merge already-merged data — confirm the overwrite is intended, or
# write to a separate output file.
data.to_csv("../data/curated/listings_with_distances.csv", index = False)

# Visualising the relationships/correlations between each feature and rental prices

In [None]:
# Every column kept for the analysis.
COLS = [
    "price", "Suburb", "beds", "propertyType",
    "closest_school", "dist_closest_school", "ave_dist_3_schools",
    "ClosestDstToShoppingCentre", "DstToCBD", "ClosestShop",
    "ClosestDstToStation", "ClosestStation",
    "ClosestDstToUni", "ClosestUni",
    "age0To19", "age20To39", "age40To59", "age60Plus",
    "longTermResident", "owner", "renter", "family", "single",
]
# Columns passed to `.corr()` when building the correlation tables.
CORR_COLS = [
    "price",
    "ClosestDstToShoppingCentre", "DstToCBD", "ClosestDstToStation", "ClosestDstToUni",
    "dist_closest_school", "ave_dist_3_schools",
    "age0To19", "age20To39", "age40To59", "age60Plus",
    "longTermResident", "owner", "renter", "family", "single",
]
# Label-valued columns (suburb and the names of the closest facilities).
DISCRETE_COLS = [
    "Suburb",
    "closest_school",
    "ClosestShop",
    "ClosestStation",
    "ClosestUni",
]
# Restrict the working frame to the analysis columns only.
data = data.loc[:, COLS]

Omit listings whose distance to any of these facilities is 100,000 or more (in the units of the distance columns): residents cannot reasonably be assumed to use facilities that far away, so such distances are considered irrelevant for determining rental prices.

In [None]:
# Keep only the listings where every facility distance is below 100000.
data = data[
    (
        data[
            [
                "ClosestDstToShoppingCentre",
                "ClosestDstToStation",
                "DstToCBD",
                "ClosestDstToUni",
            ]
        ]
        < 100000
    ).all(axis=1)
]


In [None]:
# Split the listings into (bedroom count, property type) segments.
bed_4_house, bed_3_house, bed_2_house = (
    data[(data["beds"] == n_beds) & (data["propertyType"] == "House")]
    for n_beds in (4, 3, 2)
)
bed_3_flat, bed_2_flat, bed_1_flat = (
    data[(data["beds"] == n_beds) & (data["propertyType"] == "Apartment / Unit / Flat")]
    for n_beds in (3, 2, 1)
)

Starting off with examining the correlations

In [None]:
# Pairwise correlations of the numeric features for 2-bedroom houses.
bed_2_house.loc[:, CORR_COLS].corr()

In [None]:
# Pairwise correlations of the numeric features for 1-bedroom flats.
bed_1_flat.loc[:, CORR_COLS].corr()

In [None]:
# Pairwise correlations of the numeric features for 2-bedroom flats.
bed_2_flat.loc[:, CORR_COLS].corr()

Followed by plotting to examine the details of the relationships

In [None]:
# Rental price against distance to the closest shopping centre,
# with one scatter + regression line per listing segment.
fig, ax = plt.subplots(figsize=(12, 6))
# x/y are passed by keyword: seaborn >= 0.12 makes them keyword-only, so the
# positional form raises a TypeError there (and only warns on 0.11).
for segment, colour, segment_label in (
    (bed_2_house, "blue", "2-bed house"),
    (bed_2_flat, "red", "2-bed flat"),
    (bed_1_flat, "orange", "1-bed flat"),
):
    sns.regplot(
        x="ClosestDstToShoppingCentre",
        y="price",
        data=segment,
        color=colour,
        label=segment_label,
        ax=ax,  # draw on the figure created above rather than the implicit current axes
    )
# Same viewing window as before.
ax.set_xlim(0, 6000)
ax.set_ylim(0, 1000)
# The legend identifies which colour belongs to which segment.
ax.legend();

In [None]:
# Rental price against distance to the closest station,
# with one scatter + regression line per listing segment.
fig, ax = plt.subplots(figsize=(12, 6))
# x/y are passed by keyword: seaborn >= 0.12 makes them keyword-only, so the
# positional form raises a TypeError there (and only warns on 0.11).
for segment, colour, segment_label in (
    (bed_2_house, "blue", "2-bed house"),
    (bed_2_flat, "red", "2-bed flat"),
    (bed_1_flat, "orange", "1-bed flat"),
):
    sns.regplot(
        x="ClosestDstToStation",
        y="price",
        data=segment,
        color=colour,
        label=segment_label,
        ax=ax,  # draw on the figure created above rather than the implicit current axes
    )
# Same viewing window as before.
ax.set_xlim(0, 6000)
ax.set_ylim(0, 1000)
# The legend identifies which colour belongs to which segment.
ax.legend();

## ANOVAS

Removing outliers for each suburb

In [None]:
import math
def remove_outlier(dataframe):
    """
    Remove price outliers from a dataframe, suburb by suburb.

    Within each suburb a record is kept when its price lies inside the closed
    interval [mean - 2*sd, mean + 2*sd], where mean and sd are the mean and
    population standard deviation of that suburb's prices. Because the bounds
    are inclusive, a suburb whose prices have no spread (for example a single
    listing) is kept rather than discarded.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a "Suburb" column and a numeric "price" column.

    Returns
    -------
    pandas.DataFrame
        The retained rows with their original index labels, ordered suburb by
        suburb in order of first appearance. Rows with a missing suburb or a
        missing price are not retained. When nothing is retained, an empty
        frame with the input's columns is returned.
    """
    kept = []
    # sort=False visits suburbs in order of first appearance; groupby skips
    # missing suburb keys by default.
    for _, listings in dataframe.groupby("Suburb", sort=False):
        prices = listings["price"]
        # mean() and std() both ignore missing prices, so the centre and the
        # spread are computed over the same observations.
        mean = prices.mean()
        sd = prices.std(ddof=0)  # population standard deviation
        # between() is inclusive on both ends, matching the documented interval.
        inliers = listings[prices.between(mean - 2 * sd, mean + 2 * sd)]
        if not inliers.empty:
            kept.append(inliers)
    if not kept:
        return dataframe.iloc[0:0].copy()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept)


In [None]:
# Strip per-suburb price outliers from each segment that will be modelled.
bed_1_flat_clean, bed_2_flat_clean, bed_2_house_clean = map(
    remove_outlier, (bed_1_flat, bed_2_flat, bed_2_house)
)

### Backward elimination

Backward elimination is used to test which features are the most significant in determining the price of an individual listing, and to see how much of the variance in rental prices these features can explain.

In [None]:
# Run backward elimination on each cleaned segment; every call yields a pair
# that is unpacked into the segment's `*_anova` and `*_model` names.
(
    (bed_1_flat_anova, bed_1_flat_model),
    (bed_2_flat_anova, bed_2_flat_model),
    (bed_2_house_anova, bed_2_house_model),
) = map(back_ward_elim, (bed_1_flat_clean, bed_2_flat_clean, bed_2_house_clean))

In [None]:
# Display the first object returned by back_ward_elim for the cleaned 1-bed flats.
bed_1_flat_anova

In [None]:
# Display the first object returned by back_ward_elim for the cleaned 2-bed flats.
bed_2_flat_anova

In [None]:
# Display the first object returned by back_ward_elim for the cleaned 2-bed houses.
bed_2_house_anova

In [None]:
# Display the summary of the model returned by back_ward_elim for the cleaned 1-bed flats.
bed_1_flat_model.summary()

In [None]:
# Display the summary of the model returned by back_ward_elim for the cleaned 2-bed flats.
bed_2_flat_model.summary()

In [None]:
# Display the summary of the model returned by back_ward_elim for the cleaned 2-bed houses.
bed_2_house_model.summary()