# Top Suburbs

In [None]:
import pandas as pd
from math import floor, ceil
import _pickle as cPickle

In [None]:
# most affordable suburb
# result_suburbs collects one list of suburb names per ranking step below
result_suburbs = []

In [None]:
# load the curated listing table used for the affordability ranking
df = pd.read_csv('../data/curated/processed_listing.csv')

In [None]:
# keep only the first row for each distinct address
df = df.drop_duplicates(subset='address')

In [None]:
# Attach the local government area (LGA) of every listing through its postcode
# and keep only the listings that fall inside metropolitan Melbourne.
# 'ignore' is the documented no-op mode of rename(): 'code' is renamed when it
# exists and the frame is left untouched otherwise.
df = df.rename(columns={'code': 'postcode'}, errors='ignore')
metro_melb = ['Banyule', 'Bayside', 'Boroondara', 'Brimbank', 'Cardinia', 'Casey', 'Darebin', 'Frankston', 'Glen Eira',
              'Greater Dandenong', 'Hobsons Bay', 'Hume', 'Kingston', 'Knox', 'Manningham', 'Maribyrnong', 'Maroondah',
              'Melbourne', 'Melton', 'Monash', 'Moonee Valley', 'Moreland', 'Mornington Peninsula', 'Nillumbik',
              'Port Phillip', 'Stonnington', 'Whitehorse', 'Whittlesea', 'Wyndham', 'Yarra', 'Yarra Ranges']
# read in the postcode information
postcode_df = pd.read_csv('../data/raw/abs/australian_postcodes.csv')
# only retain the rows and columns that are relevant, for a faster running time
postcode_df = postcode_df.loc[postcode_df['state'] == 'VIC', ['postcode', 'lgaregion']]
postcode_df = postcode_df.loc[postcode_df['lgaregion'].isin(metro_melb)]
# Keep a single row per postcode so the merge can never multiply listing rows.
# NOTE(review): the lookup presumably lists a postcode once per locality - confirm.
postcode_df = postcode_df.drop_duplicates(subset='postcode')
# Inner merge: listings whose postcode has no metropolitan LGA are dropped here.
df = pd.merge(df, postcode_df, on='postcode', validate='many_to_one')

In [None]:
# Per-suburb summary: mean weekly rent and number of listings.
# size() counts rows, so the listing count depends neither on which column
# happens to come first nor on missing values in that column.
stats_df = df.groupby('suburb').agg(
    ave_weekly_rent=('weekly_rent', 'mean'),
    n_listing=('weekly_rent', 'size'),
)

In [None]:
# Lower quartile of the per-suburb listing counts; later cells use it as the
# minimum number of listings a suburb needs in order to be ranked.
min_listing = stats_df['n_listing'].quantile(q=0.25)
print(min_listing)

In [None]:
# Rank suburbs from cheapest to dearest average rent, then discard the ones
# with fewer listings than the minimum threshold.
ranked_by_rent = stats_df.sort_values('ave_weekly_rent')
stats_df = ranked_by_rent[ranked_by_rent['n_listing'].ge(min_listing)]
stats_df.head()

In [None]:
# record the two suburbs with the lowest average weekly rent
result_suburbs.append(stats_df.index[:2].tolist())

In [None]:
# most livable suburb
# livability is very hard to define because it depends on a complex set of metrics
# such as Social Infrastructure, Walkability, Public Transport, Public Open Space, Housing Affordability and Local Employment, as defined by RMIT
# these features cannot be easily processed by a machine
# here a higher price is taken to mean a better suburb

In [None]:

# Columns that are discarded before the per-suburb averages are computed.
remove = [
    'address', 'postcode', 'url', 'loc_address', 'lat', 'lon',
    'list_date', 'list_history', 'list_count',
    'SA2_NAME_2016', 'SA2', 'lgaregion',
    'primary_school_name', 'secondary_school_name',
    'pri_lat', 'pri_lon', 'sec_lat', 'sec_lon',
    'closest_ed_name', 'ed_lat', 'ed_lon',
    'train_stop', 'train_lat', 'train_lon',
]
# Load the full listing table without its first column, then drop the
# columns listed above.
full_df = pd.read_csv('../data/curated/full_listing.csv').iloc[:, 1:]
full_df = full_df.drop(columns=remove)

In [None]:
# Keep only listings whose suburb has at least `min_listing` listings.
listings_per_suburb = full_df.groupby('suburb')['suburb'].transform('count')
full_df = full_df[listings_per_suburb >= min_listing]

In [None]:
# Average every numeric feature per suburb. numeric_only=True skips any text
# column that is still present instead of raising a TypeError on pandas >= 2.0.
rank_df = full_df.groupby('suburb').mean(numeric_only=True)
# Hold the dwelling configuration constant for every suburb so that the
# suburbs differ only in their remaining features.
rank_df[['bed', 'bath', 'car', 'type_code']] = [2, 1, 1, 4]

In [None]:
# Load the pickled regressor (presumably a random forest, judging by the file
# name). Unpickling can execute arbitrary code, so the file must be trusted.
with open('../models/rf_regressor', 'rb') as model_file:
    rf = cPickle.load(model_file)
# Model inputs: every per-suburb column except the rent itself, restricted to
# suburbs without missing values.
x = rank_df.drop(columns=['weekly_rent']).dropna()
x.head()

In [None]:
# Score each suburb with the model and order the suburbs from the highest to
# the lowest score.
prediction = pd.DataFrame({'suburb': x.index.tolist(), 'score': rf.predict(x)})
prediction = prediction.sort_values(by='score', ascending=False)
# Standardise the scores to zero mean and unit standard deviation.
score_mean = prediction['score'].mean()
score_std = prediction['score'].std()
prediction['score'] = (prediction['score'] - score_mean) / score_std
prediction

In [None]:
# Half-unit bin edges spanning the whole score range. The closing edge
# ceil(max) is included so that the highest scores also fall inside a bin.
lower_edge = floor(prediction['score'].min())
upper_edge = ceil(prediction['score'].max())
# Two half-unit steps per whole unit, plus one for the closing edge.
bins = [lower_edge + 0.5 * step for step in range(2 * (upper_edge - lower_edge) + 1)]
print(bins)

In [None]:
# histogram of the score column; no bins argument is passed, so pandas' default binning applies
prediction.hist(column='score')

In [None]:
# record the three suburbs with the highest scores
result_suburbs.append(prediction['suburb'].head(3).tolist())

In [None]:
# display the suburb lists collected so far
result_suburbs

In [None]:
# record the two suburbs at the bottom of the score ranking
result_suburbs.append(prediction['suburb'].tail(2).tolist())

In [None]:
# Flatten the list of per-step suburb lists into one flat list of names.
flattened_suburbs = []
for suburb_group in result_suburbs:
    flattened_suburbs.extend(suburb_group)
result_suburbs = flattened_suburbs

In [None]:
# Print the listing count and the column averages of every selected suburb.
for suburb in result_suburbs:
    # Filter once per suburb and reuse the result for the check and both prints.
    suburb_listings = full_df[full_df['suburb'] == suburb]
    if suburb_listings.empty:
        # Suburbs removed by the minimum-listing filter have nothing to report.
        continue
    print(f'=============={suburb}==============')
    print('Listing count: ', len(suburb_listings))
    # numeric_only=True leaves out the text 'suburb' column, which would
    # otherwise raise a TypeError on pandas >= 2.0.
    print(suburb_listings.mean(axis=0, numeric_only=True))