In [255]:
import pandas as pd
import re
from tqdm import tqdm

from datetime import datetime
from dateutil import relativedelta

# Take out the unreadable symbols from address
def process_address(txt):
    """Normalise a raw address string into space-separated tokens.

    Runs of word characters and runs of the symbols ``& , / # -`` are kept as
    tokens; every other character (spaces included) only separates tokens and
    is dropped. Tokens that are exactly ``'amp'`` are discarded (presumably the
    residue of an HTML ``&amp;`` entity), and the remaining tokens are joined
    with single spaces.
    """
    tokens = [tok for tok in re.findall(r"[\w]+|[&,/#-]+", txt) if tok != 'amp']
    return ' '.join(tokens)

#Turn string into date type
def year_month_to_date(year, month):
    """Build a ``datetime`` from a year and a full month name.

    Both arguments are converted with ``str`` and parsed as ``"<month>,<year>"``
    using the format ``%B,%Y`` (e.g. ``"January,2022"``). The day is not part
    of the input, so the result falls on the first day of that month.
    """
    stamp = ','.join([str(month), str(year)])
    return datetime.strptime(stamp, "%B,%Y")


# MODIFY THIS FUNCTION FOR YOUR DATA
# get all the past prices
def get_pastprice_dict(df):
    """Collect every recorded rent for each address.

    Walks the rows of ``df`` (reading the ``address``, ``date`` and
    ``rent_pw`` columns) and returns a dict mapping each address to a list of
    ``(date, rent)`` tuples, in the order the rows appear in ``df``.
    A tqdm progress bar tracks the row loop.
    """
    history = {}

    for _, record in tqdm(df.iterrows(), total=len(df)):
        address, date, rent = record["address"], record["date"], record["rent_pw"]
        # First sighting of an address starts a new list; later ones extend it.
        history.setdefault(address, []).append((date, rent))

    return history

# Get only those that has this year's data
def get_most_recent(d, cutoff=None):
    """Keep only the addresses whose latest record is on or after ``cutoff``.

    Parameters
    ----------
    d : dict
        Maps address -> list of ``(date, rent)`` tuples. The LAST tuple of each
        list is treated as the most recent record, so the lists must already
        be in chronological order.
    cutoff : datetime, optional
        Earliest acceptable date for the last record. Defaults to
        1 January 2022, the value that used to be hard-coded here.

    Returns
    -------
    dict
        The same ``d`` object, modified in place: addresses whose last record
        is older than ``cutoff``, or that have no records at all, are removed.
    """
    if cutoff is None:
        cutoff = datetime(2022, 1, 1)

    # Collect first, delete afterwards: a dict must not change size while it
    # is being iterated. An empty history has no "latest" record, so it is
    # treated as stale instead of raising IndexError.
    stale = [address for address, records in d.items()
             if not records or records[-1][0] < cutoff]

    for address in stale:
        d.pop(address, None)

    return d


#get the past three years monthly rate
def get_rate(recentprice_dict):
    """Compute the average monthly rent change for each address.

    ``recentprice_dict`` maps address -> list of ``(date, rent)`` tuples; the
    first and last tuples of each list serve as the start and end observations.

    Returns a dict mapping address ->
    ``(number_of_records, months_between, rent_change_per_month)``.
    An address is included when the rent changed over a span of at least one
    whole month, or when the rent is unchanged over a span of more than six
    months (rate ``0``). Every other address is left out.
    """
    rates = {}

    for address in tqdm(recentprice_dict):
        records = recentprice_dict[address]
        n_records = len(records)
        first, last = records[0], records[-1]

        price_change = last[1] - first[1]

        # Whole months between the two observations (years folded into months).
        span = relativedelta.relativedelta(last[0], first[0])
        months = span.months + (span.years * 12)

        if price_change != 0 and months > 0:
            rates[address] = (n_records, months, price_change / months)
        elif months > 6:
            # Only reachable with an unchanged rent: record a flat rate.
            rates[address] = (n_records, months, 0)

    return rates

def get_from_dict(key, d):
    """Look up ``key`` in ``d``, falling back to a triple of NaNs.

    Returns ``d[key]`` when the key is present; otherwise returns a 3-tuple
    whose elements are all ``float('nan')``.
    """
    nan = float('nan')
    return d[key] if key in d else (nan, nan, nan)
        

In [256]:
# Load the raw listings, the listing features and the geocoded addresses.
# (feat_df is not referenced by the other cells shown in this notebook.)
vic_df = pd.read_csv("../data/raw/processed.csv")
feat_df = pd.read_csv("../data/raw/listing_with_features.csv")
geocoded_df = pd.read_csv("../data/raw/geocode.csv")

# Swap the raw address for its cleaned form, then join on it to pick up the
# columns of the geocode file.
vic_df['address_processed'] = vic_df['address'].apply(process_address)
vic2_df = vic_df.drop(columns=['address']).rename(columns={'address_processed': 'address'})
left_merged = vic2_df.merge(geocoded_df, how="left", on="address")

# Keep only the listings that have a latitude after the join.
vicgecoded_df = left_merged[left_merged['latitude'].notnull()].copy()

In [None]:
# use your own data here

# vicgecoded_df = pd.read_csv("../data/raw/YOURDATA.csv");

In [257]:
# Combine the month-name and year columns into one datetime column (first day
# of the month). Parsed in a single vectorised pd.to_datetime call instead of a
# row-wise DataFrame.apply, which runs a Python-level strptime once per row.
vicgecoded_df['date'] = pd.to_datetime(
    vicgecoded_df['month'].astype(str) + ',' + vicgecoded_df['year'].astype(str),
    format="%B,%Y",
)

In [258]:
# Restrict to listings dated on or after 1 Feb 2019 and order them by date.
threeyears_df = vicgecoded_df.loc[vicgecoded_df['date'] >= '2019-02-01']
sortedthree_df = threeyears_df.sort_values(by='date')

# A rate needs at least two observations: keep addresses seen more than once.
g = sortedthree_df.groupby('address')
ratable_df = g.filter(lambda listings: len(listings) > 1)

# address -> [(date, rent), ...] in the date order established above
pastprice_dict = get_pastprice_dict(ratable_df)

# Drop addresses without a recent record, then derive the per-month change.
recentprice_dict = get_most_recent(pastprice_dict)
monthly_rate_d = get_rate(recentprice_dict)

100%|██████████| 144419/144419 [00:06<00:00, 23950.94it/s]
100%|██████████| 25632/25632 [00:00<00:00, 50048.23it/s]


In [259]:
# Attach the (occurrences, months, rate) tuple to every listing; addresses that
# have no entry in monthly_rate_d receive a tuple of NaNs.
vicgecoded_df['rate_tpl'] = vicgecoded_df['address'].apply(
    lambda addr: get_from_dict(addr, monthly_rate_d)
)

# Spread the tuple over three separate columns, aligned on the original index.
vicgecoded_df[['month_occurences', 'month_delta', 'month_rate']] = pd.DataFrame(
    vicgecoded_df['rate_tpl'].tolist(), index=vicgecoded_df.index
)

# NOTE(review): dropna() without `subset` removes a row when ANY column is
# missing, not only the rate columns — confirm this is intended.
rate_df = vicgecoded_df.dropna()[
    ['address', 'latitude', 'longitude', 'suburb', 'month_occurences', 'month_delta', 'month_rate']
].drop_duplicates()

In [260]:
# Summary statistics of month_rate (change in rent_pw per month) over the rows of rate_df.
rate_df[['month_rate']].describe()

Unnamed: 0,month_rate
count,24309.0
mean,55.999909
std,14192.73064
min,-989500.0
25%,0.0
50%,0.0
75%,1.111111
max,634510.0


In [261]:
# 90th percentile of month_rate.
rate_df[['month_rate']].quantile(q=0.9)

# This value still makes sense.

month_rate    2.5
Name: 0.9, dtype: float64

In [262]:
# 10th percentile of month_rate.
rate_df[['month_rate']].quantile(q=0.1)

# This value still makes sense.

month_rate   -1.375
Name: 0.1, dtype: float64

In [263]:
# 95th percentile of month_rate.
rate_df[['month_rate']].quantile(q=0.95)
# Author's note: this value does not make sense.
    

month_rate    4.0
Name: 0.95, dtype: float64

In [264]:
# Trim rate_df to the 10th-90th percentile band of month_rate (both bounds
# inclusive), sorted ascending by month_rate.
# Note: rate_df is overwritten, so re-running this cell trims the data again.
lb = rate_df['month_rate'].quantile(q=0.1)
ub = rate_df['month_rate'].quantile(q=0.9)
rate_df = (
    rate_df
    .loc[lambda frame: frame['month_rate'] >= lb]
    .sort_values(by='month_rate')
    .loc[lambda frame: frame['month_rate'] <= ub]
    .sort_values(by='month_rate')
)

In [265]:
# Non-null value count of every column, per suburb.
g_count = rate_df.groupby('suburb').count()

In [266]:
# Mean of every numeric column, per suburb.
# numeric_only=True is needed because rate_df also carries the string column
# 'address': pandas >= 2.0 raises a TypeError when asked to average it, while
# older versions silently dropped it. The result is the same either way.
g_mean = rate_df.groupby(['suburb']).mean(numeric_only=True)

In [267]:
# Join the per-suburb counts with the per-suburb means; column names present
# in both frames receive the _count / _mean suffixes.
left_merged = g_count.merge(g_mean, how="left", on="suburb", suffixes=('_count', '_mean'))

# 'address' is read without a suffix here, i.e. it is expected to come from
# g_count only; its count is the number of rated addresses in the suburb.
left_merged['address_count'] = left_merged['address']
suburb_df = left_merged[['address_count', 'month_rate_mean']]

In [268]:
# Distribution of address_count (the per-suburb count of the address column) across suburbs.
suburb_df['address_count'].describe()

count    634.000000
mean      30.809148
std       42.331609
min        1.000000
25%        3.000000
50%       13.000000
75%       41.000000
max      275.000000
Name: address_count, dtype: float64

In [269]:
# Ten suburbs with the highest mean monthly rate, among suburbs that have
# more than `acceptablenumber_of_prop` rated addresses.
acceptablenumber_of_prop = 10

suburb_df = left_merged[['address_count', 'month_rate_mean']]
suburb_df = suburb_df.loc[suburb_df['address_count'] > acceptablenumber_of_prop]
suburb_df.sort_values(by='month_rate_mean', ascending=False).head(10)

Unnamed: 0_level_0,address_count,month_rate_mean
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Armstrong Creek,23,1.693621
Leopold,16,1.53475
Rosebud,29,1.512188
Wodonga,59,1.475369
Mount Eliza,11,1.372812
East Geelong,17,1.372806
Bell Post Hill,13,1.372509
Bacchus Marsh,24,1.360399
Strathdale,14,1.331661
Mount Pleasant,11,1.321603


In [253]:
# Ten suburbs with the highest mean monthly rate, among suburbs that have
# more than `acceptablenumber_of_prop` rated addresses.
acceptablenumber_of_prop = 20

suburb_df = left_merged[['address_count', 'month_rate_mean']]
suburb_df = suburb_df.loc[suburb_df['address_count'] > acceptablenumber_of_prop]
suburb_df.sort_values(by='month_rate_mean', ascending=False).head(10)

Unnamed: 0_level_0,address_count,month_rate_mean
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Armstrong Creek,23,1.693621
Rosebud,29,1.512188
Wodonga,59,1.475369
Bacchus Marsh,24,1.360399
Sale,43,1.27454
Grovedale,40,1.260201
Kilmore,21,1.196138
Lara,29,1.189164
Traralgon,87,1.189086
Norlane,22,1.187318


In [254]:
# Ten suburbs with the highest mean monthly rate, among suburbs that have
# more than `acceptablenumber_of_prop` rated addresses.
acceptablenumber_of_prop = 30

suburb_df = left_merged[['address_count', 'month_rate_mean']]
suburb_df = suburb_df.loc[suburb_df['address_count'] > acceptablenumber_of_prop]
suburb_df.sort_values(by='month_rate_mean', ascending=False).head(10)

Unnamed: 0_level_0,address_count,month_rate_mean
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Wodonga,59,1.475369
Sale,43,1.27454
Grovedale,40,1.260201
Traralgon,87,1.189086
Newtown,31,1.113302
Officer,49,1.10016
Belmont,55,1.082384
Highton,55,1.077504
Horsham,65,1.033675
Warragul,41,1.012534


In [270]:
# Ten suburbs with the highest mean monthly rate, among suburbs that have
# more than `acceptablenumber_of_prop` rated addresses.
acceptablenumber_of_prop = 40

suburb_df = left_merged[['address_count', 'month_rate_mean']]
suburb_df = suburb_df.loc[suburb_df['address_count'] > acceptablenumber_of_prop]
suburb_df.sort_values(by='month_rate_mean', ascending=False).head(10)

Unnamed: 0_level_0,address_count,month_rate_mean
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Wodonga,59,1.475369
Sale,43,1.27454
Traralgon,87,1.189086
Officer,49,1.10016
Belmont,55,1.082384
Highton,55,1.077504
Horsham,65,1.033675
Warragul,41,1.012534
Carrum Downs,47,1.008954
Sebastopol,44,1.000895


In [241]:
# metro_melb lists the local government areas (LGAs) of metropolitan Melbourne,
# as defined at https://liveinmelbourne.vic.gov.au/discover/melbourne-victoria/metropolitan-melbourne
metro_melb = ['Banyule', 'Bayside', 'Boroondara', 'Brimbank', 'Cardinia', 'Casey', 'Darebin', 'Frankston', 
              'Glen Eira', 'Greater Dandenong', 'Hobsons Bay', 'Hume', 'Kingston', 'Knox', 'Manningham', 
              'Maribyrnong', 'Maroondah', 'Melbourne', 'Melton', 'Monash', 'Moonee Valley', 'Moreland', 
              'Mornington Peninsula', 'Nillumbik', 'Port Phillip', 'Stonnington', 'Whitehorse', 'Whittlesea',
              'Wyndham', 'Yarra', 'Yarra Ranges']

In [247]:
# NOTE(review): metro_melb holds LGA names while suburb_df is indexed by suburb, so only
# suburbs that happen to share a name with an LGA can match — confirm this comparison is intended.
suburb_df[suburb_df.index.isin(metro_melb)] #DOES NOT HAVE ENOUGH DATA

Unnamed: 0_level_0,address_count,month_rate_mean
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Frankston,141,0.763199
Maribyrnong,113,-0.003673
Melbourne,163,0.031653
Melton,35,0.286128
