In [279]:
import pandas as pd
import re
from tqdm import tqdm

from datetime import datetime
from dateutil import relativedelta

# Take out the unreadable symbols from address
def process_address(txt):
    x = re.findall(r"[\w]+|[&,/#-]+", txt)
    #x = list(filter(lambda a: a != '', x))
    x =  list(filter(lambda a: a != 'amp', x))
    out = ' '.join(x)
    return out;

#Turn string into date type
def year_month_to_date(year,month):
    d1 = str(month)+','+str(year);
    return datetime.strptime(d1, "%B,%Y");


# MODIFY THIS FUNCTION FOR YOUR DATA
# get all the past prices
def get_pastprice_dict(df):
    
    duplicate_d ={}
    
    total = len(df)
    with tqdm(total=total) as pbar:
        for index, row in df.iterrows():
            
            address = row["address"]
            date = row["date"];
            rent = row["rent_pw"];
            
            if address not in duplicate_d:
                
                duplicate_d[address] =[];
            
            duplicate_d[address].append((date,rent));
            
            pbar.update(1)
            
    return duplicate_d

# Get only those that has this year's data
def get_most_recent(d):
    
    false_keys = [];
    
    for address in d :
        
        this_year =datetime.strptime("January,2022", "%B,%Y");
        
        # if the most recent date is not in this year
        
        if d[address][-1][0] < this_year:
            false_keys.append(address);
            
    for key in false_keys:
        
        d.pop(key, None)
        
    return d


#get the past three years monthly rate
def get_rate(recentprice_dict):
    
    monthrate_d ={}
    
    for address in tqdm(recentprice_dict):
        
        total_occurences = len(recentprice_dict[address]);
        
        start_date = recentprice_dict[address][0][0]
        start_price = recentprice_dict[address][0][1]
    
        end_date = recentprice_dict[address][-1][0]
        end_price = recentprice_dict[address][-1][1]
    
        delta_price = end_price - start_price;
    
        # Get the relativedelta between two dates
        delta = relativedelta.relativedelta(end_date, start_date)

        # get months difference
        delta_months = delta.months + (delta.years * 12);
        
        
        if delta_price !=0 and delta_months>0:
            
            #print(delta_price/delta_months)
            
            monthrate_d[address] = (total_occurences,delta_months,delta_price/delta_months);
    
        else:
    
            if delta_months > 6:
        
                monthrate_d[address] = (total_occurences,delta_months,0);
        
    return monthrate_d

def get_from_dict(key,d):
    
    if key in d:
        
        return d[key];
    else:
        return (float('nan'),float('nan'),float('nan'))
        

In [280]:
# Get data
vic_df = pd.read_csv("../data/raw/processed.csv");
geocoded_df = pd.read_csv("../data/raw/geocode.csv");

vic_df['address_processed'] = vic_df['address'].apply(lambda x : process_address(x))
vic2_df = vic_df.drop(columns=['address'])
vic2_df = vic2_df.rename({'address_processed':'address'},axis=1)
left_merged = pd.merge(vic2_df, geocoded_df, how="left", on=["address"]);

vicgecoded_df = left_merged[left_merged['latitude'].isnull() ==False].copy();

In [281]:
vic_df.head(3)

Unnamed: 0,year,month,bed,bath,car,address,suburb,code,type_1,rent_pw,address_processed
0,2015,March,2,1,,"APARTMENT /110 RUPERT ST, WEST FOOTSCRAY",West Footscray,3012,unit/apmt,270.0,"APARTMENT / 110 RUPERT ST , WEST FOOTSCRAY"
1,2015,February,2,1,1.0,"23 / 44 EVERARD STREET (CNR ESSEX ST), WEST FO...",West Footscray,3012,unit/apmt,355.0,"23 / 44 EVERARD STREET CNR ESSEX ST , WEST FOO..."
2,2015,February,2,1,,"8 WELLINGTON ST, WEST FOOTSCRAY",West Footscray,3012,house,295.0,"8 WELLINGTON ST , WEST FOOTSCRAY"


In [304]:
len(vic_df)

3389448

In [282]:
geocoded_df.head(5)

Unnamed: 0,address,latitude,longitude
0,"/ 71 SOUTH AVENUE , ALTONA MEADOWS",-37.882201,144.778653
1,"0 ELLIS TRACK , GLADYSDALE",-37.823809,145.66613
2,"0 LANCASTER DRIVE , MANGALORE",-36.895378,145.182803
3,"001 / 2 YARRA BING CRES , BURWOOD",-37.850569,145.12366
4,"001 / 2 YARRA BING CRESCENT , BURWOOD",-37.850569,145.12366


In [283]:
vicgecoded_df.head(3)

Unnamed: 0,year,month,bed,bath,car,suburb,code,type_1,rent_pw,address,latitude,longitude
6,2015,February,3,2,2.0,West Footscray,3012,house,420.0,"12 PARK AVENUE , WEST FOOTSCRAY",-37.808265,144.866029
7,2012,March,3,2,2.0,West Footscray,3012,house,400.0,"12 PARK AVENUE , WEST FOOTSCRAY",-37.808265,144.866029
25,2015,February,1,1,1.0,West Footscray,3012,unit/apmt,200.0,"12 / 697 BARKLY STREET , WEST FOOTSCRAY",-37.796557,144.867772


In [303]:
len(vicgecoded_df)

746824

In [284]:
# use your own data here

# vicgecoded_df = pd.read_csv("../data/raw/YOURDATA.csv");

In [285]:
vicgecoded_df['date'] = vicgecoded_df.apply(lambda row: year_month_to_date(row['year'],row['month']),axis=1);

In [286]:
threeyears_df = vicgecoded_df[vicgecoded_df['date'] >= '2019-01-01'];
sortedthree_df = threeyears_df.sort_values(by=['date']);

g = sortedthree_df.groupby('address')

ratable_df = g.filter(lambda x: len(x) > 1)
pastprice_dict = get_pastprice_dict(ratable_df)

recentprice_dict = get_most_recent(pastprice_dict);

monthly_rate_d = get_rate(recentprice_dict)

100%|██████████| 149064/149064 [00:05<00:00, 25211.98it/s]
100%|██████████| 26236/26236 [00:00<00:00, 48610.49it/s]


In [287]:
vicgecoded_df['rate_tpl'] = vicgecoded_df['address'].apply(lambda x : get_from_dict(x,monthly_rate_d))
vicgecoded_df[['month_occurences', 'month_delta','month_rate']] = pd.DataFrame(vicgecoded_df['rate_tpl'].tolist(), index=vicgecoded_df.index)
rate_df = vicgecoded_df.dropna()
rate_df = rate_df[['address','latitude','longitude','suburb','month_occurences', 'month_delta','month_rate']]
rate_df = rate_df.drop_duplicates()

In [288]:
rate_df[['month_rate']].describe()

Unnamed: 0,month_rate
count,25094.0
mean,138.706916
std,11163.650285
min,-889580.0
25%,0.0
50%,0.125
75%,1.097561
max,634510.0


In [289]:
rate_df[['month_rate']].quantile(q=0.9)

# Still make sense

month_rate    2.5
Name: 0.9, dtype: float64

In [290]:
rate_df[['month_rate']].quantile(q=0.1)

# Still make sense

month_rate   -1.31391
Name: 0.1, dtype: float64

In [291]:
rate_df[['month_rate']].quantile(q=0.95)
# does not make sense
    

month_rate    3.870968
Name: 0.95, dtype: float64

In [292]:
lb = rate_df[['month_rate']].quantile(q=0.1)['month_rate'];
ub = rate_df[['month_rate']].quantile(q=0.9)['month_rate'];
rate_df = rate_df[rate_df['month_rate'] >=lb].sort_values(by = 'month_rate');
rate_df = rate_df[rate_df['month_rate'] <=ub].sort_values(by = 'month_rate');

In [293]:
g_count = rate_df.groupby(['suburb']).count()

In [294]:
g_mean = rate_df.groupby(['suburb']).mean()

In [295]:
left_merged = pd.merge(g_count, g_mean, how="left", on=["suburb"],suffixes=('_count', '_mean'));
left_merged['address_count'] = left_merged['address']
suburb_df = left_merged[['address_count','month_rate_mean']]

In [296]:
suburb_df['address_count'].describe()

count    638.000000
mean      31.711599
std       43.641413
min        1.000000
25%        3.000000
50%       13.000000
75%       42.000000
max      286.000000
Name: address_count, dtype: float64

In [297]:
suburb_df = left_merged[['address_count','month_rate_mean']]
acceptablenumber_of_prop = 10;

suburb_df = suburb_df[suburb_df['address_count']>acceptablenumber_of_prop];
suburb_df.sort_values(by = 'month_rate_mean',ascending=False).head(10)

Unnamed: 0_level_0,address_count,month_rate_mean
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Armstrong Creek,23,1.66185
Epsom,13,1.537473
Black Hill,12,1.518667
Bairnsdale,15,1.477404
Leopold,16,1.465306
Wodonga,65,1.452
Herne Hill,12,1.436003
Bell Post Hill,13,1.417758
Quarry Hill,14,1.332744
Mount Pleasant,11,1.321603


In [298]:
suburb_df = left_merged[['address_count','month_rate_mean']]
acceptablenumber_of_prop = 20;

suburb_df = suburb_df[suburb_df['address_count']>acceptablenumber_of_prop];
suburb_df.sort_values(by = 'month_rate_mean',ascending=False).head(10)

Unnamed: 0_level_0,address_count,month_rate_mean
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Armstrong Creek,23,1.66185
Wodonga,65,1.452
Grovedale,42,1.319239
Rosebud,35,1.31418
Sale,43,1.305385
Golden Square,28,1.265066
Kilmore,21,1.237377
Lara,31,1.205822
Norlane,23,1.190496
Traralgon,91,1.181998


In [299]:
suburb_df = left_merged[['address_count','month_rate_mean']]

acceptablenumber_of_prop = 30;

suburb_df = suburb_df[suburb_df['address_count']>acceptablenumber_of_prop];
suburb_df.sort_values(by = 'month_rate_mean',ascending=False).head(10)

Unnamed: 0_level_0,address_count,month_rate_mean
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Wodonga,65,1.452
Grovedale,42,1.319239
Rosebud,35,1.31418
Sale,43,1.305385
Lara,31,1.205822
Traralgon,91,1.181998
Newtown,32,1.166454
Carrum Downs,50,1.063533
Belmont,58,1.062475
Officer,51,1.047595


In [300]:
suburb_df = left_merged[['address_count','month_rate_mean']]

acceptablenumber_of_prop = 40;

suburb_df = suburb_df[suburb_df['address_count']>acceptablenumber_of_prop];
suburb_df.sort_values(by = 'month_rate_mean',ascending=False).head(10)

Unnamed: 0_level_0,address_count,month_rate_mean
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Wodonga,65,1.452
Grovedale,42,1.319239
Sale,43,1.305385
Traralgon,91,1.181998
Carrum Downs,50,1.063533
Belmont,58,1.062475
Officer,51,1.047595
Highton,61,1.044877
Sebastopol,45,1.032337
Geelong West,42,1.023616


In [301]:
# metro_melb is a list of all LGAs of metropolitan melbourne
# that is defined https://liveinmelbourne.vic.gov.au/discover/melbourne-victoria/metropolitan-melbourne
metro_melb = ['Banyule', 'Bayside', 'Boroondara', 'Brimbank', 'Cardinia', 'Casey', 'Darebin', 'Frankston', 
              'Glen Eira', 'Greater Dandenong', 'Hobsons Bay', 'Hume', 'Kingston', 'Knox', 'Manningham', 
              'Maribyrnong', 'Maroondah', 'Melbourne', 'Melton', 'Monash', 'Moonee Valley', 'Moreland', 
              'Mornington Peninsula', 'Nillumbik', 'Port Phillip', 'Stonnington', 'Whitehorse', 'Whittlesea',
              'Wyndham', 'Yarra', 'Yarra Ranges']

In [302]:
suburb_df[suburb_df.index.isin(metro_melb)] #DOES NOT HAVE ENOUGH DATA

Unnamed: 0_level_0,address_count,month_rate_mean
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Frankston,145,0.77846
Maribyrnong,114,0.017278
Melbourne,155,0.063303
