In [23]:
import pandas as pd
import re
from tqdm import tqdm

from datetime import datetime
from dateutil import relativedelta

# Take out the unreadable symbols from address
def process_address(txt):
    x = re.findall(r"[\w]+|[&,/#-]+", txt)
    #x = list(filter(lambda a: a != '', x))
    x =  list(filter(lambda a: a != 'amp', x))
    out = ' '.join(x)
    return out;

#Turn string into date type
def year_month_to_date(year,month):
    d1 = str(month)+','+str(year);
    return datetime.strptime(d1, "%B,%Y");


# MODIFY THIS FUNCTION FOR YOUR DATA
# get all the past prices
def get_pastprice_dict(df):
    
    duplicate_d ={}
    
    total = len(df)
    with tqdm(total=total) as pbar:
        for index, row in df.iterrows():
            
            address = row["address"]
            date = row["date"];
            rent = row["rent_pw"];
            
            if address not in duplicate_d:
                
                duplicate_d[address] =[];
            
            duplicate_d[address].append((date,rent));
            
            pbar.update(1)
            
    return duplicate_d

# Get only those that has this year's data
def get_most_recent(d):
    
    false_keys = [];
    
    for address in d :
        
        this_year =datetime.strptime("January,2022", "%B,%Y");
        
        # if the most recent date is not in this year
        
        if d[address][-1][0] < this_year:
            false_keys.append(address);
            
    for key in false_keys:
        
        d.pop(key, None)
        
    return d


#get the past three years monthly rate
def get_rate(recentprice_dict):
    
    monthrate_d ={}
    
    for address in tqdm(recentprice_dict):
        
        total_occurences = len(recentprice_dict[address]);
        
        start_date = recentprice_dict[address][0][0]
        start_price = recentprice_dict[address][0][1]
    
        end_date = recentprice_dict[address][-1][0]
        end_price = recentprice_dict[address][-1][1]
    
        delta_price = end_price - start_price;
    
        # Get the relativedelta between two dates
        delta = relativedelta.relativedelta(end_date, start_date)

        # get months difference
        delta_months = delta.months + (delta.years * 12);
        
        
        if delta_price !=0 and delta_months>0:
            
            #print(delta_price/delta_months)
            
            monthrate_d[address] = (total_occurences,delta_months,delta_price/delta_months);
    
        else:
    
            if delta_months > 6:
        
                monthrate_d[address] = (total_occurences,delta_months,0);
        
    return monthrate_d

def get_from_dict(key,d):
    
    if key in d:
        
        return d[key];
    else:
        return (float('nan'),float('nan'),float('nan'))
        

In [24]:
# Get data
vic_df = pd.read_csv("../data/raw/processed.csv");

vic_df['address_processed'] = vic_df['address'].apply(lambda x : process_address(x))
vic2_df = vic_df.drop(columns=['address'])
vic2_df = vic2_df.rename({'address_processed':'address'},axis=1)


In [25]:
vic_df.head(3)

Unnamed: 0,year,month,bed,bath,car,address,suburb,code,type_1,rent_pw,address_processed
0,2015,March,2,1,,"APARTMENT /110 RUPERT ST, WEST FOOTSCRAY",West Footscray,3012,unit/apmt,270.0,"APARTMENT / 110 RUPERT ST , WEST FOOTSCRAY"
1,2015,February,2,1,1.0,"23 / 44 EVERARD STREET (CNR ESSEX ST), WEST FO...",West Footscray,3012,unit/apmt,355.0,"23 / 44 EVERARD STREET CNR ESSEX ST , WEST FOO..."
2,2015,February,2,1,,"8 WELLINGTON ST, WEST FOOTSCRAY",West Footscray,3012,house,295.0,"8 WELLINGTON ST , WEST FOOTSCRAY"


In [26]:
len(vic_df)

3389448

In [27]:
vic2_df['date'] = vic2_df.apply(lambda row: year_month_to_date(row['year'],row['month']),axis=1);

In [28]:
threeyears_df = vic2_df[vic2_df['date'] >= '2019-01-01'];
sortedthree_df = threeyears_df.sort_values(by=['date']);

# take out the single occurence
g = sortedthree_df.groupby('address')
ratable_df = g.filter(lambda x: len(x) > 1)

pastprice_dict = get_pastprice_dict(ratable_df)
recentprice_dict = get_most_recent(pastprice_dict);

monthly_rate_d = get_rate(recentprice_dict)

100%|██████████| 683830/683830 [00:29<00:00, 23569.82it/s]
100%|██████████| 91596/91596 [00:01<00:00, 51124.77it/s]


In [30]:
vic2_df['rate_tpl'] = vic2_df['address'].apply(lambda x : get_from_dict(x,monthly_rate_d))
vic2_df[['month_occurences', 'month_delta','month_rate']] = pd.DataFrame(vic2_df['rate_tpl'].tolist(), index=vic2_df.index)
rate_df = vic2_df.dropna()
rate_df = rate_df[['address','suburb','month_occurences', 'month_delta','month_rate']]
rate_df = rate_df.drop_duplicates()

In [31]:
rate_df[['month_rate']].describe()

Unnamed: 0,month_rate
count,70008.0
mean,-736.8633
std,26846.47
min,-1049550.0
25%,-0.3333333
50%,0.0
75%,1.153846
max,1149350.0


In [32]:
rate_df[['month_rate']].quantile(q=0.9)

# Still make sense

month_rate    2.822185
Name: 0.9, dtype: float64

In [33]:
rate_df[['month_rate']].quantile(q=0.1)

# Still make sense

month_rate   -2.058824
Name: 0.1, dtype: float64

In [34]:
rate_df[['month_rate']].quantile(q=0.95)
# does not make sense
    

month_rate    5.0
Name: 0.95, dtype: float64

In [35]:
lb = rate_df[['month_rate']].quantile(q=0.1)['month_rate'];
ub = rate_df[['month_rate']].quantile(q=0.9)['month_rate'];
rate_df = rate_df[rate_df['month_rate'] >=lb].sort_values(by = 'month_rate');
rate_df = rate_df[rate_df['month_rate'] <=ub].sort_values(by = 'month_rate');

In [36]:
g_count = rate_df.groupby(['suburb']).count()

In [37]:
g_mean = rate_df.groupby(['suburb']).mean()

In [38]:
left_merged = pd.merge(g_count, g_mean, how="left", on=["suburb"],suffixes=('_count', '_mean'));
left_merged['address_count'] = left_merged['address']
suburb_df = left_merged[['address_count','month_rate_mean']]

In [39]:
suburb_df['address_count'].describe()

count    814.000000
mean      68.831695
std      110.324501
min        1.000000
25%        2.000000
50%       19.000000
75%       89.000000
max      775.000000
Name: address_count, dtype: float64

In [40]:
suburb_df = left_merged[['address_count','month_rate_mean']]
acceptablenumber_of_prop = 10;

suburb_df = suburb_df[suburb_df['address_count']>acceptablenumber_of_prop];
suburb_df.sort_values(by = 'month_rate_mean',ascending=False).head(10)

Unnamed: 0_level_0,address_count,month_rate_mean
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Capel Sound,17,1.716019
Tootgarook,15,1.62115
Sale,129,1.579713
Woodend,20,1.53907
Mount Martha,32,1.481044
Leopold,42,1.447648
Beechworth,16,1.431504
Killara,13,1.391201
Charlemont,27,1.362827
Wodonga,221,1.362591


In [41]:
suburb_df = left_merged[['address_count','month_rate_mean']]
acceptablenumber_of_prop = 20;

suburb_df = suburb_df[suburb_df['address_count']>acceptablenumber_of_prop];
suburb_df.sort_values(by = 'month_rate_mean',ascending=False).head(10)

Unnamed: 0_level_0,address_count,month_rate_mean
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Sale,129,1.579713
Mount Martha,32,1.481044
Leopold,42,1.447648
Charlemont,27,1.362827
Wodonga,221,1.362591
Bairnsdale,46,1.327361
Quarry Hill,44,1.302821
Strathdale,28,1.300079
Benalla,80,1.298682
Curlewis,32,1.296659


In [42]:
suburb_df = left_merged[['address_count','month_rate_mean']]

acceptablenumber_of_prop = 30;

suburb_df = suburb_df[suburb_df['address_count']>acceptablenumber_of_prop];
suburb_df.sort_values(by = 'month_rate_mean',ascending=False).head(10)

Unnamed: 0_level_0,address_count,month_rate_mean
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Sale,129,1.579713
Mount Martha,32,1.481044
Leopold,42,1.447648
Wodonga,221,1.362591
Bairnsdale,46,1.327361
Quarry Hill,44,1.302821
Benalla,80,1.298682
Curlewis,32,1.296659
Armstrong Creek,90,1.292695
Bell Post Hill,43,1.287034


In [43]:
suburb_df = left_merged[['address_count','month_rate_mean']]

acceptablenumber_of_prop = 40;

suburb_df = suburb_df[suburb_df['address_count']>acceptablenumber_of_prop];
suburb_df.sort_values(by = 'month_rate_mean',ascending=False).head(10)

Unnamed: 0_level_0,address_count,month_rate_mean
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Sale,129,1.579713
Leopold,42,1.447648
Wodonga,221,1.362591
Bairnsdale,46,1.327361
Quarry Hill,44,1.302821
Benalla,80,1.298682
Armstrong Creek,90,1.292695
Bell Post Hill,43,1.287034
East Geelong,45,1.235245
Torquay,93,1.223593


In [44]:
# metro_melb is a list of all LGAs of metropolitan melbourne
# that is defined https://liveinmelbourne.vic.gov.au/discover/melbourne-victoria/metropolitan-melbourne
metro_melb = ['Banyule', 'Bayside', 'Boroondara', 'Brimbank', 'Cardinia', 'Casey', 'Darebin', 'Frankston', 
              'Glen Eira', 'Greater Dandenong', 'Hobsons Bay', 'Hume', 'Kingston', 'Knox', 'Manningham', 
              'Maribyrnong', 'Maroondah', 'Melbourne', 'Melton', 'Monash', 'Moonee Valley', 'Moreland', 
              'Mornington Peninsula', 'Nillumbik', 'Port Phillip', 'Stonnington', 'Whitehorse', 'Whittlesea',
              'Wyndham', 'Yarra', 'Yarra Ranges']

In [45]:
suburb_df[suburb_df.index.isin(metro_melb)] #DOES NOT HAVE ENOUGH DATA

Unnamed: 0_level_0,address_count,month_rate_mean
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1
Frankston,405,0.755455
Maribyrnong,299,-0.072907
Melbourne,571,-0.144563
Melton,115,0.254641
