In [1]:
import re
import json
import pandas as pd

In [2]:
# Load the raw property records. The JSON file holds one object whose values
# each carry a 'postcode' and a 'price' entry; only those two fields are kept.
with open('../data/raw/properties_dict.json', 'r') as f:
    properties_dict = json.load(f)

data = pd.DataFrame(
    [(listing['postcode'], listing['price']) for listing in properties_dict.values()],
    columns=['postcode', 'price'],
)

def process_price(text):
    '''Extract the first numeric amount from a free-text price string.

    Thousands separators (commas) are removed before matching. An amount
    prefixed with '$' is preferred; if the text contains none, the first
    bare number is used instead.

    Parameters
    ----------
    text : str
        Raw price text, e.g. "$1,200 per month".

    Returns
    -------
    float or str
        The amount as a float, or '' when no usable number is found
        (callers filter on ``!= ''``).
    '''
    cleaned = text.replace(',', '')

    # The decimal point is escaped on purpose: an unescaped '.' matches any
    # character, so "$500 2 bed" used to be captured as "$500 2" and blow up
    # in float(), and "500-600" was captured whole and discarded.
    matches = re.findall(r'@?\$\d+(?:\.\d+)?', cleaned)
    if not matches:
        matches = re.findall(r'@?\d+(?:\.\d+)?', cleaned)
    if not matches:
        return ''

    first = matches[0]
    # NOTE(review): a first match that starts with '@' has always been
    # treated as unparseable (float() rejected it); that behaviour is kept
    # here explicitly. Confirm whether '@'-prefixed amounts should be parsed.
    if first.startswith('@'):
        return ''
    if first.startswith('$'):
        first = first[1:]
    return float(first)

def get_freq(text):
    '''Classify the payment period mentioned in a price string.

    The text is lower-cased and checked for weekly, then monthly, then
    yearly keywords; the first group that matches wins.

    Parameters
    ----------
    text : str
        Raw price text, e.g. "$450 pw" or "$26,000 p.a.".

    Returns
    -------
    str
        'week', 'month' or 'year'. Text with no recognised keyword is
        treated as 'week'.
    '''
    text = text.lower()

    # The short abbreviations (pw, p/w, pm, p/m, pa, p.a.) are guarded so they
    # cannot match inside an ordinary word: "equipment" and "development"
    # contain "pm", "spa" ends in "pa". A letter lookaround is used instead of
    # \b so that forms glued to the number, such as "500pw", still match.
    if re.search(r"week|wk|(?<![a-z])p/?w(?![a-z])", text) is not None:
        return 'week'
    if re.search(r"month|(?<![a-z])p/?m(?![a-z])", text) is not None:
        return 'month'
    if re.search(r"\bannum|(?<![a-z])p\.?a\.?\b|annually|year", text) is not None:
        return 'year'
    # No period mentioned: fall back to weekly.
    return 'week'

# Parse each listing's price text into a payment period and a numeric amount.
data['freq'] = data.price.apply(get_freq)
data['processed_price'] = data.price.apply(process_price)

# process_price returns '' when it finds no number. Drop those rows, take an
# explicit copy so the assignments below modify this frame (not a view of the
# unfiltered one), and store the amounts as floats rather than objects.
data = data[data.processed_price != ''].copy()
data['processed_price'] = data.processed_price.astype(float)

# get_freq only ever returns 'week', 'month' or 'year', so no further
# clean-up of the freq column is needed before converting to weekly amounts.
is_monthly = data.freq == 'month'
is_yearly = data.freq == 'year'
data.loc[is_monthly, 'processed_price'] = data.loc[is_monthly, 'processed_price'] * 12 / 52
data.loc[is_yearly, 'processed_price'] = data.loc[is_yearly, 'processed_price'] / 52

# NOTE(review): row label 2164 is removed by hand, presumably a known bad
# record. The label depends on the order of the raw file; confirm it still
# points at the intended row whenever the raw data is refreshed.
data = data.drop(index=[2164])

# Discard weekly amounts of 10 or less.
data = data[data.processed_price > 10]

In [4]:
# Average the weekly price within each postcode and save the result as CSV.
mean_price = data.groupby('postcode')['processed_price'].mean()
mean_price.to_csv('../data/curated/mean_price_per_week.csv')