In [2]:
import pandas as pd
import re

In [3]:
def match_percentage(txt):
    """Parse the first number found in ``txt`` as a percentage.

    ``txt`` is converted with ``str`` before matching, so non-string values
    (``None``, NaN, numbers) are accepted.

    Returns:
        The first run of digits (with an optional decimal part) divided by
        100 as a float, e.g. ``"74%"`` -> ``0.74``; ``None`` when ``txt``
        contains no digits.
    """
    # Raw string: "\d" and "\." in a plain literal are invalid escape
    # sequences and trigger a warning on current Python versions.
    found = re.search(r"\d+\.?\d*", str(txt))
    if found is None:
        return None
    return float(found.group()) / 100
    
def match_price(txt):
    """Parse the first price found in ``txt`` into a plain number.

    ``txt`` is converted with ``str`` before matching. Thousands separators
    (commas) are removed first so that ``"$1,715,000"`` is read as one
    number instead of stopping at the first comma. A trailing ``m``/``M``
    multiplies the value by one million and a trailing ``k``/``K`` by one
    thousand.

    Returns:
        The price as a float, or ``None`` when ``txt`` contains no digits.
    """
    cleaned = str(txt).replace(",", "")
    # Raw string avoids the invalid "\d" / "\." escapes of a plain literal.
    # Group 1 is the number, group 2 the optional magnitude suffix.
    found = re.search(r"(\d+\.?\d*)([mMkK]?)", cleaned)
    if found is None:
        return None

    number, suffix = found.groups()
    amount = float(number)
    if suffix in ("m", "M"):  # price in millions
        return amount * 1000000
    if suffix in ("k", "K"):  # price in thousands
        return amount * 1000
    return amount
                        
def match_int(txt):
    """Return the first run of digits in ``txt`` as an int.

    ``txt`` is converted with ``str`` before matching. Matching stops at the
    first non-digit character, so separators such as ``,`` or ``.`` end the
    number.

    Returns:
        The parsed integer, or ``None`` when ``txt`` contains no digits.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    found = re.search(r"\d+", str(txt))
    if found is None:
        return None
    return int(found.group())
            
def match_population(txt):
    """Return the first integer in ``txt``, ignoring comma separators.

    ``txt`` is converted with ``str`` and every ``,`` is removed before
    matching, so ``"14,330"`` is read as ``14330``.

    Returns:
        The parsed integer, or ``None`` when ``txt`` contains no digits.
    """
    without_commas = str(txt).replace(",", "")
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    found = re.search(r"\d+", without_commas)
    if found is None:
        return None
    return int(found.group())
    
def match_range(txt):
    """Return the first ``"<digits> to <digits>"`` substring of ``txt``.

    ``txt`` is converted with ``str`` before matching. The matched text is
    returned unchanged as a string (e.g. ``"20 to 39"``); it is not split
    into numeric bounds.

    Returns:
        The matched range text, or ``None`` when no such pattern occurs.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    found = re.search(r"\d+ to \d+", str(txt))
    if found is None:
        return None
    return found.group()

In [8]:
# Load the raw scrape output (a JSON file) into a DataFrame. The path is
# relative, so it resolves against the notebook's working directory.
raw_df = pd.read_json("../data/raw/scrape_2022-08-31_07-50-15-714711.json")

In [5]:
# Start from an empty frame; the parsed columns are assigned onto it below.
cur_df = pd.DataFrame()

In [6]:
# Parser applied to each output column, listed in the order the columns are
# added to cur_df.
column_parsers = [
    ("neighbourhood_under_20", match_percentage),
    ("neighbourhood_20_to_39", match_percentage),
    ("neighbourhood_40_to_59", match_percentage),
    ("neighbourhood_above_60", match_percentage),
    ("neighbourhood_long_term_residents", match_percentage),
    ("neighbourhood_owners", match_percentage),
    ("neighbourhood_renter", match_percentage),
    ("neighbourhood_family", match_percentage),
    ("neighbourhood_single", match_percentage),
    ("performance_median_price", match_price),
    ("performance_auction_clearance", match_percentage),
    ("performance_sold_this_year", match_int),
    ("performance_avg_days_on_market", match_int),
    ("demographic_population", match_population),
    ("demographic_average_age_low", match_range),
    ("demographic_owner", match_percentage),
    ("demographic_renter", match_percentage),
    ("demographic_family", match_percentage),
    ("demographic_single", match_percentage),
]

# Output columns whose raw source column carries a different name.
# NOTE(review): despite the "_low" suffix, this column stores the full range
# text returned by match_range (e.g. "20 to 39") - confirm that is intended.
source_overrides = {"demographic_average_age_low": "demographic_average_age"}

for column, parser in column_parsers:
    source = source_overrides.get(column, column)
    cur_df[column] = raw_df[source].apply(parser)

# Coordinates are not text-parsed, only cast to float.
for coordinate in ("latitude", "longitude"):
    cur_df[coordinate] = raw_df[coordinate].astype(float)

In [7]:
# Show the parsed frame as the cell output (pandas elides the middle rows and
# columns of a large frame in the rendered table).
cur_df

Unnamed: 0,neighbourhood_under_20,neighbourhood_20_to_39,neighbourhood_40_to_59,neighbourhood_above_60,neighbourhood_long_term_residents,neighbourhood_owners,neighbourhood_renter,neighbourhood_family,neighbourhood_single,performance_median_price,...,performance_sold_this_year,performance_avg_days_on_market,demographic_population,demographic_average_age_low,demographic_owner,demographic_renter,demographic_family,demographic_single,latitude,longitude
0,0.10,0.74,0.13,0.03,,0.26,0.74,0.44,0.56,360000.0,...,28.0,116.0,5498.0,20 to 39,0.36,0.64,0.27,0.73,-37.810221,144.950693
1,0.12,0.33,0.27,0.29,0.30,0.48,0.52,0.53,0.47,520000.0,...,51.0,56.0,4963.0,20 to 39,0.45,0.55,0.36,0.64,-37.809191,144.981595
2,0.20,0.20,0.21,0.39,0.40,0.55,0.45,0.59,0.41,677000.0,...,185.0,83.0,14330.0,20 to 39,0.55,0.45,0.44,0.56,-37.831693,145.056056
3,0.25,0.31,0.24,0.20,0.53,0.59,0.41,0.58,0.42,1715000.0,...,23.0,91.0,10056.0,20 to 39,0.69,0.31,0.52,0.48,-37.853598,145.039578
4,0.14,0.37,0.32,0.18,0.34,0.44,0.56,0.46,0.54,420000.0,...,269.0,77.0,20218.0,20 to 39,0.35,0.65,0.22,0.78,-37.870931,144.977469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,0.29,0.30,0.27,0.15,0.54,0.58,0.42,0.50,0.50,780000.0,...,65.0,71.0,14144.0,20 to 39,0.51,0.49,0.35,0.65,-37.757557,144.945375
997,0.17,0.37,0.24,0.23,0.61,0.26,0.74,0.45,0.55,443000.0,...,29.0,56.0,12339.0,20 to 39,0.48,0.52,0.31,0.69,-37.778485,144.988253
998,0.17,0.37,0.24,0.23,0.61,0.26,0.74,0.45,0.55,443000.0,...,29.0,56.0,12339.0,20 to 39,0.48,0.52,0.31,0.69,-37.778485,144.988253
999,0.10,0.53,0.25,0.12,0.28,0.30,0.70,0.40,0.60,636000.0,...,174.0,44.0,13100.0,20 to 39,0.46,0.54,0.33,0.67,-37.863198,145.009049
