In [39]:
import warnings
import pandas as pd

# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices from pandas.
warnings.filterwarnings('ignore')

# Flats (1, 2 and 3 bedrooms), read in that order
one_bed_flat, two_bed_flat, three_bed_flat = map(
    pd.read_csv,
    (
        '../data/landing/rent_history_1 bedroom flat.csv',
        '../data/landing/rent_history_2 bedroom flat.csv',
        '../data/landing/rent_history_3 bedroom flat.csv',
    ),
)

# Houses (2, 3 and 4 bedrooms), read in that order
two_bed_house, three_bed_house, four_bed_house = map(
    pd.read_csv,
    (
        '../data/landing/rent_history_2 bedroom house.csv',
        '../data/landing/rent_history_3 bedroom house.csv',
        '../data/landing/rent_history_4 bedroom house.csv',
    ),
)

# All properties
all_properties = pd.read_csv('../data/landing/rent_history_All properties.csv')

In [40]:
def format_df(df: pd.DataFrame) -> pd.DataFrame:
    '''
        Rebuild the column names of a rent-history dataframe from its first two rows.

        The df, originally from a xlsx file, had mismatched columns: the real header is
        spread over the first two data rows. Row 0 holds a month label that is only
        present above the first column of each group, and row 1 holds the label of each
        individual column. The two are joined, lower-cased and spaces are replaced with
        underscores (e.g. 'Mar 2000' + 'Count' -> 'mar_2000_count').

        Parameters
        ----------
        df : pd.DataFrame
            Frame as loaded from the landing csv. Its first column is discarded and its
            second column becomes 'suburb'. The input is not modified.

        Returns
        -------
        pd.DataFrame
            A new frame without the first column and without the two header rows, with
            the rebuilt column names. The remaining rows keep their original index labels.
    '''
    # Drop the first column (unimportant)
    df = df.drop(columns=df.columns[0])

    # Carry each month label to the columns on its right that have none.
    # The first remaining column is skipped because it is named 'suburb' below.
    # Series.ffill() replaces the deprecated fillna(method='ffill').
    months = df.iloc[0, 1:].ffill()
    measures = df.iloc[1, 1:]

    # Combine the first and second row into one label per column
    combined = [f"{month} {measure}".strip() for month, measure in zip(months, measures)]

    # Standardise column names
    header = ['suburb'] + [label.lower().replace(' ', '_') for label in combined]

    # Drop the two header rows by position, so the function does not depend on the
    # frame carrying the default 0..n-1 index labels.
    formatted = df.iloc[2:].copy()
    formatted.columns = header

    return formatted

# Run every loaded frame through format_df, rebinding each name to its formatted version.
# NOTE(review): the inputs are overwritten, so re-running this cell without re-running
# the load cell would apply format_df to already formatted frames.
(
    one_bed_flat,
    two_bed_flat,
    three_bed_flat,
    two_bed_house,
    three_bed_house,
    four_bed_house,
    all_properties,
) = map(
    format_df,
    (
        one_bed_flat,
        two_bed_flat,
        three_bed_flat,
        two_bed_house,
        three_bed_house,
        four_bed_house,
        all_properties,
    ),
)

In [33]:
# Flag the rows whose 'mar_2000_count' entry is the '-' placeholder
mask = one_bed_flat['mar_2000_count'].eq('-')

# Keep only the flagged rows and display them
rows_with_dash = one_bed_flat.loc[mask]
rows_with_dash

Unnamed: 0,suburb,mar_2000_count,mar_2000_median,jun_2000_count,jun_2000_median,sep_2000_count,sep_2000_median,dec_2000_count,dec_2000_median,mar_2001_count,...,mar_2023_count,mar_2023_median,jun_2023_count,jun_2023_median,sep_2023_count,sep_2023_median,dec_2023_count,dec_2023_median,mar_2024_count,mar_2024_median
8,Docklands,-,-,-,-,-,-,-,-,-,...,1197,460,1140,490,1103,510,1091,525,1086,550
31,Canterbury-Surrey Hills-Mont Albert,-,-,-,-,11,160,-,-,-,...,24,390,30,388,35,400,39,420,47,420
46,Brighton East,-,-,-,-,11,150,-,-,-,...,-,-,-,-,11,227,12,268,13,310
60,Melton,-,-,-,-,-,-,-,-,-,...,39,350,36,360,36,369,39,369,34,369
64,Sydenham,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
73,Craigieburn,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
77,Keilor,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
89,Mill Park-Epping,-,-,-,-,-,-,-,-,-,...,95,229,87,244,84,249,92,280,51,350
93,Thomastown-Lalor,-,-,-,-,19,95,23,95,25,...,32,310,31,320,28,323,28,333,23,335
95,Whittlesea,-,-,-,-,-,-,-,-,-,...,11,300,12,310,-,-,-,-,-,-


In [None]:
# Save all the formatted CSVs.
# TODO: decide whether they belong in the curated/ or the raw/ folder. Both variants
# are kept below inside a string literal, so running this cell writes no files.
# NOTE(review): the to_csv calls do not pass index=False, so pandas would also write
# the row index as an extra leading column - confirm that this is intended.

'''
# Curated
one_bed_flat.to_csv('../data/curated/one_bed_flat_rent_history.csv')
two_bed_flat.to_csv('../data/curated/two_bed_flat_rent_history.csv')
three_bed_flat.to_csv('../data/curated/three_bed_flat_rent_history.csv')

two_bed_house.to_csv('../data/curated/two_bed_house_rent_history.csv')
three_bed_house.to_csv('../data/curated/three_bed_house_rent_history.csv')
four_bed_house.to_csv('../data/curated/four_bed_house_rent_history.csv')

all_properties.to_csv('../data/curated/all_properties_rent_history.csv')

# Raw
one_bed_flat.to_csv('../data/raw/one_bed_flat_rent_history.csv')
two_bed_flat.to_csv('../data/raw/two_bed_flat_rent_history.csv')
three_bed_flat.to_csv('../data/raw/three_bed_flat_rent_history.csv')

two_bed_house.to_csv('../data/raw/two_bed_house_rent_history.csv')
three_bed_house.to_csv('../data/raw/three_bed_house_rent_history.csv')
four_bed_house.to_csv('../data/raw/four_bed_house_rent_history.csv')

all_properties.to_csv('../data/raw/all_properties_rent_history.csv')
'''