### Import Libraries & Read Data

In [8]:
import warnings
import pandas as pd

# Silence every warning for the remainder of the session
warnings.filterwarnings('ignore')

# Landing CSVs, listed in the same order as the names they are unpacked into below
landing_paths = [
    # Flats
    '../data/landing/rent_history_1 bedroom flat.csv',
    '../data/landing/rent_history_2 bedroom flat.csv',
    '../data/landing/rent_history_3 bedroom flat.csv',
    # Houses
    '../data/landing/rent_history_2 bedroom house.csv',
    '../data/landing/rent_history_3 bedroom house.csv',
    '../data/landing/rent_history_4 bedroom house.csv',
    # All properties
    '../data/landing/rent_history_All properties.csv',
]

# One frame per file, read in list order
(
    one_bed_flat, two_bed_flat, three_bed_flat,
    two_bed_house, three_bed_house, four_bed_house,
    all_properties,
) = [pd.read_csv(csv_path) for csv_path in landing_paths]

### Format Column Names Correctly

In [9]:
def format_df(df):
    '''
    Rebuild a frame's column names from the two header rows embedded in its data.

    The frames come from CSVs exported from an xlsx sheet whose header did not
    line up with the columns: after loading, row 0 carries a label (the month)
    that is only filled in for some columns, and row 1 carries a second label
    for every column. The two are joined into names such as
    ``mar_2000_count`` / ``mar_2000_median``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame as loaded from the landing CSV. Its first column is discarded,
        its second column holds the suburb names, and its first two rows hold
        the header labels described above.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) whose first column is named
        ``suburb``, whose remaining columns are lower-case, underscore-joined
        combinations of the two header rows, and from which the two header
        rows have been removed. Original index labels of the data rows are kept.
    '''
    # Drop the first column (unimportant)
    body = df.drop(columns=df.columns[0])

    # Row 0 only names the month on some columns; carry each label rightwards
    # so every data column has one. `.ffill()` replaces the deprecated
    # `fillna(method='ffill')`. The suburb column (position 0) is excluded.
    months = body.iloc[0, 1:].ffill()
    measures = body.iloc[1, 1:]

    # Combine the two header rows and standardise: lower case, spaces -> '_'
    data_names = [
        f"{month} {measure}".strip().lower().replace(' ', '_')
        for month, measure in zip(months, measures)
    ]

    # Drop the two header rows by position, so this does not depend on the
    # frame still having its default 0..n-1 index labels
    body = body.iloc[2:].copy()

    # Name the first column 'suburb' by position rather than by its generated
    # label, which cannot accidentally rename any other column
    body.columns = ['suburb', *data_names]

    return body

# Run the header clean-up over every frame, one dwelling type at a time
one_bed_flat, two_bed_flat, three_bed_flat = [
    format_df(flat) for flat in (one_bed_flat, two_bed_flat, three_bed_flat)
]

two_bed_house, three_bed_house, four_bed_house = [
    format_df(house) for house in (two_bed_house, three_bed_house, four_bed_house)
]

# The combined table is handled last and shown as the cell's output
all_properties = format_df(all_properties)
all_properties

Unnamed: 0,suburb,mar_2000_count,mar_2000_median,jun_2000_count,jun_2000_median,sep_2000_count,sep_2000_median,dec_2000_count,dec_2000_median,mar_2001_count,...,mar_2023_count,mar_2023_median,jun_2023_count,jun_2023_median,sep_2023_count,sep_2023_median,dec_2023_count,dec_2023_median,mar_2024_count,mar_2024_median
2,Albert Park-Middle Park-West St Kilda,1143,260,1134,260,1177,270,1178,275,1208,...,796,545,740,550,730,600,720,600,671,650
3,Armadale,733,200,737,200,738,205,739,210,718,...,757,490,687,500,639,525,594,560,566,560
4,Carlton North,864,260,814,260,799,265,736,270,718,...,497,620,495,630,467,650,418,670,384,680
5,Carlton-Parkville,1303,251,1278,260,1280,260,1301,260,1260,...,2953,500,2755,530,2687,550,2662,550,2543,570
6,CBD-St Kilda Rd,2132,320,2264,320,2358,320,2361,320,2591,...,13568,550,13505,580,13552,600,13564,620,13582,640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,Wanagaratta,705,125,671,125,631,130,623,130,602,...,535,380,555,390,565,390,593,395,580,400
157,Warragul,385,130,367,135,382,135,366,135,365,...,507,440,542,450,558,450,543,460,541,470
158,Warrnambool,1266,130,1229,135,1204,135,1135,135,1069,...,881,420,861,430,846,450,844,460,840,460
159,Wodonga,1446,145,1439,145,1468,150,1449,150,1405,...,1205,410,1187,420,1164,420,1155,430,1139,450


### Format Suburb Column Correctly

In [10]:
def format_suburb_col(df, col='suburb', separator='-'):
    '''
    Give every suburb its own row by splitting combined suburb names.

    A value such as ``'Albert Park-Middle Park-West St Kilda'`` is split on
    ``separator``; one row is produced per piece, each carrying a copy of all
    other columns of the original row. Pieces are stripped of surrounding
    whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the combined-name column. It is not modified.
    col : str, default 'suburb'
        Name of the column holding the (possibly combined) suburb names.
    separator : str, default '-'
        Literal text the names are split on (not a regular expression).

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and dtypes. Rows produced from one
        original row share that row's index label, so the index may contain
        duplicates. Missing values in ``col`` are passed through as a single
        row; an empty input yields an empty frame with the same columns.
    '''
    # Split into lists with a literal (non-regex) separator, matching str.split.
    # Working column-wise instead of looping over iterrows() keeps the dtypes
    # of the other columns and copes with empty frames and missing names.
    split_names = df[col].str.split(separator, regex=False)

    # explode() emits one row per list element and repeats the index label
    formatted_df = df.assign(**{col: split_names}).explode(col)

    # Remove whitespace left around each piece, e.g. 'A - B' -> 'A', 'B'
    formatted_df[col] = formatted_df[col].str.strip()

    return formatted_df

In [11]:
# Split combined suburb names into one row per suburb, for every frame
one_bed_flat, two_bed_flat, three_bed_flat = [
    format_suburb_col(flat) for flat in (one_bed_flat, two_bed_flat, three_bed_flat)
]

two_bed_house, three_bed_house, four_bed_house = [
    format_suburb_col(house) for house in (two_bed_house, three_bed_house, four_bed_house)
]

all_properties = format_suburb_col(all_properties)
# Suburb names are lower-cased for the combined table only
all_properties = all_properties.assign(suburb=all_properties['suburb'].str.lower())
all_properties

Unnamed: 0,suburb,mar_2000_count,mar_2000_median,jun_2000_count,jun_2000_median,sep_2000_count,sep_2000_median,dec_2000_count,dec_2000_median,mar_2001_count,...,mar_2023_count,mar_2023_median,jun_2023_count,jun_2023_median,sep_2023_count,sep_2023_median,dec_2023_count,dec_2023_median,mar_2024_count,mar_2024_median
2,albert park,1143,260,1134,260,1177,270,1178,275,1208,...,796,545,740,550,730,600,720,600,671,650
2,middle park,1143,260,1134,260,1177,270,1178,275,1208,...,796,545,740,550,730,600,720,600,671,650
2,west st kilda,1143,260,1134,260,1177,270,1178,275,1208,...,796,545,740,550,730,600,720,600,671,650
3,armadale,733,200,737,200,738,205,739,210,718,...,757,490,687,500,639,525,594,560,566,560
4,carlton north,864,260,814,260,799,265,736,270,718,...,497,620,495,630,467,650,418,670,384,680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,wanagaratta,705,125,671,125,631,130,623,130,602,...,535,380,555,390,565,390,593,395,580,400
157,warragul,385,130,367,135,382,135,366,135,365,...,507,440,542,450,558,450,543,460,541,470
158,warrnambool,1266,130,1229,135,1204,135,1135,135,1069,...,881,420,861,430,846,450,844,460,840,460
159,wodonga,1446,145,1439,145,1468,150,1449,150,1405,...,1205,410,1187,420,1164,420,1155,430,1139,450


### Handling Missing Values

In [13]:
# Missing values are written as '-' in this data; flag every cell that holds one
is_dash = all_properties == '-'

# Keep only the rows that contain at least one '-'
null_rows = all_properties[is_dash.any(axis=1)]

# For each of those rows, list the names of the columns holding the '-'
def _dash_column_names(row):
    return ', '.join(row.index[row == '-'])

dash_cols = null_rows.apply(_dash_column_names, axis=1)

print(list(dash_cols))
null_rows

['mar_2000_count, mar_2000_median, sep_2000_count, sep_2000_median, dec_2000_count, dec_2000_median, mar_2001_count, mar_2001_median, jun_2001_count, jun_2001_median, sep_2001_count, sep_2001_median, dec_2001_count, dec_2001_median']


Unnamed: 0,suburb,mar_2000_count,mar_2000_median,jun_2000_count,jun_2000_median,sep_2000_count,sep_2000_median,dec_2000_count,dec_2000_median,mar_2001_count,...,mar_2023_count,mar_2023_median,jun_2023_count,jun_2023_median,sep_2023_count,sep_2023_median,dec_2023_count,dec_2023_median,mar_2024_count,mar_2024_median
8,docklands,-,-,10,340,-,-,-,-,-,...,2789,550,2750,590,2714,610,2664,630,2644,650


In [14]:
# Export of the formatted frames to CSV -- currently DISABLED.
# The to_csv calls below sit inside a bare triple-quoted string, so nothing is
# written to disk; the cell only evaluates to that string, which the notebook
# echoes as its output.
# TODO(review): decide whether the formatted frames belong under
# ../data/curated or ../data/raw, then turn the chosen group into real code
# and delete the other.

'''
# Curated
one_bed_flat.to_csv('../data/curated/one_bed_flat_rent_history.csv')
two_bed_flat.to_csv('../data/curated/two_bed_flat_rent_history.csv')
three_bed_flat.to_csv('../data/curated/three_bed_flat_rent_history.csv')

two_bed_house.to_csv('../data/curated/two_bed_house_rent_history.csv')
three_bed_house.to_csv('../data/curated/three_bed_house_rent_history.csv')
four_bed_house.to_csv('../data/curated/four_bed_house_rent_history.csv')

all_properties.to_csv('../data/curated/all_properties_rent_history.csv')

# Raw
one_bed_flat.to_csv('../data/raw/one_bed_flat_rent_history.csv')
two_bed_flat.to_csv('../data/raw/two_bed_flat_rent_history.csv')
three_bed_flat.to_csv('../data/raw/three_bed_flat_rent_history.csv')

two_bed_house.to_csv('../data/raw/two_bed_house_rent_history.csv')
three_bed_house.to_csv('../data/raw/three_bed_house_rent_history.csv')
four_bed_house.to_csv('../data/raw/four_bed_house_rent_history.csv')

all_properties.to_csv('../data/raw/all_properties_rent_history.csv')
'''

"\n# Curated\none_bed_flat.to_csv('../data/curated/one_bed_flat_rent_history.csv')\ntwo_bed_flat.to_csv('../data/curated/two_bed_flat_rent_history.csv')\nthree_bed_flat.to_csv('../data/curated/three_bed_flat_rent_history.csv')\n\ntwo_bed_house.to_csv('../data/curated/two_bed_house_rent_history.csv')\nthree_bed_house.to_csv('../data/curated/three_bed_house_rent_history.csv')\nfour_bed_house.to_csv('../data/curated/four_bed_house_rent_history.csv')\n\nall_properties.to_csv('../data/curated/all_properties_rent_history.csv')\n\n# Raw\none_bed_flat.to_csv('../data/raw/one_bed_flat_rent_history.csv')\ntwo_bed_flat.to_csv('../data/raw/two_bed_flat_rent_history.csv')\nthree_bed_flat.to_csv('../data/raw/three_bed_flat_rent_history.csv')\n\ntwo_bed_house.to_csv('../data/raw/two_bed_house_rent_history.csv')\nthree_bed_house.to_csv('../data/raw/three_bed_house_rent_history.csv')\nfour_bed_house.to_csv('../data/raw/four_bed_house_rent_history.csv')\n\nall_properties.to_csv('../data/raw/all_properti