In [1]:
import pandas as pd

# Load the three input tables, in order: the curated 2024 rent file, the
# curated historical rent file, and the raw external postcode file.
recent_rent, history_rent, postcode = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/curated/2024_rent.csv',
        '../data/curated/history_rent.csv',
        '../data/raw/external/australian-postcodes.csv',
    )
)

In [2]:
# Keep only the postcode rows whose 'State' value equals 'VIC'.
postcode = postcode.loc[lambda frame: frame['State'].eq('VIC')]
# Preview the first three remaining rows.
postcode.head(3)

Unnamed: 0,Postcode,Suburb,State,Lat,Lon
5584,3000,Melbourne,VIC,-37.81,144.97
5585,3001,Melbourne,VIC,-38.37,144.77
5586,3002,East Melbourne,VIC,-37.82,144.99


In [3]:
# Break each 'Location' string apart at every hyphen. With expand=True the
# pieces land in separate columns, one row per original row; shorter rows are
# padded with missing values.
history_loc = history_rent['Location'].str.split(pat='-', expand=True)

In [4]:
# Keep one row per distinct combination of location parts.
unique_loc_rows = history_loc.drop_duplicates()

# Turn every remaining row into a plain list of its non-missing parts.
# NOTE(review): this rebinds history_loc from a DataFrame to a list of lists,
# so the cell cannot be run twice in a row without re-running the split cell.
history_loc = [
    [part for part in row if pd.notna(part)]
    for row in unique_loc_rows.itertuples(index=False, name=None)
]

In [5]:
# Step 1: for every location group, pool the matching recent_rent rows into
# one record per (Bed, Apartment) combination.
# Rows of recent_rent whose 'Location' is in none of the groups are not
# carried into the result.
combined_loc = []
for suburb_group in history_loc:
    # Label shared by every record produced for this group
    group_label = '-'.join(suburb_group)

    # Rows of recent_rent whose Location is one of the group's members
    in_group = recent_rent['Location'].isin(suburb_group)

    # One output row per Bed/Apartment pair found among those rows
    per_dwelling = (
        recent_rent.loc[in_group]
        .groupby(['Bed', 'Apartment'])
        .agg({
            'Location': lambda _names: group_label,  # hyphen-joined group label
            'Count': 'sum',                          # total count across the group
            'Median': 'mean',                        # unweighted mean of the medians
        })
        .reset_index()
    )

    # Collect the aggregated rows as dicts in combined_loc
    combined_loc += per_dwelling.to_dict('records')

# Step 2: build the regrouped table from the collected records.
# NOTE(review): this rebinds recent_rent, so running the cell a second time
# would filter the already-combined frame; re-run the load cell first.
recent_rent = pd.DataFrame(combined_loc)


In [6]:
# Stamp every row with a fixed period: Year = 2024 and Month = 'Aug'.
recent_rent = recent_rent.assign(Year=2024, Month='Aug')
recent_rent

Unnamed: 0,Bed,Apartment,Location,Count,Median,Year,Month
0,1,1,Albert Park-Middle Park-West St Kilda,1,575.0,2024,Aug
1,2,1,Albert Park-Middle Park-West St Kilda,2,687.5,2024,Aug
2,3,0,Albert Park-Middle Park-West St Kilda,8,1267.5,2024,Aug
3,4,0,Albert Park-Middle Park-West St Kilda,2,2200.0,2024,Aug
4,2,1,Altona,1,520.0,2024,Aug
...,...,...,...,...,...,...,...
475,8,0,Wodonga,1,250.0,2024,Aug
476,2,0,Yarraville-Seddon,1,700.0,2024,Aug
477,3,0,Yarraville-Seddon,1,780.0,2024,Aug
478,4,0,Yarraville-Seddon,2,950.0,2024,Aug


In [7]:
# Remove the 'Zone' column from the historical table.
# errors='ignore' keeps this cell re-runnable: on a second run 'Zone' is
# already gone and a plain drop would raise KeyError.
history_rent = history_rent.drop(columns=['Zone'], errors='ignore')
# Preview the first three rows.
history_rent.head(3)

Unnamed: 0,Location,Year,Month,Count,Median,Bed,Apartment
0,Albert Park-Middle Park-West St Kilda,2000,Dec,369,175,1,1
1,Albert Park-Middle Park-West St Kilda,2000,Jun,347,165,1,1
2,Albert Park-Middle Park-West St Kilda,2000,Mar,352,165,1,1


In [8]:
# Stack the historical rows on top of the recent rows with a fresh running
# index, then rename the 'Location' column to 'Suburb'.
rent = (
    pd.concat([history_rent, recent_rent], axis=0, ignore_index=True)
    .rename(columns={'Location': 'Suburb'})
)
# Show the combined table
rent

Unnamed: 0,Suburb,Year,Month,Count,Median,Bed,Apartment
0,Albert Park-Middle Park-West St Kilda,2000,Dec,369,175,1,1
1,Albert Park-Middle Park-West St Kilda,2000,Jun,347,165,1,1
2,Albert Park-Middle Park-West St Kilda,2000,Mar,352,165,1,1
3,Albert Park-Middle Park-West St Kilda,2000,Sep,378,170,1,1
4,Albert Park-Middle Park-West St Kilda,2001,Dec,423,190,1,1
...,...,...,...,...,...,...,...
88243,Wodonga,2024,Aug,1,250.0,8,0
88244,Yarraville-Seddon,2024,Aug,1,700.0,2,0
88245,Yarraville-Seddon,2024,Aug,1,780.0,3,0
88246,Yarraville-Seddon,2024,Aug,2,950.0,4,0


In [9]:
# Destination of the combined rent table
file_path = '../data/curated/rent.csv'

# Write the table as CSV, leaving the row index out of the file
rent.to_csv(path_or_buf=file_path, index=False)