In [1113]:
import pandas as pd

# Load the three raw inputs, in this order: 2024 rent listings, historical rent
# listings, and the Australian postcode reference table.
recent_rent, history_rent, postcode = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/landing/2024_rent.csv',
        '../../data/landing/history_rent.csv',
        '../../data/raw/external/australian_postcodes.csv',
    )
]

# Merge historical data and recent data together

## Assumption for internal data
- Suburbs in the historical and recent data represent the same geographical areas.

In [1114]:
# Keep only the Victorian rows. `.copy()` makes the result an independent frame, so the
# column edits below (and the 'Cleaned_Suburb' column added in a later cell) are applied
# to it directly rather than to a slice of the full table, which is what triggers
# pandas' SettingWithCopyWarning.
postcode = postcode[postcode['state'] == 'VIC'].copy()
# Lowercase the locality names, then expose them as 'Suburb' so the column lines up
# with the suburb column of the rent data.
postcode['locality'] = postcode['locality'].str.lower()
postcode = postcode.rename(columns={'locality': 'Suburb'})
postcode.head(3)

Unnamed: 0,id,postcode,Suburb,state,long,lat,dc,type,status,sa3,...,altitude,chargezone,phn_code,phn_name,lgaregion,lgacode,electorate,electoraterating,sed_code,sed_name
6202,4746,3000,melbourne,VIC,144.982585,-37.814437,CITY DELIVERY CENTRE,Delivery Area,Updated 17-Mar-2024 AUSPOST,20604.0,...,27.332188,V1,PHN201,North Western Melbourne,Melbourne,24600.0,Melbourne,Inner Metropolitan,24703.0,Melbourne (Northern Metropolitan)
6203,4747,3001,melbourne,VIC,144.982585,-37.814437,CITY MAIL PROCESSING CENTRE,Post Office Boxes,Updated 17-Mar-2024 AUSPOST,20605.0,...,27.332188,V1,PHN203,,Melbourne,24600.0,Maribyrnong,,,
6204,4748,3002,east melbourne,VIC,144.982585,-37.814437,CITY DELIVERY CENTRE,Delivery Area,Updated 17-Mar-2024 AUSPOST,20604.0,...,27.332188,V1,PHN201,North Western Melbourne,Melbourne,24600.0,Melbourne,Inner Metropolitan,24703.0,Melbourne (Northern Metropolitan)


In [1115]:
# A 'Location' value may hold several suburbs joined by '-'. Split it and give every
# suburb its own row (the other columns are repeated), then renumber the index.
# NOTE(review): the pieces are not whitespace-trimmed here; if the raw file ever uses
# ' - ' as the separator the suburb names would keep stray spaces - confirm.
history_rent = (
    history_rent
    .assign(Location=lambda frame: frame['Location'].str.split('-'))
    .explode('Location')
    .reset_index(drop=True)
)

In [1116]:
# 'Zone' is not used by the rest of the notebook, so remove it before previewing
history_rent = history_rent.drop('Zone', axis=1)
history_rent.head(3)

Unnamed: 0,Location,Year,Month,Count,Median,Bed,Apartment
0,Albert Park,2000,Dec,369,175,1,1
1,Middle Park,2000,Dec,369,175,1,1
2,West St Kilda,2000,Dec,369,175,1,1


In [1117]:
# Parse 'Median' as a number: unparseable entries become NaN (errors='coerce'), values
# are rounded, NaN is replaced by 0 and the column is stored as int.
# NOTE(review): a median of 0 therefore means "missing/unparseable", not a real rent.
median_numeric = pd.to_numeric(history_rent['Median'], errors='coerce')
history_rent['Median'] = median_numeric.round().fillna(0).astype(int)

# From here on the 'Location' column is called 'Suburb'
history_rent = history_rent.rename(columns={'Location': 'Suburb'})

# Normalise the suburb spelling:
#   - lowercase everything
#   - expand the standalone abbreviation 'mt' to 'mount'
#   - correct the misspellings 'newcombe' -> 'newcomb' and 'wanagaratta' -> 'wangaratta'
suburb_names = history_rent['Suburb'].str.lower()
suburb_names = suburb_names.str.replace(r'\bmt\b', 'mount', regex=True)
suburb_names = suburb_names.str.replace('newcombe', 'newcomb')
suburb_names = suburb_names.str.replace('wanagaratta', 'wangaratta')
history_rent['Suburb'] = suburb_names



In [1118]:
import re


# Function to move direction words (e.g., "west", "east", etc.) to the beginning of the suburb name
def move_direction_to_start(suburb):
    """Return `suburb` with its compass-direction words moved to the front.

    Used to build a join key so that e.g. 'melbourne east' and 'east melbourne'
    compare equal. Matching is case-sensitive on the lowercase words 'west',
    'east', 'north' and 'south', and only whole words count ('northcote' is
    left alone).

    Parameters
    ----------
    suburb : str
        Suburb name, expected to be lowercase already. Non-string values
        (e.g. NaN for a missing name) are returned unchanged instead of
        raising.

    Returns
    -------
    str
        All direction words, in their original order, followed by the rest of
        the name, separated by single spaces. Names without a direction word
        are returned unchanged.
    """
    # Missing values cannot be searched with a regex; pass them straight through
    if not isinstance(suburb, str):
        return suburb

    direction_pattern = r'\b(west|east|north|south)\b'

    # Collect every direction word, not just the first one, so that a name such as
    # 'x north east' keeps both words instead of silently losing 'east'
    directions = re.findall(direction_pattern, suburb)
    if not directions:
        return suburb

    # Remove the direction words and collapse the whitespace they leave behind
    remainder_words = re.sub(direction_pattern, ' ', suburb).split()

    # Joining word lists avoids a trailing space when the name is only a direction
    return ' '.join(directions + remainder_words)

# Build a direction-normalised join key on both tables. `assign` returns a new frame,
# so adding the column to the VIC-filtered `postcode` table does not raise
# SettingWithCopyWarning.
history_rent = history_rent.assign(
    Cleaned_Suburb=history_rent['Suburb'].apply(move_direction_to_start)
)
postcode = postcode.assign(
    Cleaned_Suburb=postcode['Suburb'].apply(move_direction_to_start)
)

# Lookup of distinct (suburb, postcode) pairs. Dropping exact repeats keeps the left
# join from emitting the same rent row several times for one postcode.
# A suburb that maps to several postcodes still yields one row per postcode.
suburb_postcodes = postcode[['Cleaned_Suburb', 'postcode']].drop_duplicates()

# Attach postcodes to the rent rows; suburbs without a match keep a missing postcode
history_rent = pd.merge(history_rent, suburb_postcodes, on='Cleaned_Suburb', how='left')

# The join key is only a helper and is not needed afterwards
history_rent = history_rent.drop(columns=['Cleaned_Suburb'])


# Display the merged data
print(history_rent.head())

          Suburb  Year Month  Count  Median  Bed  Apartment  postcode
0    albert park  2000   Dec    369     175    1          1    3206.0
1    middle park  2000   Dec    369     175    1          1    3206.0
2  west st kilda  2000   Dec    369     175    1          1    3182.0
3    albert park  2000   Jun    347     165    1          1    3206.0
4    middle park  2000   Jun    347     165    1          1    3206.0


In [1119]:
# Ad-hoc inspection subsets for postcode 3825: all rows, then the years 2000 and 2023.
# These frames are not referenced again in the visible part of the notebook.
p_3825 = history_rent.loc[history_rent['postcode'] == 3825.0]
y_2000 = p_3825.loc[p_3825['Year'] == 2000]
y_2023 = p_3825.loc[p_3825['Year'] == 2023]

In [1122]:
# Manual postcode overrides for location names that are not real suburbs:
#   'cbd' and 'st kilda rd' -> 3004, 'yarra ranges' -> 3132
# The values are numbers, not strings: 'postcode' is a float column after the left
# merge, and writing strings into it is deprecated in recent pandas and would leave
# the column with mixed types.
# NOTE(review): 'cbd' is mapped to 3004 as in the original code; Melbourne's CBD is
# commonly postcode 3000 - confirm this mapping is intended.
postcode_overrides = {
    'cbd': 3004,
    'st kilda rd': 3004,
    'yarra ranges': 3132,
}
for suburb_name, override in postcode_overrides.items():
    history_rent.loc[history_rent['Suburb'] == suburb_name, 'postcode'] = override


In [1123]:
# Store 'postcode' as a nullable integer: anything non-numeric becomes missing (<NA>)
postcode_numeric = pd.to_numeric(history_rent['postcode'], errors='coerce')
history_rent['postcode'] = postcode_numeric.astype('Int64')

# 'Suburb' has been replaced by 'postcode', and 'Count' is not needed further on
history_rent = history_rent.drop(columns=['Suburb', 'Count'])

# Preview the month-independent, de-duplicated view of the historical data.
# `history_rent` itself keeps its 'Month' column.
rent = history_rent.drop(columns='Month').drop_duplicates()
rent.head(3)

Unnamed: 0,Year,Median,Bed,Apartment,postcode
0,2000,175,1,1,3206
2,2000,175,1,1,3182
3,2000,165,1,1,3206


In [1124]:
# Use the same key column name as the historical data
recent_rent = recent_rent.rename(columns={'Postcode': 'postcode'})

# Keep only the postcodes that also occur in the historical rent data
in_history = recent_rent['postcode'].isin(history_rent['postcode'])
recent_rent = recent_rent[in_history]

# Tidy up: 'Count' is not needed, and exact duplicate rows are removed
recent_rent = recent_rent.drop(columns='Count').drop_duplicates()
recent_rent.head(3)

Unnamed: 0,postcode,Bed,Apartment,Median,Year,Month
3,3002,1,1,475.0,2024,Aug
4,3002,2,1,660.0,2024,Aug
5,3002,3,1,975.0,2024,Aug


In [1125]:
# Stack the historical and recent records, discard the month, and keep a single copy
# of every distinct remaining row
rent = (
    pd.concat([history_rent, recent_rent], ignore_index=True)
    .drop(columns='Month')
    .drop_duplicates()
)

# Merge rent data with external data (park, school, transportation, population and income)

## Assumption for external data
- Facilities located on a boundary are assumed to belong to that postcode area.

In [1126]:
# Park features, with the key column renamed to match the rent table
park = pd.read_csv('../../data/landing/parks.csv').rename(columns={'POSTCODE': 'postcode'})

# Left join keeps every rent row. `fillna(0)` then replaces every missing value in the
# whole frame with 0, not only in the newly added park columns.
rent = rent.merge(park, on='postcode', how='left').fillna(0)

In [1127]:
school_recent = pd.read_csv('../../data/landing/schoolscount2023.csv')
school_past = pd.read_csv('../../data/landing/schoolscount2018.csv')

# Rent rows after 2018 are paired with the 2023 school counts; rows from 2018 and
# earlier are paired with the 2018 counts
rent_recent = rent.loc[rent['Year'].gt(2018)]
rent_past = rent.loc[rent['Year'].lt(2019)]

# Left joins on postcode keep every rent row in each period
rent_recent = rent_recent.merge(school_recent, on='postcode', how='left')
rent_past = rent_past.merge(school_past, on='postcode', how='left')

# Put the two periods back into one frame (recent rows first) and replace every
# missing value with 0
rent = pd.concat([rent_recent, rent_past], ignore_index=True).fillna(0)

In [1128]:
transportation = pd.read_csv('../../data/landing/transportation.csv')

# Left join on postcode keeps every rent row; any value still missing afterwards
# (anywhere in the frame) is set to 0
rent = rent.merge(transportation, on='postcode', how='left').fillna(0)

In [1129]:
# Load the population table, rename its key column to 'postcode', and reshape it from
# wide to long: every non-postcode column header becomes a 'Year' value and the cell
# value goes into 'population'.
# NOTE(review): the file name suggests these values are percent changes rather than
# head counts - confirm.
population = (
    pd.read_csv('../../data/landing/population_percent_change_2000-27.csv')
    .rename(columns={'Postcode': 'postcode'})
    .melt(id_vars=['postcode'], var_name='Year', value_name='population')
)

# Column headers are strings after melting; convert them to int so they can be
# joined against rent['Year']
population = population.astype({'Year': int})

# Inner join: rent rows without a population value for their postcode and year are dropped
rent = rent.merge(population, on=['postcode', 'Year'], how='inner')
rent.head(5)

Unnamed: 0,Year,Median,Bed,Apartment,postcode,park_count,mean_park_area,Camp,Language,Primary,Secondary,Special,school_total,tram_count,bus_count,train_count,population
0,2019,370.0,1,1,3206,2.0,109.9035,0.0,0.0,3.0,1.0,0.0,4.0,33.0,0.0,0.0,0.021102
1,2019,350.0,1,1,3206,2.0,109.9035,0.0,0.0,3.0,1.0,0.0,4.0,33.0,0.0,0.0,0.021102
2,2019,360.0,1,1,3206,2.0,109.9035,0.0,0.0,3.0,1.0,0.0,4.0,33.0,0.0,0.0,0.021102
3,2019,515.0,2,1,3206,2.0,109.9035,0.0,0.0,3.0,1.0,0.0,4.0,33.0,0.0,0.0,0.021102
4,2019,500.0,2,1,3206,2.0,109.9035,0.0,0.0,3.0,1.0,0.0,4.0,33.0,0.0,0.0,0.021102


In [1130]:
# Load the income table, rename its key column to 'postcode', and reshape it from wide
# to long: every non-postcode column header becomes a 'Year' value and the cell value
# goes into 'income'
income = (
    pd.read_csv('../../data/landing/income_2000-27.csv')
    .rename(columns={'Postcode': 'postcode'})
    .melt(id_vars=['postcode'], var_name='Year', value_name='income')
)

# Column headers are strings after melting; convert them to int so they can be
# joined against rent['Year']
income = income.astype({'Year': int})

# Inner join: rent rows without an income value for their postcode and year are dropped
rent = rent.merge(income, on=['postcode', 'Year'], how='inner')
rent.head(5)

Unnamed: 0,Year,Median,Bed,Apartment,postcode,park_count,mean_park_area,Camp,Language,Primary,Secondary,Special,school_total,tram_count,bus_count,train_count,population,income
0,2019,370.0,1,1,3206,2.0,109.9035,0.0,0.0,3.0,1.0,0.0,4.0,33.0,0.0,0.0,0.021102,119591
1,2019,350.0,1,1,3206,2.0,109.9035,0.0,0.0,3.0,1.0,0.0,4.0,33.0,0.0,0.0,0.021102,119591
2,2019,360.0,1,1,3206,2.0,109.9035,0.0,0.0,3.0,1.0,0.0,4.0,33.0,0.0,0.0,0.021102,119591
3,2019,515.0,2,1,3206,2.0,109.9035,0.0,0.0,3.0,1.0,0.0,4.0,33.0,0.0,0.0,0.021102,119591
4,2019,500.0,2,1,3206,2.0,109.9035,0.0,0.0,3.0,1.0,0.0,4.0,33.0,0.0,0.0,0.021102,119591


In [1131]:
# Rows before 2024 form the training set; the 2024 rows are kept separately as the
# current rent data
train = rent.loc[rent['Year'].lt(2024)]
cur_rent = rent.loc[rent['Year'].eq(2024)]

In [1132]:
# Write both result frames to the curated data folder as CSV, without the index column
for output_frame, output_path in (
    (train, '../../data/curated/train_data.csv'),
    (cur_rent, '../../data/curated/cur_rent.csv'),
):
    output_frame.to_csv(output_path, index=False)

The final training data combines the internal rental features (including the median rental price) with the external features (parks, schools, transportation, population and income).