In [1]:
import pandas as pd

# Load the three input tables: the 2024 rent extract, the historical rent
# extract, and the Australian postcode reference file.
recent_rent, history_rent, postcode = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/landing/2024_rent.csv',
        '../data/landing/history_rent.csv',
        '../data/raw/external/australian_postcodes.csv',
    )
)

FileNotFoundError: [Errno 2] No such file or directory: '../data/landing/2024_rent.csv'

# Merge historical data and recent data together

In [None]:
# 'Location' can hold several names joined by '-'. Turn each value into a
# list of names, then emit one row per name and renumber the index from 0.
history_rent = (
    history_rent
    .assign(Location=lambda frame: frame['Location'].str.split('-'))
    .explode('Location')
    .reset_index(drop=True)
)

In [None]:
# 'Zone' is not carried forward; remove it and preview the first rows
history_rent = history_rent.drop('Zone', axis=1)
history_rent.head(3)

Unnamed: 0,Location,Year,Month,Count,Median,Bed,Apartment
0,Albert Park,2000,Dec,369,175,1,1
1,Middle Park,2000,Dec,369,175,1,1
2,West St Kilda,2000,Dec,369,175,1,1


In [None]:
# Stack historical rows on top of the recent rows with a fresh index, and
# expose the 'Location' column under the name 'Suburb'
rent = (
    pd.concat([history_rent, recent_rent], ignore_index=True)
    .rename(columns={'Location': 'Suburb'})
)

# Coerce 'Median' to a number (unparseable values become NaN), round it,
# replace NaN with 0 and store it as an integer.
# NOTE(review): missing medians therefore appear as a rent of 0 downstream —
# confirm that is intended rather than dropping those rows.
rent['Median'] = (
    pd.to_numeric(rent['Median'], errors='coerce')
    .round()
    .fillna(0)
    .astype(int)
)

# Normalise suburb names:
#   1. lowercase everything
#   2. expand the standalone word 'mt' to 'mount'
#   3. fix the spelling 'newcombe' -> 'newcomb'
#   4. fix the spelling 'wanagaratta' -> 'wangaratta'
#   5. reduce 'gladstone st south melbourne' to 'south melbourne'
rent['Suburb'] = (
    rent['Suburb']
    .str.lower()
    .str.replace(r'\bmt\b', 'mount', regex=True)
    .str.replace('newcombe', 'newcomb')
    .str.replace('wanagaratta', 'wangaratta')
    .str.replace('gladstone st south melbourne', 'south melbourne')
)
rent.head(3)


Unnamed: 0,Suburb,Year,Month,Count,Median,Bed,Apartment
0,albert park,2000,Dec,369,175,1,1
1,middle park,2000,Dec,369,175,1,1
2,west st kilda,2000,Dec,369,175,1,1


In [None]:
# Keep only rows whose 'state' is 'VIC', lowercase the 'locality' names and
# rename that column to 'Suburb'.
# Built as one chain that returns a new frame: assigning into (or renaming
# in place) the result of a boolean filter triggers SettingWithCopyWarning.
postcode = (
    postcode.loc[postcode['state'] == 'VIC']
    .assign(locality=lambda vic: vic['locality'].str.lower())
    .rename(columns={'locality': 'Suburb'})
)
postcode.head(3)

Unnamed: 0,id,postcode,Suburb,state,long,lat,dc,type,status,sa3,...,altitude,chargezone,phn_code,phn_name,lgaregion,lgacode,electorate,electoraterating,sed_code,sed_name
6202,4746,3000,melbourne,VIC,144.982585,-37.814437,CITY DELIVERY CENTRE,Delivery Area,Updated 17-Mar-2024 AUSPOST,20604.0,...,27.332188,V1,PHN201,North Western Melbourne,Melbourne,24600.0,Melbourne,Inner Metropolitan,24703.0,Melbourne (Northern Metropolitan)
6203,4747,3001,melbourne,VIC,144.982585,-37.814437,CITY MAIL PROCESSING CENTRE,Post Office Boxes,Updated 17-Mar-2024 AUSPOST,20605.0,...,27.332188,V1,PHN203,,Melbourne,24600.0,Maribyrnong,,,
6204,4748,3002,east melbourne,VIC,144.982585,-37.814437,CITY DELIVERY CENTRE,Delivery Area,Updated 17-Mar-2024 AUSPOST,20604.0,...,27.332188,V1,PHN201,North Western Melbourne,Melbourne,24600.0,Melbourne,Inner Metropolitan,24703.0,Melbourne (Northern Metropolitan)


In [None]:
import re


# Function to move direction words ("west", "east", "north", "south") to the beginning of the suburb name
def move_direction_to_start(suburb):
    """Return ``suburb`` with its compass-direction words moved to the front.

    Every standalone lowercase ``west``/``east``/``north``/``south`` found
    anywhere in the name is removed from its position and the words are
    prepended in their original order, so ``'st kilda west'`` and
    ``'west st kilda'`` both become ``'west st kilda'``. Whitespace left
    behind by the removal is collapsed to single spaces.

    Parameters
    ----------
    suburb : str or any
        Suburb name. Matching is case-sensitive (lowercase only).

    Returns
    -------
    str or any
        The rearranged name; the input unchanged when it contains no
        direction word or is not a string (e.g. NaN / None).
    """
    # Missing cells (NaN/None) are passed through instead of crashing re
    if not isinstance(suburb, str):
        return suburb

    # Standalone direction words only; '\b' keeps e.g. 'weston' intact
    direction_pattern = r'\b(west|east|north|south)\b'

    # Keep *all* directions so compound names such as 'x south east'
    # are not silently reduced to 'south x'
    directions = re.findall(direction_pattern, suburb)
    if not directions:
        return suburb

    # Strip the directions, then collapse the gaps they leave behind
    remainder_words = re.sub(direction_pattern, '', suburb).split()
    return ' '.join(directions + remainder_words)

# Give the postcode table a direction-first join key
postcode['Cleaned_Suburb'] = postcode['Suburb'].apply(move_direction_to_start)

# Build the same key on the rent table, left-join the postcodes on it, and
# discard the helper key afterwards.
# NOTE(review): a key that appears on several postcode rows will repeat the
# matching rent rows once per postcode — confirm that fan-out is intended.
rent = (
    rent
    .assign(Cleaned_Suburb=rent['Suburb'].apply(move_direction_to_start))
    .merge(postcode[['Cleaned_Suburb', 'postcode']], how='left', on='Cleaned_Suburb')
    .drop(columns=['Cleaned_Suburb'])
)


# Show the first rows of the merged data
print(rent.head())

          Suburb  Year Month Count  Median  Bed  Apartment  postcode
0    albert park  2000   Dec   369     175    1          1    3206.0
1    middle park  2000   Dec   369     175    1          1    3206.0
2  west st kilda  2000   Dec   369     175    1          1    3182.0
3    albert park  2000   Jun   347     165    1          1    3206.0
4    middle park  2000   Jun   347     165    1          1    3206.0


In [None]:
# Hand-assigned postcodes, keyed by the (lowercased) 'Suburb' value.
# Values are integers rather than strings: 'postcode' is a numeric column
# after the merge, and writing a string into it is an incompatible-dtype
# assignment that recent pandas versions deprecate.
MANUAL_POSTCODES = {
    'sanctuary lakes': 3030,
    'cbd': 3004,
    'st kilda rd': 3004,
    'yarra ranges': 3132,
}

# Overwrite the postcode on every row whose suburb has a manual override
for suburb_name, manual_postcode in MANUAL_POSTCODES.items():
    rent.loc[rent['Suburb'] == suburb_name, 'postcode'] = manual_postcode


In [None]:
# In one pass:
#   - cast 'postcode' to the nullable integer dtype (non-numeric -> <NA>)
#   - discard the 'Suburb' and 'Count' columns
#   - remove rows that are now exact duplicates
rent = (
    rent
    .assign(postcode=pd.to_numeric(rent['postcode'], errors='coerce').astype('Int64'))
    .drop(columns=['Suburb', 'Count'])
    .drop_duplicates()
)
rent.head(3)

Unnamed: 0,Year,Month,Median,Bed,Apartment,postcode
0,2000,Dec,175,1,1,3206
2,2000,Dec,175,1,1,3182
3,2000,Jun,165,1,1,3206


# Merge rent data with external data (park, school, transportation, population and income)

In [None]:
# Load the park table and align its key name ('POSTCODE' -> 'postcode')
park = pd.read_csv('../data/landing/parks.csv').rename(columns={'POSTCODE': 'postcode'})
# Inner join on postcode: rent rows without a matching park row are dropped
rent = rent.merge(park, how='inner', on='postcode')

In [None]:
school_recent = pd.read_csv('../data/landing/schoolscount2023.csv')
school_past = pd.read_csv('../data/landing/schoolscount2018.csv')

# Rows after 2018 are inner-joined with the 2023 school counts; rows up to
# and including 2018 are inner-joined with the 2018 school counts
rent_recent = rent.loc[rent['Year'].gt(2018)].merge(school_recent, how='inner', on='postcode')
rent_past = rent.loc[rent['Year'].lt(2019)].merge(school_past, how='inner', on='postcode')

# Stack the two halves (recent first) under a fresh index
rent = pd.concat([rent_recent, rent_past], ignore_index=True)

rent

Unnamed: 0,Year,Month,Median,Bed,Apartment,postcode,park_count,mean_park_area,Camp,Language,Primary,Secondary,Special,school_total
0,2019,Dec,370,1,1,3182,1,218.4940,0,0,4,3,0,7
1,2019,Jun,350,1,1,3182,1,218.4940,0,0,4,3,0,7
2,2019,Mar,350,1,1,3182,1,218.4940,0,0,4,3,0,7
3,2019,Sep,360,1,1,3182,1,218.4940,0,0,4,3,0,7
4,2020,Dec,350,1,1,3182,1,218.4940,0,0,4,3,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51696,2017,Sep,380,4,0,3690,5,521.0596,0,0,8,5,1,14
51697,2018,Dec,390,4,0,3690,5,521.0596,0,0,8,5,1,14
51698,2018,Jun,390,4,0,3690,5,521.0596,0,0,8,5,1,14
51699,2018,Mar,380,4,0,3690,5,521.0596,0,0,8,5,1,14


In [None]:
transportation = pd.read_csv('../data/landing/transportation.csv')

# Inner join on postcode: rent rows without a matching transportation row are dropped
rent = rent.merge(transportation, how='inner', on='postcode')

In [None]:
# Display the frame produced by the transportation merge
rent

Unnamed: 0,Year,Month,Median,Bed,Apartment,postcode,park_count,mean_park_area,Camp,Language,Primary,Secondary,Special,school_total,tram_count,bus_count,train_count
0,2019,Dec,370,1,1,3182,1,218.494000,0,0,4,3,0,7,64,39,0
1,2019,Jun,350,1,1,3182,1,218.494000,0,0,4,3,0,7,64,39,0
2,2019,Mar,350,1,1,3182,1,218.494000,0,0,4,3,0,7,64,39,0
3,2019,Sep,360,1,1,3182,1,218.494000,0,0,4,3,0,7,64,39,0
4,2020,Dec,350,1,1,3182,1,218.494000,0,0,4,3,0,7,64,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31593,2024,Aug,500,3,0,3750,1,1290.216000,0,0,4,2,0,6,0,24,0
31594,2024,Aug,620,4,0,3750,1,1290.216000,0,0,4,2,0,6,0,24,0
31595,2024,Aug,720,5,0,3750,1,1290.216000,0,0,4,2,0,6,0,24,0
31596,2024,Aug,1600,5,0,3115,1,687.622000,0,0,2,0,0,2,0,30,0


In [None]:
# Destination of the curated dataset
file_path = '../data/curated/rent.csv'

# Write the frame as CSV, leaving out the row index
rent.to_csv(path_or_buf=file_path, index=False)