In [112]:
import pandas as pd
import re

# Preprocessing current data for prediction

In [113]:
# Load the raw rental listings; the preview further down shows the columns
# name, Bed and weekly_rent.
df = pd.read_csv(
    filepath_or_buffer='../data/raw/location_rent_with_coordinates.csv',
)

In [114]:
# Flag listings whose address contains a '/' (e.g. "1208/50 Albert Street"):
# Apartment = 1 when a '/' is present anywhere in the name, otherwise 0.
# Only the '/' character is tested, not a full "number/" pattern.
# The vectorised literal match (regex=False) treats a missing name (NaN) as
# "no slash" via na=False, whereas `'/' in x` raises TypeError on a float NaN.
df['Apartment'] = (
    df['name'].str.contains('/', regex=False, na=False).astype('int64')
)
df.head(5)

Unnamed: 0,name,Bed,weekly_rent,Apartment
0,"1208/50 Albert Street, South Melbourne VIC 3205",1,520.0,1
1,"64 Mills Street, Albert Park VIC 3206",3,1495.0,0
2,"11 Barnato St, Weir Views VIC 3338",4,460.0,0
3,"1104/70 Southbank Boulevard, Southbank VIC 3006",1,420.0,1
4,"167 Charman Road, Beaumaris VIC 3193",4,950.0,0


In [115]:
def extract_location_postcode(address):
    """Extract the suburb name and 4-digit postcode from an address string.

    The address is searched for the form ``<suburb> VIC <postcode>``; for
    example ``"64 Mills Street, Albert Park VIC 3206"`` yields
    ``("Albert Park", "3206")``.

    Parameters
    ----------
    address : str
        Address text to parse. Any non-string value (for instance the float
        NaN that pandas uses for a blank CSV cell) is treated as unparseable.

    Returns
    -------
    tuple
        ``(suburb, postcode)``: the suburb with surrounding whitespace
        stripped and the postcode as a 4-character digit string, or
        ``(None, None)`` when the pattern is not found or the input is not
        a string.
    """
    # re.search raises TypeError on non-strings (NaN, None, numbers); report
    # those as "no match" so one bad row cannot abort a whole-column apply.
    if not isinstance(address, str):
        return None, None

    # Suburb = run of letters/whitespace immediately before " VIC dddd".
    # A comma or digit ends the run, so the street part is left out.
    pattern = r'(?P<suburb>[A-Za-z\s]+) VIC (?P<postcode>\d{4})'
    match = re.search(pattern, address)
    if match:
        return match.group('suburb').strip(), match.group('postcode')
    return None, None

# Parse every address in 'name' into a (suburb, postcode) pair and spread the
# pairs over two columns. Building a frame from the pairs, instead of
# unpacking zip(*pairs), also works when df has no rows; the unpacking form
# raises "ValueError: not enough values to unpack" in that case.
location_parts = pd.DataFrame(
    df['name'].apply(extract_location_postcode).tolist(),
    columns=['Location', 'Postcode'],
    index=df.index,
)
df['Location'] = location_parts['Location']
df['Postcode'] = location_parts['Postcode']

# The raw address is no longer needed once suburb and postcode are split out
df = df.drop(columns=['name'])

# Display the modified dataframe
df.head()


Unnamed: 0,Bed,weekly_rent,Apartment,Location,Postcode
0,1,520.0,1,South Melbourne,3205
1,3,1495.0,0,Albert Park,3206
2,4,460.0,0,Weir Views,3338
3,1,420.0,1,Southbank,3006
4,4,950.0,0,Beaumaris,3193


In [116]:
# Collapse the listings to one row per (Location, Bed, Apartment) combination,
# keeping how many listings fell in the group and their median weekly rent.
# NOTE(review): with groupby's default dropna, rows whose Location is missing
# (address could not be parsed) are left out of the result - confirm intended.
df = (
    df.groupby(['Location', 'Bed', 'Apartment'])
    .agg(
        Count=pd.NamedAgg(column='weekly_rent', aggfunc='size'),
        Median=pd.NamedAgg(column='weekly_rent', aggfunc='median'),
    )
    .reset_index()
)


In [117]:
# Manually add 'Year' and 'Month' columns with fixed values (2024 and 'Aug').
# Every aggregated row gets the same constants; they are not derived from the data.

df['Year'] = 2024
df['Month'] = 'Aug'

In [118]:
# Preview the first five rows of the aggregated table
df.head(n=5)

Unnamed: 0,Location,Bed,Apartment,Count,Median,Year,Month
0,Abbotsford,1,1,2,587.5,2024,Aug
1,Abbotsford,2,0,2,752.5,2024,Aug
2,Abbotsford,2,1,12,707.5,2024,Aug
3,Abbotsford,3,0,1,900.0,2024,Aug
4,Abbotsford,3,1,3,900.0,2024,Aug


In [119]:
# Destination for the aggregated 2024 rents (inside the landing data folder)
file_path = '../data/landing/2024_rent.csv'

# Write the frame as CSV; index=False leaves the row index out of the file
df.to_csv(path_or_buf=file_path, index=False)

# Preprocessing current data for visualization

In [120]:
# Load the listings enriched with nearest tram/train/bus stop and school
# distances (columns as shown in the preview further down).
current = pd.read_csv(
    filepath_or_buffer='../data/landing/property_with_nearest_distance.csv',
)

In [121]:
# Flag listings whose address contains a '/' (e.g. "1208/50 Albert Street"):
# Apartment = 1 when a '/' is present anywhere in the name, otherwise 0.
# Only the '/' character is tested, not a full "number/" pattern.
# The vectorised literal match (regex=False) treats a missing name (NaN) as
# "no slash" via na=False, whereas `'/' in x` raises TypeError on a float NaN.
current['Apartment'] = (
    current['name'].str.contains('/', regex=False, na=False).astype('int64')
)
current.head(3)

Unnamed: 0,name,Bed,weekly_rent,latitude,longitude,geometry,nearest_tram_stop,nearest_tram_stop_distance_km,nearest_train_stop,nearest_train_stop_distance_km,nearest_bus_stop,nearest_bus_stop_distance_km,nearest_school,nearest_school_distance_km,Apartment
0,"1208/50 Albert Street, South Melbourne VIC 3205",1,520.0,-37.834344,144.955904,POINT (2496118.3794249045 2407409.7981049973),127-South Melbourne Station/Light Rail (South ...,0.136816,67-Camberwell Girls Grammar/Burke Rd (Hawthorn...,1.863573,Ferrars St/Dorcas St (South Melbourne),0.03479,Galilee Regional Catholic Primary School,0.223329,1
1,"64 Mills Street, Albert Park VIC 3206",3,1495.0,-37.846426,144.958009,POINT (2496304.26476775 2406068.9190204293),135-Richardson St/Mills St (Middle Park),0.090635,49-Buchanan Ave/Doncaster Rd (Balwyn North),2.824165,Middle Park PS/Richardson St (Middle Park),0.095911,Middle Park Primary School,0.158652,0
2,"11 Barnato St, Weir Views VIC 3338",4,460.0,-37.718775,144.554187,POINT (2460697.114285021 2420145.1730066407),49-Central Park Ave/Cordite Ave (Maribyrnong),29.072242,46-Orrong Rd/Glenhuntly Rd (Elsternwick),17.817118,Hume Ave/Rees Rd (Melton South),1.047037,Al Iman College,1.632357,0


In [122]:
# Parse every address in 'name' into a (suburb, postcode) pair and spread the
# pairs over two columns. Building a frame from the pairs, instead of
# unpacking zip(*pairs), also works when `current` has no rows; the unpacking
# form raises "ValueError: not enough values to unpack" in that case.
current_location_parts = pd.DataFrame(
    current['name'].apply(extract_location_postcode).tolist(),
    columns=['Location', 'Postcode'],
    index=current.index,
)
current['Location'] = current_location_parts['Location']
current['Postcode'] = current_location_parts['Postcode']

# The raw address is no longer needed once suburb and postcode are split out
current = current.drop(columns=['name'])

# Display the modified dataframe
current.head(3)

Unnamed: 0,Bed,weekly_rent,latitude,longitude,geometry,nearest_tram_stop,nearest_tram_stop_distance_km,nearest_train_stop,nearest_train_stop_distance_km,nearest_bus_stop,nearest_bus_stop_distance_km,nearest_school,nearest_school_distance_km,Apartment,Location,Postcode
0,1,520.0,-37.834344,144.955904,POINT (2496118.3794249045 2407409.7981049973),127-South Melbourne Station/Light Rail (South ...,0.136816,67-Camberwell Girls Grammar/Burke Rd (Hawthorn...,1.863573,Ferrars St/Dorcas St (South Melbourne),0.03479,Galilee Regional Catholic Primary School,0.223329,1,South Melbourne,3205
1,3,1495.0,-37.846426,144.958009,POINT (2496304.26476775 2406068.9190204293),135-Richardson St/Mills St (Middle Park),0.090635,49-Buchanan Ave/Doncaster Rd (Balwyn North),2.824165,Middle Park PS/Richardson St (Middle Park),0.095911,Middle Park Primary School,0.158652,0,Albert Park,3206
2,4,460.0,-37.718775,144.554187,POINT (2460697.114285021 2420145.1730066407),49-Central Park Ave/Cordite Ave (Maribyrnong),29.072242,46-Orrong Rd/Glenhuntly Rd (Elsternwick),17.817118,Hume Ave/Rees Rd (Melton South),1.047037,Al Iman College,1.632357,0,Weir Views,3338


In [123]:
# Discard the coordinates, geometry, the names of the nearest stops/school and
# the suburb name. What remains (see preview) is Bed, weekly_rent, the four
# nearest-distance columns, Apartment and Postcode.
current = current.drop(
    [
        'latitude',
        'longitude',
        'geometry',
        'nearest_tram_stop',
        'nearest_train_stop',
        'nearest_bus_stop',
        'nearest_school',
        'Location',
    ],
    axis='columns',
)
current.head(3)

Unnamed: 0,Bed,weekly_rent,nearest_tram_stop_distance_km,nearest_train_stop_distance_km,nearest_bus_stop_distance_km,nearest_school_distance_km,Apartment,Postcode
0,1,520.0,0.136816,1.863573,0.03479,0.223329,1,3205
1,3,1495.0,0.090635,2.824165,0.095911,0.158652,0,3206
2,4,460.0,29.072242,17.817118,1.047037,1.632357,0,3338


In [124]:
# Destination for the visualisation dataset (inside the landing data folder)
file_path = '../data/landing/current_visual.csv'

# Write the frame as CSV; index=False leaves the row index out of the file
current.to_csv(path_or_buf=file_path, index=False)