In [1]:
import pandas as pd
import re
import ast

## 1. Basic Data Engineering for Each Dataset

## 1.1 Property Data

### Load Data

In [2]:
# Load the property CSV from the landing directory into a DataFrame.
property_df = pd.read_csv(filepath_or_buffer="../data/landing/properties.csv")

### Basic Information Of Data

In [3]:
# Count the missing values of every column in a single pass, then print one
# line per column in the frame's column order.
na_counts = property_df.isna().sum()
for a_feature, na_count in na_counts.items():
    print(f"Number of NA in {a_feature}: {na_count}")

Number of NA in property_type: 1
Number of NA in geo: 4970
Number of NA in age_distribution: 0
Number of NA in nearby_schools: 0
Number of NA in parking: 1
Number of NA in agency: 0
Number of NA in nbn_type: 1966
Number of NA in price: 1411
Number of NA in bedrooms: 1
Number of NA in location: 1
Number of NA in area: 0
Number of NA in bathrooms: 1


### Required Functions

In [4]:
# A number (optional thousands separators and decimals), optionally followed
# by a rent-period marker. Besides the abbreviations "pw"/"pcm" and the
# "per week"/"per month" spellings, slash forms ("/week", "/month") and the
# adjectives "weekly"/"monthly" are recognised.
_PRICE_PATTERN = re.compile(
    r'(\d+(?:,\d+)*(?:\.\d+)?)\s*'
    r'(pw|pcm|per\s+week|per\s+month|/\s*week|/\s*month|weekly|monthly)?',
    re.IGNORECASE,
)


def clean_price(price, weeks_per_month=4.3):
    """Extract a weekly price from a free-text price value.

    The first number found in the text is taken as the amount. If it is
    directly followed by a monthly marker ("pcm", "per month", "/month",
    "monthly") the amount is converted to a weekly figure by dividing by
    ``weeks_per_month``; otherwise it is returned unchanged.

    Parameters
    ----------
    price : object
        Raw price value; it is converted with ``str()`` before matching.
    weeks_per_month : float, default 4.3
        Divisor used to turn a monthly amount into a weekly one. Must be
        non-zero.

    Returns
    -------
    float or None
        The weekly amount rounded to 2 decimals, or ``None`` when the value
        is missing, empty, or contains no number.
    """
    if pd.isna(price) or price == '':
        return None

    match = _PRICE_PATTERN.search(str(price))
    if match is None:
        return None

    # Group 1 only ever contains digits, commas and at most one decimal
    # point, so dropping the commas always leaves a valid float literal.
    amount = float(match.group(1).replace(',', ''))

    # Group 2 is None when no period marker follows the number.
    period = (match.group(2) or '').lower()
    if period == 'pcm' or 'month' in period:
        amount /= weeks_per_month

    return round(amount, 2)

def parse_location(location_str):
    """Turn a stringified location literal into a pandas Series.

    A missing value yields a Series holding the four address fields
    (streetAddress, addressLocality, addressRegion, postalCode), all set to
    None. Any other value is parsed with ``ast.literal_eval`` and wrapped in
    a Series as-is, so the resulting index is whatever the literal contains.
    """
    address_fields = ('streetAddress', 'addressLocality', 'addressRegion', 'postalCode')

    if pd.isna(location_str):
        # dict.fromkeys maps every field to None while keeping the order above.
        return pd.Series(dict.fromkeys(address_fields))

    parsed_location = ast.literal_eval(location_str)
    return pd.Series(parsed_location)

def parse_age_dist(age_dist_str):
    """Summarise a stringified age-distribution dict as a single label.

    The string is parsed with ``ast.literal_eval`` into a mapping whose
    values are percentage strings (surrounding whitespace and trailing '%'
    are removed before conversion to float). The first key, in mapping order,
    whose share is at least 50 is returned; if no key reaches 50 the label
    'balanced age' is returned. A missing input yields None.
    """
    if pd.isna(age_dist_str):
        return None

    shares_by_band = ast.literal_eval(age_dist_str)

    # Lazily scan the bands so parsing stops at the first majority band.
    return next(
        (
            band
            for band, share_text in shares_by_band.items()
            if float(share_text.strip().rstrip('%')) >= 50
        ),
        'balanced age',
    )

In [5]:
def clean_properties(property_df):
    """Return a cleaned copy of the property listings table.

    Steps, in order:
      1. Missing bedrooms / bathrooms / parking values are replaced with 0.
      2. ``price`` is normalised with ``clean_price``.
      3. ``location`` is expanded with ``parse_location`` into separate
         address columns and the original column is dropped.
      4. ``age_distribution`` is reduced to a label with ``parse_age_dist``.
      5. Rows lacking a price, street address, locality or postcode are
         dropped.
      6. Columns are restricted to, and ordered as, the list at the end.

    Parameters
    ----------
    property_df : pandas.DataFrame
        Raw listings; must contain the columns read below. It is NOT
        modified: all work happens on a copy, so the caller's frame can be
        passed to this function again.

    Returns
    -------
    pandas.DataFrame
        The cleaned listings.
    """
    # Work on a copy so the column assignments below never leak into the
    # caller's DataFrame.
    cleaned = property_df.copy()

    count_columns = ['bedrooms', 'bathrooms', 'parking']
    cleaned[count_columns] = cleaned[count_columns].fillna(0)

    # price
    cleaned['price'] = cleaned['price'].apply(clean_price)

    # location: one column per address field, replacing the raw column
    location_df = cleaned['location'].apply(parse_location)
    cleaned = pd.concat([cleaned.drop('location', axis=1), location_df], axis=1)

    # age distribution
    cleaned['age_distribution'] = cleaned['age_distribution'].apply(parse_age_dist)

    cleaned = cleaned.dropna(subset=['price', 'streetAddress', 'addressLocality', 'postalCode'])

    return cleaned[[
        'streetAddress', 'price', 'bedrooms', 'bathrooms', 'parking',
        'addressLocality', 'addressRegion', 'postalCode', 'property_type',
        'nearby_schools', 'nbn_type', 'age_distribution', 'geo'
    ]]
    

### Apply

In [6]:
# Rebind property_df to its cleaned version. The cleaned frame no longer has
# a 'location' column, so re-running this cell without reloading the CSV
# first raises a KeyError inside clean_properties.
property_df = clean_properties(property_df=property_df)

In [7]:
# Write the cleaned property table to CSV without the row index.
output_path = "../data/raw/cleaned_properties.csv"
property_df.to_csv(path_or_buf=output_path, index=False)

## 1.2 Other Data

### Load Data

In [8]:
# Load the four auxiliary CSVs from the landing directory; the unpacking
# order on the left matches the order of the paths below.
hospital_df, park_df, station_df, shopping_cen_df = map(
    pd.read_csv,
    (
        "../data/landing/hospital_health_services_addresses.csv",
        "../data/landing/park.csv",
        "../data/landing/stations_and_suburbs.csv",
        "../data/landing/victoria_shopping_centres.csv",
    ),
)

### Required Functions

In [9]:
def get_postcode(address):
    """Return the 4-digit postcode that ends an address string.

    Parameters
    ----------
    address : str or missing value
        Address text. Surrounding whitespace is ignored so that a trailing
        space or line break does not hide the postcode.

    Returns
    -------
    str or None
        The trailing run of exactly four digits, or ``None`` when the address
        is missing (None/NaN) or does not end with a standalone 4-digit
        number.
    """
    # Missing addresses arrive as NaN/None from pandas; re.search would raise
    # TypeError on them.
    if pd.isna(address):
        return None

    match = re.search(r'\b(\d{4})\b$', address.strip())
    return match.group(1) if match else None

In [10]:
def indicate_melbourne_suburbs(suburb):
    """Collapse a fixed set of names to 'Melbourne'.

    The comparison ignores case and surrounding whitespace. Names outside the
    set, as well as missing values, are returned exactly as received.
    NOTE(review): the listed names look like city-loop station names rather
    than suburbs -- confirm against the caller's data.
    """
    if pd.isna(suburb):
        return suburb

    names_mapped_to_melbourne = {
        'flagstaff',
        'parliament',
        'melbourne central',
        'flinders street',
        'southern cross',
    }
    normalised = suburb.lower().strip()
    return 'Melbourne' if normalised in names_mapped_to_melbourne else suburb

In [11]:
def get_suburb(region, address):
    """Derive a suburb name from a region label and a comma-separated address.

    The candidate suburb is the text after the last comma (the whole string
    when there is no comma), stripped of surrounding whitespace.

    * region 'Melbourne CBD': the candidate is kept only when it is
      'southbank' or 'docklands' (case-insensitive); anything else becomes
      'Melbourne'.
    * region 'Geelong': the candidate is used when the address contains a
      comma, otherwise 'Geelong'.
    * any other region: the candidate is used, cut at the first '(' if it
      has one.
    """
    def last_segment():
        # Text following the final comma, or the full address if none.
        return address.rpartition(',')[2].strip()

    if region == 'Melbourne CBD':
        candidate = last_segment()
        is_independent = candidate.lower().strip() in ('southbank', 'docklands')
        return candidate if is_independent else 'Melbourne'

    if region == 'Geelong':
        return last_segment() if ',' in address else 'Geelong'

    candidate = last_segment()
    if '(' not in candidate:
        return candidate
    # Drop a parenthesised qualifier that trails the suburb name.
    return candidate.partition('(')[0].strip()

### Apply

Hospital

In [12]:
# Derive postalCode from each hospital address, then relabel the address
# column so it stays distinguishable after the later merges.
hospital_df = (
    hospital_df
    .assign(postalCode=lambda frame: frame['Address'].apply(get_postcode))
    .rename(columns={'Address': 'hospital_address'})
)

In [13]:
# Write the cleaned hospital table to CSV without the row index.
output_path = "../data/raw/cleaned_hospital_health_services_addresses.csv"
hospital_df.to_csv(path_or_buf=output_path, index=False)

Park

In [14]:
# Derive postalCode from each park address, relabel the address column and
# discard the ID column.
park_df = (
    park_df
    .assign(postalCode=lambda frame: frame['Address'].apply(get_postcode))
    .rename(columns={'Address': 'park_address'})
    .drop(columns='ID')
)

In [15]:
# Write the cleaned park table to CSV without the row index.
output_path = "../data/raw/cleaned_park.csv"
park_df.to_csv(path_or_buf=output_path, index=False)

Station

In [16]:
# Build addressLocality from the original 'Station' column, drop that column,
# then give the 'Suburb' column the name 'Station'.
# NOTE(review): the locality comes from 'Station' while 'Suburb' is relabelled
# 'Station' -- confirm the two source columns are not swapped in the CSV.
station_df = (
    station_df
    .assign(addressLocality=lambda frame: frame['Station'].apply(indicate_melbourne_suburbs))
    .drop(columns='Station')
    .rename(columns={'Suburb': 'Station'})
)

In [17]:
# Write the cleaned station table to CSV without the row index.
output_path = "../data/raw/cleaned_stations_and_suburbs.csv"
station_df.to_csv(path_or_buf=output_path, index=False)

Shopping Center

In [18]:
# Compute addressLocality row by row from the region label and the
# 'Shopping Centre' text, then drop the region column.
shopping_cen_df = (
    shopping_cen_df
    .assign(
        addressLocality=lambda frame: frame.apply(
            lambda row: get_suburb(row['Region'], row['Shopping Centre']),
            axis=1,
        )
    )
    .drop(columns='Region')
)

In [19]:
# Write the cleaned shopping-centre table to CSV without the row index.
output_path = "../data/raw/cleaned_victoria_shopping_centres.csv"
shopping_cen_df.to_csv(path_or_buf=output_path, index=False)

## 2. Data Integration

In [20]:
# Left-join every auxiliary table onto the properties: hospitals and parks by
# postcode, stations and shopping centres by locality.
# NOTE(review): a left join repeats a property row once per matching row on
# the right, so non-unique keys in the auxiliary tables multiply the number
# of rows -- confirm this is intended.
final_df = (
    property_df
    .merge(hospital_df, on='postalCode', how='left')
    .merge(park_df, on='postalCode', how='left')
    .merge(station_df, on='addressLocality', how='left')
    .merge(shopping_cen_df, on='addressLocality', how='left')
)

In [21]:
# Write the combined table to CSV without the row index.
output_path = "../data/curated/combined_data.csv"
final_df.to_csv(path_or_buf=output_path, index=False)