In [1]:
import pandas as pd
import re
import ast

### Load Data

In [54]:
# Read the property listings CSV from the landing data folder into a DataFrame.
property_df = pd.read_csv("../data/landing/properties.csv")

### Basic Information About the Data

In [60]:
# Report, column by column, how many missing values the frame contains.
for column_name in property_df.columns:
    print(f"Number of NA in {column_name}: {property_df[column_name].isna().sum()}")

Number of NA in streetAddress: 0
Number of NA in price: 0
Number of NA in bedrooms: 0
Number of NA in bathrooms: 0
Number of NA in parking: 0
Number of NA in addressLocality: 0
Number of NA in addressRegion: 0
Number of NA in postalCode: 0
Number of NA in property_type: 0
Number of NA in nearby_schools: 0
Number of NA in nbn_type: 1389
Number of NA in age_distribution: 0
Number of NA in geo: 4278


### Required Functions

In [59]:
def clean_price(price, weeks_per_month=4.3):
    """Parse a free-text rental price and return it as a weekly amount.

    The first number found in ``price`` is taken as the amount; thousands
    separators (``1,200``) and a decimal part are accepted. When a monthly
    unit directly follows that number (``pcm``, ``p.c.m.``, ``per month``,
    ``per calendar month``, ``monthly``, ``/month``, ``/mth``) the amount is
    divided by ``weeks_per_month``. Anything else is treated as already weekly.

    Parameters
    ----------
    price : str, number or None
        Raw price value, e.g. ``"$550 per week"`` or ``"$2,390 pcm"``.
    weeks_per_month : float, default 4.3
        Divisor used to turn a monthly amount into a weekly one.

    Returns
    -------
    float or None
        Weekly price rounded to two decimals, or ``None`` when ``price`` is
        missing, empty, or contains no number.

    Raises
    ------
    ValueError
        If ``weeks_per_month`` is not positive.
    """
    if weeks_per_month <= 0:
        raise ValueError("weeks_per_month must be positive")

    if pd.isna(price) or price == '':
        return None

    text = str(price)

    # Locate the first numeric token (digits with optional thousands separators
    # and decimals).
    number_match = re.search(r'\d+(?:,\d+)*(?:\.\d+)?', text)
    if number_match is None:
        return None

    # Drop thousands separators and convert to a float.
    try:
        price_value = float(number_match.group(0).replace(',', ''))
    except ValueError:
        return None

    # A monthly unit only counts when it immediately follows the number, so a
    # string such as "$500 per week ($2,173 pcm)" stays weekly.
    monthly_unit = re.match(
        r'\s*(?:/\s*)?(?:pcm|p\.\s*c\.\s*m|per\s+(?:calendar\s+)?month|monthly)'
        r'|\s*/\s*(?:month|mth)',
        text[number_match.end():],
        re.IGNORECASE,
    )
    if monthly_unit:
        # Convert the monthly amount to a weekly amount.
        price_value /= weeks_per_month

    return round(price_value, 2)

def parse_location(location_str):
    """Turn a stringified location dict into a pandas Series.

    A missing value yields a Series whose four address fields
    (streetAddress, addressLocality, addressRegion, postalCode) are all None.
    Otherwise the string is evaluated with ``ast.literal_eval`` and the
    resulting mapping is wrapped in a Series as-is.
    """
    if not pd.isna(location_str):
        return pd.Series(ast.literal_eval(location_str))

    empty_address = dict.fromkeys(
        ['streetAddress', 'addressLocality', 'addressRegion', 'postalCode']
    )
    return pd.Series(empty_address)

def parse_age_dist(age_dist_str):
    """Reduce a stringified age-distribution dict to its dominant group.

    Parameters
    ----------
    age_dist_str : str or None
        String form of a dict mapping a group label to its share, where the
        share is either a percentage string (``'55%'``) or a plain number.

    Returns
    -------
    str or None
        ``None`` for a missing input; the first key whose share is at least
        50; otherwise the literal ``'balanced age'``.
    """
    if pd.isna(age_dist_str):
        return None

    age_dist_dict = ast.literal_eval(age_dist_str)
    for age_group, share in age_dist_dict.items():
        # str() lets numeric shares (55, 55.0) through as well as '55%';
        # calling .strip() directly on a number would raise AttributeError.
        share_pct = float(str(share).strip().rstrip('%'))
        if share_pct >= 50:
            return age_group

    return 'balanced age'

In [53]:
def clean_properties(property_df):
    """Return a cleaned copy of the raw property listings.

    Steps, in order:
      1. fill missing bedrooms / bathrooms / parking with 0;
      2. convert ``price`` with ``clean_price``;
      3. expand the ``location`` column into address columns with
         ``parse_location`` and drop ``location``;
      4. collapse ``age_distribution`` with ``parse_age_dist``;
      5. drop rows lacking price, streetAddress, addressLocality or postalCode;
      6. keep a fixed set of output columns in a fixed order.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    property_df : pandas.DataFrame
        Raw listings. Must still contain the ``location`` column.

    Returns
    -------
    pandas.DataFrame
        The cleaned listings.

    Raises
    ------
    KeyError
        If ``location`` is absent, which is what happens when an already
        cleaned frame is passed in again.
    """
    if 'location' not in property_df.columns:
        raise KeyError(
            "'location' column not found: clean_properties expects the raw "
            "frame (has this frame already been cleaned?)"
        )

    # Work on a copy so a failure part-way through cannot leave the caller's
    # frame half-cleaned.
    property_df = property_df.copy()

    count_columns = ['bedrooms', 'bathrooms', 'parking']
    property_df[count_columns] = property_df[count_columns].fillna(0)

    # price
    property_df['price'] = property_df['price'].apply(clean_price)

    # location
    location_df = property_df['location'].apply(parse_location)
    property_df = pd.concat([property_df.drop('location', axis=1), location_df], axis=1)

    # age distribution
    property_df['age_distribution'] = property_df['age_distribution'].apply(parse_age_dist)

    property_df = property_df.dropna(subset=['price', 'streetAddress', 'addressLocality', 'postalCode'])

    property_df = property_df[[
        'streetAddress', 'price', 'bedrooms', 'bathrooms', 'parking',
        'addressLocality', 'addressRegion', 'postalCode', 'property_type',
        'nearby_schools', 'nbn_type', 'age_distribution', 'geo'
    ]]

    return property_df
    

### Apply Cleaning and Feature Engineering

In [56]:
# Replace the raw frame with its cleaned version.
# NOTE: clean_properties drops the 'location' column, so re-running this cell
# without first reloading the CSV raises a KeyError.
property_df = clean_properties(property_df)

In [61]:
# Show the number of non-null values in each column of the cleaned frame.
property_df.count()

streetAddress       6606
price               6606
bedrooms            6606
bathrooms           6606
parking             6606
addressLocality     6606
addressRegion       6606
postalCode          6606
property_type       6606
nearby_schools      6606
nbn_type            5217
age_distribution    6606
geo                 2328
dtype: int64

In [62]:
# Write the cleaned listings to CSV; index=False leaves the row index out of the file.
output_path = "../data/raw/cleaned_properties.csv"
property_df.to_csv(output_path, index=False)