In [8]:
import pandas as pd

## A. Pre-process the price column

In [9]:
# Load the curated Domain listings; the file is decoded as latin1.
df = pd.read_csv(
    "../data/curated/domain_del.csv",
    encoding='latin1',
)

In [10]:
# Show the column labels of the loaded frame.
df.columns

Index(['id', 'promoType', 'price', 'hasVideo', 'agentNames', 'brandName',
       'addressStreet', 'addressSuburb', 'addressState', 'addressPostcode',
       'addressLat', 'addressLng', 'num_bath', 'type', 'formatted', 'isRural',
       'landSize', 'Retirement'],
      dtype='object')

In [11]:
# Number of records (non-null ids) before any cleaning of the price column;
# kept so it can be compared with the count after filtering.
price_columns_before = df.loc[:, 'id'].count()

# Look at the raw, free-text price values.
df.loc[:, 'price']

0       $500 per week
1       $540 per Week
2             $650.00
3       $560 per week
4                $600
            ...      
4909    $470 per week
4910             $600
4911             $575
4912    $530 per week
4913             $530
Name: price, Length: 4914, dtype: object

## Step 1: Extract the price with a regex

In [12]:
# Accepted price range. Per the Homes Victoria rents for the third quarter of 2023,
# records outside $170 ~ $2400 are filtered out.
PRICE_LOWER_BOUND = 170
PRICE_UPPER_BOUND = 2400

# Pull the first numeric amount out of the free-text price.
# The pattern accepts either a comma-grouped number ("1,200") or a plain run of digits,
# each with an optional decimal part, so "$1,200 per week" yields 1200 instead of 1
# and "$650.00" yields 650.0. Text without any digits (e.g. "Inspection only") gives NaN.
# astype(str) lets the cell be re-run after the column has already become numeric.
price_text = (
    df['price']
    .astype(str)
    .str.extract(r'(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)', expand=False)
    .str.replace(',', '', regex=False)  # drop thousands separators before parsing
)

# Convert to float; anything unparseable is coerced to NaN.
df['price'] = pd.to_numeric(price_text, errors='coerce')

# Keep only prices inside the accepted range (bounds inclusive). NaN prices, caused by
# text-only values or missing records, fail the comparison and are dropped as well.
df = df[df['price'].between(PRICE_LOWER_BOUND, PRICE_UPPER_BOUND)]

In [13]:
# Display the price column as it stands after the previous cell's processing.
df['price']

0       500.0
1       540.0
2       650.0
3       560.0
4       600.0
        ...  
4909    470.0
4910    600.0
4911    575.0
4912    530.0
4913    530.0
Name: price, Length: 4402, dtype: float64

In [14]:
# Report the record count before and after the price cleaning.
print(f"Before cleaning on price columns, there are {price_columns_before} records in the main dataset.")
remaining_records = df['id'].count()
print(f"After cleaning on price columns, there are {remaining_records} records in the main dataset.")

Before cleaning on price columns, there are 4914 records in the main dataset.
After cleaning on price columns, there are 4402 records in the main dataset.


## Save the dataset

In [15]:
# Write the price-cleaned frame to the curated folder, without the index column.
df.to_csv(
    '../data/curated/Price_clean_domain.csv',
    index=False,
)