In [1]:
import pandas as pd
import re
import numpy as np

In [2]:
# Load the combined listings snapshot (scraped on 20 Sept).
# To run on newly scraped data instead, point file_path at the raw scrape
# (the path kept in the commented-out cell that follows) BEFORE the read below;
# changing file_path after this cell has run does not reload domain_data.
file_path = "../../data/manual/all_properties_combined.csv"
domain_data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
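# Path of the newly scraped data. After uncommenting the line below, re-run
# pd.read_csv(file_path) so that domain_data is actually reloaded from it.
# file_path = "../../data/raw/domain/all_properties_combined.csv"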

In [3]:
# (rows, columns) of the listings as loaded, before any cleaning.
domain_data.shape

(7459, 14)

In [4]:
# Discard rows that are exact duplicates across every column, keeping the
# first occurrence of each; the original index labels are retained.
domain_data = domain_data.drop_duplicates(keep="first")

# (rows, columns) remaining after de-duplication.
domain_data.shape

(6639, 14)

In [5]:
# Count the null entries in each column and print the per-column totals
# underneath a heading line.
missing_values = domain_data.isna().sum()
print("number of missing value in each column：", missing_values, sep="\n")

number of missing value in each column：
Address                           0
Cost                              0
Property Type                     0
Bedrooms                          0
Bathrooms                         0
Latitude                          0
Longitude                         0
Closest Gov Secondary School    860
Gov Secondary Distance          860
Age under 20                     78
Age 20-39                        78
Age 40-59                        78
Age 60+                          78
Postcode                          0
dtype: int64


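No rows need to be dropped here: the missing values are confined to the secondary-school and age-distribution columns, which are only required for some of the analyses.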

In [6]:
# Convert the free-text 'Cost' column into a numeric price.
#
# The amount after the first '$' is accepted either with thousands separators
# ("$1,200") or as a plain run of digits ("$1200"), optionally followed by a
# decimal part. The plain-digits alternative matters: a pattern that only
# allows 1-3 leading digits silently truncates "$1200" to 120.
_COST_PATTERN = re.compile(r'\$((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?)')


def _parse_cost(value):
    """Return the first dollar amount found in `value` as a float.

    Parameters
    ----------
    value : object
        A raw 'Cost' cell. Normally a string such as "$550 per week"; values
        that are already numeric are passed through so that re-running this
        cell does not fail or discard previously parsed prices.

    Returns
    -------
    float or None
        The parsed amount, or None when the cell is missing or contains no
        '$<number>' fragment.
    """
    if isinstance(value, str):
        match = _COST_PATTERN.search(value)
        if match is None:
            return None
        # Strip thousands separators before converting.
        return float(match.group(1).replace(',', ''))
    if pd.isna(value):
        return None
    if isinstance(value, (int, float, np.integer, np.floating)):
        # Already parsed (e.g. the cell was executed before): keep as-is.
        return float(value)
    return None


domain_data['Cost'] = domain_data['Cost'].apply(_parse_cost)

# Remove rows where 'Cost' is NaN (i.e., no price listed)
domain_data = domain_data.dropna(subset=['Cost'])

# Summary statistics of the numeric columns, to sanity-check the parsed prices.
domain_data.describe()

Unnamed: 0,Cost,Bedrooms,Bathrooms,Latitude,Longitude,Postcode
count,6426.0,6426.0,6426.0,6426.0,6426.0,6426.0
mean,601.937923,2.52521,1.43713,-37.760485,144.901705,3198.325086
std,475.794794,0.947034,0.569128,0.802389,2.618571,224.367865
min,100.0,1.0,0.0,-38.477668,0.0,3000.0
25%,460.0,2.0,1.0,-37.893692,144.903884,3058.0
50%,550.0,3.0,1.0,-37.822914,144.999046,3141.0
75%,675.0,3.0,2.0,-37.764778,145.109044,3201.0
max,32328.0,4.0,4.0,0.0,147.65588,3977.0


In [9]:
# (rows, columns) of domain_data after rows without a parsable price were dropped.
domain_data.shape

(6426, 14)

In [7]:
# Remove price outliers with an IQR fence whose width grows with sample size.
cost = domain_data['Cost']

# Quartiles and interquartile range of the price.
Q1, Q3 = cost.quantile(0.25), cost.quantile(0.75)
IQR = Q3 - Q1

# Fence multiplier sqrt(ln(N) - 0.5), where N is the number of records.
# The formula is meant for N > 100; no alternative is applied for smaller N.
N = len(domain_data)
multiplier = np.sqrt(np.log(N) - 0.5)

# Prices outside [lower_bound, upper_bound] are treated as outliers.
lower_bound = Q1 - multiplier * IQR
upper_bound = Q3 + multiplier * IQR

# Keep only the rows whose price lies within the fence (both ends inclusive).
filtered_domain_data = domain_data[cost.between(lower_bound, upper_bound)]

In [8]:
# (rows, columns) remaining once the price outliers have been removed.
filtered_domain_data.shape

(6283, 14)

In [12]:
# Number of distinct postcodes present in the outlier-filtered listings.
filtered_domain_data['Postcode'].nunique()

205

In [10]:
# Persist the outlier-filtered listings as CSV.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gains
# an unnamed first column holding the (non-contiguous) original row labels.
# Confirm how downstream readers handle it before switching to index=False.
output_path = "../../data/raw/domain/all_properties_preprocessed.csv"
filtered_domain_data.to_csv(output_path)