In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
import os
import json
from fuzzywuzzy import process

# Read Files

In [2]:
# Base directory for the scraped Domain JSON files
landing_base_directory = "../../data/landing/domain_data"

# To use the newest domain data instead, uncomment the line below and comment out the line above.
# landing_base_directory = "../../data/landing/domain_data_new"
# Remember: using the newest domain data gives different results from the presentation and summary notebook!


def read_json_file(file_name):
    """Load one JSON file from the landing directory.

    Args:
        file_name: Name of the file, resolved against the module-level
            ``landing_base_directory`` with ``os.path.join``.

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    file_path = os.path.join(landing_base_directory, file_name)

    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [3]:
# Make sure the output folder for the converted Domain data exists,
# and report whether it had to be created.

folder_path = '../../data/raw/domain_data'

if os.path.exists(folder_path):
    print(f"Folder '{folder_path}' already exists.")
else:
    os.makedirs(folder_path)
    print(f"Folder '{folder_path}' created successfully.")

Folder '../../data/raw/domain_data' created successfully.


In [4]:
# Load the three scraped listing collections, one JSON file per property category.
house, apartment, town_house = map(
    read_json_file,
    ("house.json", "apartment.json", "town_house.json"),
)


## Files

In [5]:
# Crime statistics workbook; only the sheet named 'Table 03' is read.
criminal_file = '../../data/landing/other_data/Data_Tables_LGA_Criminal_Incidents_Year_Ending_March_2024.xlsx'
criminal_data = pd.read_excel(io=criminal_file, sheet_name='Table 03')

In [6]:
# ABS key statistics, one row per region.
data_by_region = pd.read_csv(
    filepath_or_buffer='../../data/landing/region_data/key_statistics/all_region_key_data.csv'
)
# To use the newest ABS region data instead, uncomment the line below and comment out the call above.
# data_by_region = pd.read_csv('../../data/landing/region_data/key_statistics/all_region_key_data_new.csv')
# Remember: using the newest ABS data gives different results from the presentation and summary notebook!

# Preprocessing

### JSON to CSV

In [7]:
def convert_json_to_csv(data, json_file, directory='../../data/raw/domain_data'):
    """Flatten scraped listing JSON into a DataFrame and save it as CSV.

    Args:
        data: Mapping of property URL -> dict of listing details. Missing
            keys fall back to ``'N/A'`` (``''`` for ``desc``).
        json_file: File name of the CSV written inside ``directory``
            (despite the parameter name, callers pass e.g. ``"house.csv"``).
        directory: Output folder, created when it does not exist yet.
            Defaults to the raw Domain data folder.

    Returns:
        pandas.DataFrame with one row per property and a fixed column order.
    """
    columns = [
        'property_url', 'name', 'property_type', 'cost_text', 'latitude',
        'longitude', 'bed_info', 'bath_info', 'parking', 'date_available', 'desc',
    ]
    flat_data = []

    # One flat record per listing
    for property_url, details in data.items():
        # A listing without a 'rooms' entry is treated like one with no rooms
        # instead of raising KeyError.
        rooms = details.get('rooms') or []
        bed_info = next((room for room in rooms if 'Bed' in room), 'N/A')
        bath_info = next((room for room in rooms if 'Bath' in room), 'N/A')

        # Guard against an explicit null description.
        description = details.get('desc') or ''

        flat_data.append({
            'property_url': property_url,
            'name': details.get('name', 'N/A'),
            'property_type': details.get('property_type', 'N/A'),
            'cost_text': details.get('cost_text', 'N/A'),
            'latitude': details.get('latitude', 'N/A'),
            'longitude': details.get('longitude', 'N/A'),
            'bed_info': bed_info,
            'bath_info': bath_info,
            'parking': details.get('parking', 'N/A'),
            'date_available': details.get('date_available', 'N/A'),
            # strip('</') removes any leading/trailing '<' and '/' characters
            # (a character set, not the literal substring "</").
            'desc': description.strip('</')
        })

    # Passing the column list keeps the header stable even for empty input.
    property_df = pd.DataFrame(flat_data, columns=columns)

    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, json_file)
    property_df.to_csv(file_path, index=False)
    print(f"Data successfully converted to CSV and saved at {file_path}")

    return property_df

In [8]:
# Flatten each listing collection and write it out under its own CSV name.
house_df, apartment_df, town_house_df = (
    convert_json_to_csv(listings, csv_name)
    for listings, csv_name in (
        (house, "house.csv"),
        (apartment, "apartment.csv"),
        (town_house, "town_house.csv"),
    )
)

Data successfully converted to CSV and saved at ../../data/raw/domain_data/house.csv
Data successfully converted to CSV and saved at ../../data/raw/domain_data/apartment.csv
Data successfully converted to CSV and saved at ../../data/raw/domain_data/town_house.csv


In [9]:
# Stack the three per-category frames (town houses, houses, apartments) into one
# frame with a fresh 0..n-1 index, then persist the combined raw table.
merged_data = pd.concat(
    objs=[town_house_df, house_df, apartment_df],
    ignore_index=True,
)
merged_data.to_csv(path_or_buf='../../data/raw/domain_data/properties_data1.csv', index=False)

In [10]:
# Get post code: the four digits that end the address string
merged_data['post_code'] = merged_data['name'].str.extract(r'(\d{4})$', expand=False)

# Get region: the text between the last comma and the post code, minus a trailing "VIC"
merged_data['region'] = merged_data['name'].str.extract(r',\s*([^,0-9]+)\s+\d{4}$', expand=False)
merged_data['region'] = merged_data['region'].str.replace(r'\s*VIC\s*$', '', regex=True)

# Get numbers from the bed and bath columns.
# Listings without a count (e.g. the 'N/A' placeholder) become NaN here rather than
# crashing an integer cast; they are removed by the dropna() below and the columns
# are cast to int afterwards.
merged_data['bed_info'] = pd.to_numeric(
    merged_data['bed_info'].str.extract(r'(\d+)', expand=False), errors='coerce'
)
merged_data['bath_info'] = pd.to_numeric(
    merged_data['bath_info'].str.extract(r'(\d+)', expand=False), errors='coerce'
)

# Get the number from the parking column and replace N/A with 0
merged_data['parking'] = merged_data['parking'].str.extract(r'(\d+)', expand=False)
merged_data['parking'] = merged_data['parking'].fillna(0).astype(int)

# Get the first number in the rent text (dropping "$", "weekly", thousands separators, ...)
# and convert it to float. The pattern requires at least one digit, so stray punctuation
# such as a lone "," or "." is skipped instead of being captured; anything that still
# cannot be parsed becomes NaN and is dropped below.
merged_data['cost_text'] = pd.to_numeric(
    merged_data['cost_text']
    .str.extract(r'([\d,\.]*\d[\d,\.]*)', expand=False)
    .str.replace(',', '', regex=False),
    errors='coerce',
).astype(float)

# Take the data within the reasonable rent range
merged_data = merged_data[merged_data['cost_text'].between(10, 5000)]

# Drop every row that has a missing value in any column
merged_data = merged_data.dropna()

# Safe now that rows without a bed/bath count are gone
merged_data = merged_data.astype({'bed_info': int, 'bath_info': int})


### Save files

In [11]:
# Persist the cleaned listings table.
merged_data.to_csv(path_or_buf='../../data/raw/domain_data/properties_data2.csv', index=False)

## Preprocessing ABS Data

In [12]:
# Strip hyphens and parentheses from the Region column in a single pass.
# Surrounding spaces are left as they are.
data_by_region['Region'] = data_by_region['Region'].str.replace(r'[-()]', '', regex=True)


In [13]:
# ABS columns retained as candidate predictors of housing cost:
# the region key followed by the numeric indicators.
columns_to_keep = ['Region'] + [
    'Children enrolled in a preschool or preschool program (no.)',
    'Estimated resident population (no.)',
    'Land area (ha)',
    'Median monthly household mortgage payment ($)',
    'Median price of established house transfers ($)',
    'Median total income (excl. Government pensions and allowances) ($)',
    'Median weekly household rental payment ($)',
    'Number of jobs',
    'Working age population (aged 15-64 years) (%)',
]


In [14]:
# Keep only the selected columns. The explicit .copy() makes this an independent
# frame, so the column assignments below never write into a slice of the original.
data_by_region = data_by_region[columns_to_keep].copy()

# For every indicator column:
#   1. keep the text after the last '/' and remove all spaces,
#   2. convert to numeric (anything unparsable becomes NaN),
#   3. fill NaN with the column's median.
for col in data_by_region.columns:
    if col == 'Region':
        continue

    # astype(str) lets the .str accessor work even when read_csv already parsed
    # the column as numeric (and when this cell is run a second time).
    as_text = (
        data_by_region[col]
        .astype(str)
        .str.split('/').str[-1]
        .str.strip()
        .str.replace(' ', '', regex=False)
    )
    as_number = pd.to_numeric(as_text, errors='coerce')

    # Assign the filled result back instead of fillna(inplace=True) on a selected
    # column: that form is chained assignment and has no effect under Copy-on-Write.
    data_by_region[col] = as_number.fillna(as_number.median())


In [15]:
# Independent copy of the cleaned listings; later cells add columns to it.
merged_data_cleaned = merged_data.copy(deep=True)

In [16]:
# Distinct region names on each side of the planned join
data_by_region_unique_regions = {*data_by_region['Region'].unique()}
merged_data_cleaned_unique_regions = {*merged_data_cleaned['region'].unique()}

# Names that appear on one side only
diff_in_data_by_region = data_by_region_unique_regions.difference(merged_data_cleaned_unique_regions)
diff_in_merged_data_cleaned = merged_data_cleaned_unique_regions.difference(data_by_region_unique_regions)

print(f"Regions that is present in data_by_region but not in merged_data_cleaned: {diff_in_data_by_region}")
print(f"Region that is cleaned in merged_data_cleaned but not in data_by_region: {diff_in_merged_data_cleaned}")

Regions that is present in data_by_region but not in merged_data_cleaned: {'Wilsons Promontory', 'Canadian  Mount Clear', 'Essendon Airport', 'Pakenham  North West', 'Endeavour Hills  North', 'Hampton Park  East', 'Mildura  South', 'Newtown Vic.', 'Moira', 'Mount Baw Baw Region', 'Malvern  Glen Iris', 'Longford  Loch Sport', 'Rochester', 'Koo Wee Rup', 'Orbost', 'Echuca', 'Phillip Island', 'Foster', 'Alps  West', 'Merbein', 'Woodend', 'Cranbourne North  West', 'Clyde North  North', 'Rushworth', 'Doreen  South', 'Moorabbin Airport', 'Kew  West', 'Healesville  Yarra Glen', 'Truganina  South West', 'Norlane', 'Wandin  Seville', 'Bright  Mount Beauty', 'Warrnambool  North', 'Yarriambiack', 'Craigieburn  Central', 'Richmond South  Cremorne', 'Creswick  Clunes', 'Traralgon  East', 'Lalor  West', 'Kinglake', 'Horsham', 'Narre Warren North', 'Ormond  Glen Huntly', 'Robinvale', 'Montrose', 'St Kilda  West', 'Glen Waverley  East', 'Otway', 'Hurstbridge', 'Berwick  North', 'Mount Waverley  North'

In [17]:
def get_closest_match(region, cleaned_regions, threshold=80):
    """Return the candidate name that best fuzzy-matches ``region``.

    Args:
        region: Region name to look up. Non-string values (for example NaN)
            are treated as "no match".
        cleaned_regions: Collection of candidate region names.
        threshold: The best score must be strictly greater than this value
            for the match to be accepted. Defaults to 80.

    Returns:
        The best-matching candidate, or ``None`` when there is no candidate
        or the best score does not exceed ``threshold``.
    """
    # fuzzywuzzy expects a string query; missing values cannot be matched.
    if not isinstance(region, str):
        return None

    best = process.extractOne(region, cleaned_regions)
    # extractOne can return None (for instance when there are no candidates).
    if best is None:
        return None

    # Index instead of unpacking: for mapping-style choices the result
    # carries a third element (the key).
    match, score = best[0], best[1]
    return match if score > threshold else None


In [18]:
# Fuzzy-match every ABS region name to the closest listing region.
# The candidate names are collected once, not once per row inside the lambda.
# NOTE(review): the 'matched_region' column stays on data_by_region after this cell,
# so a later merge with another frame that also has 'matched_region' will get
# suffixed column names - confirm that is intended.
listing_region_names = merged_data_cleaned['region'].unique()
data_by_region['matched_region'] = data_by_region['Region'].apply(
    lambda x: get_closest_match(x, listing_region_names)
)

# ABS regions for which no listing region scored high enough
unmatched_data = data_by_region[data_by_region['matched_region'].isna()]
print(f"unmatched region: {unmatched_data['Region'].unique()}")

# Left join keeps every ABS region, with or without matching listings
merged_final_geo = pd.merge(data_by_region, merged_data_cleaned, left_on='matched_region', right_on='region', how='left')
merged_final_geo = merged_final_geo.drop(columns=['matched_region'])


unmatched region: ['Smythes Creek' 'Creswick  Clunes' 'Daylesford' 'Gordon Vic.' 'Avoca'
 'Beaufort' 'Maryborough Vic.' 'Maryborough Surrounds' 'Maiden Gully'
 'Strathfieldsaye' 'Castlemaine' 'Castlemaine Surrounds' 'Heathcote'
 'Kyneton' 'Woodend' 'Loddon' 'Bannockburn' 'Charlemont' 'Norlane'
 'Clifton Springs' 'Lorne  Anglesea' 'Alexandra' 'Euroa'
 'Kilmore  Broadford' 'Mansfield Vic.' 'Nagambie' 'Seymour'
 'Seymour Surrounds' 'Yea' 'Benalla' 'Benalla Surrounds' 'Rutherglen'
 'Wangaratta' 'Wangaratta Surrounds' 'Towong' 'Yackandandah'
 'Trafalgar Vic.' 'Bairnsdale' 'Bruthen  Omeo' 'Orbost' 'Paynesville'
 'Foster' 'French Island' 'Korumburra' 'Leongatha' 'Phillip Island'
 'Wilsons Promontory' 'Morwell' 'Longford  Loch Sport' 'Maffra' 'Sale'
 'Braeside' 'Viewbank  Yallambie' 'Kingsbury' 'Hurstbridge' 'Kinglake'
 'Plenty  Yarrambat' 'Macedon' 'Romsey' 'Gowanbrae' 'Lysterfield'
 'The Basin' 'Belgrave  Selby' 'Monbulk  Silvan' 'Montrose'
 'Upwey  Tecoma' 'Wandin  Seville' 'Emerald  Cockat

In [19]:
# Make sure the listing region names are plain strings before fuzzy matching.
merged_data_cleaned = merged_data_cleaned.astype({'region': str})

In [20]:
# Adds a new column for merged_data_cleaned, storing the fuzzy matched region name.
# The ABS candidate names are collected once, not once per listing row inside the lambda.
abs_region_names = data_by_region['Region'].unique()
merged_data_cleaned['matched_region'] = merged_data_cleaned['region'].apply(
    lambda x: get_closest_match(x, abs_region_names)
)

# View data that is not successfully matched
unmatched_data = merged_data_cleaned[merged_data_cleaned['matched_region'].isna()]
print(f"The areas that not successfully matched: {unmatched_data['region'].unique()}")

# Merge two data sets using fuzzy matched region names.
# Left join: listings without a matched ABS region are kept with NaN ABS values.
# NOTE(review): if data_by_region also has a 'matched_region' column at this point,
# pandas suffixes both copies (_x / _y) in the result - confirm that is intended.
merged_final = pd.merge(merged_data_cleaned, data_by_region, left_on='matched_region', right_on='Region', how='left')

The areas that not successfully matched: ['Tallangatta' 'Aintree' 'Spotswood' 'Maidstone' 'Macleod' 'Baxter'
 'Brooklyn' 'Rippleside' 'Williams Landing' 'Whittington' 'Deanside'
 'Marshall' 'Beveridge' 'Safety Beach' 'Darley' 'Tyabb' 'Kalkallo'
 'Newington' 'Donnybrook' 'Heathmont' 'Tootgarook' 'Balaclava' 'Curlewis'
 'Mambourin' 'Portsea' 'Shoreham' 'Rye' 'Burnley' 'Merricks Beach'
 'Kooyong' 'Blairgowrie' 'Cobden' 'Elliminyt' 'Bonnie Brook'
 'Nar Nar Goon' 'Harkness' 'Ovens' 'Peterborough' 'Longlea' 'Albanvale'
 'Timboon' 'Bonshaw' 'Sorrento' 'Lucas' 'Ripponlea' 'Gardenvale'
 'Lower Plenty' 'Deepdene']


### Save files

In [21]:
# Persist the ABS-region-keyed join.
merged_final_geo.to_csv(path_or_buf='../../data/raw/domain_data/merged_final_geo.csv', index=False)

## Preprocessing Criminal Data

In [22]:
# Narrow the crime table to the columns used below.
# Note: this rebinds columns_to_keep, replacing any earlier list of the same name.
columns_to_keep = [
    'Year',
    'Year ending',
    'Local Government Area',
    'Postcode',
    'Suburb/Town Name',
    'Offence Division',
]
criminal_data = criminal_data.loc[:, columns_to_keep]

In [23]:
# Reporting window (inclusive); the divisor below is derived from it so the two
# cannot drift apart.
first_year, last_year = 2020, 2024
n_years = last_year - first_year + 1
crime_filtered = criminal_data[criminal_data['Year'].between(first_year, last_year)]

# Average number of crime records per year for each Postcode.
# NOTE(review): size() counts table rows per postcode; if the source sheet carries a
# per-row incident count, a sum of that column may be what is wanted - confirm.
crime_avg_by_postcode = (
    crime_filtered
    .groupby('Postcode')
    .size()
    .div(n_years)
    .reset_index(name='avg_crime_count')
)

# Join keys must have the same type on both sides
# NOTE(review): if Postcode was parsed as float, astype(str) yields e.g. '3000.0',
# which would not match the listing post codes - confirm the column is integer.
merged_final['post_code'] = merged_final['post_code'].astype(str)
crime_avg_by_postcode['Postcode'] = crime_avg_by_postcode['Postcode'].astype(str)

# Remove a previous result so re-running this cell does not create
# avg_crime_count_x / avg_crime_count_y (no-op on the first run).
merged_final = merged_final.drop(columns=['avg_crime_count'], errors='ignore')

# Merge the average annual crime count into merged_final
merged_final = pd.merge(merged_final, crime_avg_by_postcode, left_on='post_code', right_on='Postcode', how='left')

# Delete the redundant Postcode column
merged_final = merged_final.drop(columns=['Postcode'])

In [24]:
# (rows, columns) of merged_final after the crime averages were joined in
merged_final.shape

(2958, 26)

### Save Files

In [25]:
# Folder that receives the column-filtered crime table
folder_path = '../../data/raw/other_data'

# exist_ok=True makes this a no-op when the folder is already there
os.makedirs(name=folder_path, exist_ok=True)

# Write the crime table into that folder
criminal_data.to_csv(path_or_buf=f'{folder_path}/criminal_data.csv', index=False)

## Data Type

In [26]:
# How many columns there are of each dtype
type_counts = merged_final.dtypes.value_counts()
print("Data types and their column counts:", type_counts, sep="\n")


Data types and their column counts:
float64    13
object     10
int64       3
dtype: int64


In [27]:
# Cardinality of the two categorical columns that get encoded below
columns_to_modify = ['property_type', 'Region']
category_counts = merged_final[columns_to_modify].nunique()
print(
    "Number of unique categories for each column in columns_to_modify:",
    category_counts,
    sep="\n",
)

Number of unique categories for each column in columns_to_modify:
property_type      8
Region           291
dtype: int64


In [28]:
# Label-encode the ABS region name into 'region_encoded',
# then drop the original text column.
label_encoder = LabelEncoder()
merged_final = (
    merged_final
    .assign(region_encoded=label_encoder.fit_transform(merged_final['Region']))
    .drop(columns=["Region"])
)

In [29]:
# One-hot encode property_type (the first category is dropped as the baseline)
merged_final = pd.get_dummies(
    data=merged_final,
    columns=['property_type'],
    prefix='property_type',
    drop_first=True,
)

# Cast the dummy columns (every column whose name contains 'property_type_') to integers
property_type_columns = merged_final.filter(like='property_type_').columns.tolist()
merged_final[property_type_columns] = merged_final[property_type_columns].astype(int)

In [30]:
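# Alternative approach (kept for reference, not executed).
# Note: it relies on `property_df0`, which is not defined in the cells above.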
# diff_property_url_in_merged = ~merged_final['property_url'].isin(property_df0['property_url'])
# diff_index_in_merged = merged_final[diff_property_url_in_merged].index
# merged_final_test_cleaned = merged_final.drop(diff_index_in_merged)
# merged_final_test_cleaned_sorted = merged_final_test_cleaned.set_index('property_url').reindex(property_df0['property_url']).reset_index()

In [31]:
# Keep only rows that are complete in every column
merged_final = merged_final.dropna(axis=0, how='any')

In [32]:
# (rows, columns) of merged_final after encoding and dropping incomplete rows
merged_final.shape

(2843, 32)

### Save files

In [33]:
# to_csv does not create missing folders, and no earlier cell creates the curated
# folder, so make sure it exists before writing the final table.
os.makedirs('../../data/curated', exist_ok=True)
merged_final.to_csv('../../data/curated/properties_data3.csv', index=False)