In [None]:
import pandas as pd
import numpy as np
import re
import time
import psutil
import tracemalloc
import os

# ---------------------- START: Performance Tracking ----------------------

# Trace Python-level allocations so the peak can be reported at the end.
tracemalloc.start()

# Wall-clock reference point for the elapsed-time figure.
start_time = time.time()

# Baseline CPU times and resident memory (RSS) of the current process.
cpu_start = psutil.cpu_times()
process = psutil.Process(os.getpid())
mem_start = process.memory_info().rss / (1024 * 1024)  # in MB

# ---------------------- Clean Data ----------------------

df = pd.read_csv('new_iproperty_dataset.csv')

# Clean Property Title: keep the text before the first comma, in Title Case.
df['Property Title'] = df['Property Title'].str.split(',').str[0].str.title()

# Clean Property Price: strip the "RM" marker and the thousands separators,
# then convert to a number (values that cannot be parsed become NaN).
df['Property Price'] = (
    df['Property Price']
    .str.replace('RM', '', regex=False)
    .str.replace(',', '', regex=False)
)
df['Property Price'] = pd.to_numeric(df['Property Price'], errors='coerce')
df.rename(columns={'Property Price': 'Property Price (RM)'}, inplace=True)

# Clean Property Location: split "City, State" into two columns.
# The split can yield fewer or more than two pieces (no ", " at all, or an
# extra ", " in one row); reindex() pins the result to exactly the first two
# pieces so that naming the columns below cannot fail with a length mismatch.
split_cols = (
    df['Property Location']
    .str.split(', ', expand=True)
    .reindex(columns=[0, 1])
)
split_cols.columns = ['Property Location (City)', 'Property Location (State)']
df.drop(columns=['Property Location'], inplace=True)
df.insert(2, 'Property Location (City)', split_cols['Property Location (City)'])
df.insert(3, 'Property Location (State)', split_cols['Property Location (State)'])

def parse_property_details(detail):
    """Split a raw "Property Details" string into type, size and furnishing.

    Parameters
    ----------
    detail : str or missing value
        Free-text details of one listing. Anything ``pd.isnull`` treats as
        missing (``None``, ``NaN``) is accepted.

    Returns
    -------
    pandas.Series
        Three positional values:
        0. property type (``str``) or ``None`` when it cannot be found,
        1. built-up size as ``float`` or ``None`` when it cannot be found,
        2. furnishing status: ``'Unfurnished'``, ``'Partially Furnished'``,
           ``'Fully Furnished'``, ``'Furnished'`` or ``'Unknown'``.
    """
    if pd.isnull(detail):
        # A missing description says nothing about furnishing, so report it
        # exactly like a description without any furnishing keyword.
        return pd.Series([None, None, "Unknown"])

    # Replace every run of non-ASCII characters (mis-encoded separators such
    # as "Â â€¢Â") with a single space.
    clean_detail = re.sub(r'[^\x00-\x7F]+', ' ', str(detail))

    # Type: everything before "Built-up", keeping only the part before the
    # first "|" separator.
    type_match = re.search(r"^(.*?)Built-up", clean_detail, re.IGNORECASE)
    if type_match:
        raw_type = type_match.group(1).strip()
        property_type = re.split(r'\s*\|\s*', raw_type)[0]
    else:
        property_type = None

    # Area: the first number (thousands separators allowed) that follows
    # "Built-up" and is itself followed by "sq ft" / "sq. ft.".
    area_match = re.search(
        r'Built[-\s]*up[^0-9]*([\d,]+)\s*sq\.?\s*ft', clean_detail, re.IGNORECASE
    )
    area = area_match.group(1).replace(',', '') if area_match else None

    # Furnishing status: the bare "Furnished" pattern goes last because the
    # two more specific phrases also contain that word.
    furnishing = "Unknown"
    if re.search(r'\bUnfurnished\b', clean_detail, re.IGNORECASE):
        furnishing = 'Unfurnished'
    elif re.search(r'\bPartially Furnished\b', clean_detail, re.IGNORECASE):
        furnishing = 'Partially Furnished'
    elif re.search(r'\bFully Furnished\b', clean_detail, re.IGNORECASE):
        furnishing = 'Fully Furnished'
    elif re.search(r'\bFurnished\b', clean_detail, re.IGNORECASE):
        furnishing = 'Furnished'

    return pd.Series([
        # An empty type string is treated as "not found".
        property_type.strip() if property_type else None,
        float(area) if area else None,
        furnishing,
    ])



# Expand the parsed details into three new columns, then discard the raw text.
parsed_columns = ['Property Type', 'Property Size (sqft)', 'Property Furnishing Status']
df[parsed_columns] = df['Property Details'].apply(parse_property_details)
df = df.drop(columns=['Property Details'])


def _blank_company_agent(name):
    """Return None when *name* contains "Sdn Bhd" (any case), else *name* unchanged."""
    if re.search(r'\bsdn\.?\s*bhd\.?\b', str(name), re.IGNORECASE):
        return None
    return name


# Clean Property Agent: Title Case, with "Sdn Bhd" names blanked out.
df['Property Agent'] = df['Property Agent'].str.title().apply(_blank_company_agent)

# Rows without a numeric price or without a property type are unusable.
df = df.dropna(subset=['Property Price (RM)', 'Property Type'])

# Keep sizes of at least 70 sqft (rows with a missing size fail the
# comparison and are removed as well), then remove exact duplicate rows.
df = df[df['Property Size (sqft)'] >= 70].drop_duplicates()

# Print sample result
print(df.head())

# ---------------------- END: Performance Tracking ----------------------

# Elapsed wall-clock time since start_time was taken.
end_time = time.time()
elapsed_time = end_time - start_time

# CPU times and resident memory after the run; peak is the second value
# reported by tracemalloc.
cpu_end = psutil.cpu_times()
mem_end = process.memory_info().rss / (1024 * 1024)  # in MB
_current_bytes, _peak_bytes = tracemalloc.get_traced_memory()
peak_mem = _peak_bytes / (1024 * 1024)  # in MB
tracemalloc.stop()

# Rows remaining after cleaning, and rows per second of elapsed time.
total_records = len(df)
if elapsed_time > 0:
    throughput = total_records / elapsed_time
else:
    throughput = 0

# ---------------------- OUTPUT ----------------------

print("\n📊 Performance Summary")
print(f"➡️ Total Records Processed: {total_records}")
print(f"⏱ Total Processing Time: {elapsed_time:.2f} seconds")
print(f"⚙️ Memory Used (Before → After): {mem_start:.2f} MB → {mem_end:.2f} MB")
print(f"🔺 Peak Memory Usage: {peak_mem:.2f} MB")
print(f"📈 Throughput: {throughput:.2f} records/second")

           Area                                 Property Title  \
0  perlis-zop7y  Semi D 2 Tingkat - Taman Jaya Diri - Seriab -   
1  perlis-zop7y    Semi D 1 Tingkat - Taman Nyu Indah 2 - Arau   
2  perlis-zop7y                           Taman Seri Manis Dua   
3  perlis-zop7y   Teres 1 Tingkat - Bandar Baharu Putra Height   
4  perlis-zop7y   Teres 1 Tingkat - Bandar Baharu Putra Height   

  Property Location (City) Property Location (State)  Property Price (RM)  \
0                   Kangar                    Perlis             775776.0   
1                     Arau                    Perlis             398000.0   
2                   Kangar                    Perlis             306000.0   
3                     Arau                    Perlis             185000.0   
4                     Arau                    Perlis             210000.0   

  Property Agent                                         Source URL  \
0         Haneef  https://www.iproperty.com.my/sale/perlis-zop7y...  

In [None]:
import pandas as pd
import numpy as np
import re
import time
import psutil
import tracemalloc
import os
import multiprocessing as mp


# parse details
def parse_property_details(detail):
    """Extract ``[type, size, furnishing]`` from one raw details string.

    A missing *detail* yields ``[None, None, "Unknown"]``. Otherwise the
    list holds the property type (``str`` or ``None``), the built-up size
    as ``float`` (or ``None``) and one of ``'Unfurnished'``,
    ``'Partially Furnished'``, ``'Fully Furnished'``, ``'Furnished'`` or
    ``'Unknown'``.
    """
    if pd.isnull(detail):
        return [None, None, "Unknown"]

    # Collapse each run of non-ASCII characters into one space.
    text = re.sub(r'[^\x00-\x7F]+', ' ', str(detail))

    # Type: text before "Built-up", cut at the first "|" separator.
    property_type = None
    leading = re.search(r"^(.*?)Built-up", text, re.IGNORECASE)
    if leading is not None:
        first_segment = re.split(r'\s*\|\s*', leading.group(1).strip())[0]
        if first_segment:
            property_type = first_segment.strip()

    # Size: the number after "Built-up" that is followed by "sq ft".
    size = None
    area_hit = re.search(r'Built[-\s]*up[^0-9]*([\d,]+)\s*sq\.?\s*ft', text, re.IGNORECASE)
    if area_hit is not None:
        digits = area_hit.group(1).replace(',', '')
        if digits:
            size = float(digits)

    # Furnishing: first matching pattern wins; the bare "Furnished" pattern
    # is tried last because the more specific phrases contain that word.
    furnishing_rules = (
        (r'\bUnfurnished\b', 'Unfurnished'),
        (r'\bPartially Furnished\b', 'Partially Furnished'),
        (r'\bFully Furnished\b', 'Fully Furnished'),
        (r'\bFurnished\b', 'Furnished'),
    )
    furnishing = "Unknown"
    for pattern, label in furnishing_rules:
        if re.search(pattern, text, re.IGNORECASE):
            furnishing = label
            break

    return [property_type, size, furnishing]

#parallel parse function
def parallel_parse(series, func, processes=None):
    """Apply *func* to every element of *series* in a process pool.

    Parameters
    ----------
    series : iterable
        Raw values to parse; typically a ``pandas.Series``.
    func : callable
        Picklable function that returns three values per element
        (type, size, furnishing status).
    processes : int, optional
        Number of worker processes. ``None`` (or 0) uses ``mp.cpu_count()``.

    Returns
    -------
    pandas.DataFrame
        One row per input element, in input order, with the columns
        'Property Type', 'Property Size (sqft)' and
        'Property Furnishing Status'. When *series* is a ``pandas.Series``
        the result carries the same index, so joining it back onto the
        source frame aligns rows correctly even if that index is not the
        default 0..n-1 range.
    """
    with mp.Pool(processes or mp.cpu_count()) as pool:
        results = pool.map(func, series)

    # Only a pandas Series has a row index worth preserving; note that a
    # plain list also has an ``index`` attribute (the method), hence the
    # explicit type check.
    row_index = series.index if isinstance(series, pd.Series) else None
    return pd.DataFrame(
        results,
        columns=['Property Type', 'Property Size (sqft)', 'Property Furnishing Status'],
        index=row_index,
    )

# Main
if __name__ == "__main__":
    # Baselines for the performance summary, taken before any work starts.
    tracemalloc.start()
    start_time = time.time()
    cpu_start = psutil.cpu_times()
    process = psutil.Process(os.getpid())
    mem_start = process.memory_info().rss / (1024 * 1024)  # MB

    # Load CSV
    df = pd.read_csv('new_iproperty_dataset.csv')

    # Clean Title: text before the first comma, in Title Case.
    df['Property Title'] = df['Property Title'].str.split(',').str[0].str.title()

    # Clean Price: strip "RM" and thousands separators, then convert to a
    # number (values that cannot be parsed become NaN).
    df['Property Price'] = df['Property Price'].str.replace('RM', '', regex=False)
    df['Property Price'] = df['Property Price'].str.replace(',', '', regex=False)
    df['Property Price'] = pd.to_numeric(df['Property Price'], errors='coerce')
    df.rename(columns={'Property Price': 'Property Price (RM)'}, inplace=True)

    # Clean Location: split "City, State" into two columns.
    # The split can yield fewer or more than two pieces (no ", " at all, or
    # an extra ", " in one row); reindex() pins the result to exactly the
    # first two pieces so naming the columns cannot fail on a length mismatch.
    split_cols = (
        df['Property Location']
        .str.split(', ', expand=True)
        .reindex(columns=[0, 1])
    )
    split_cols.columns = ['Property Location (City)', 'Property Location (State)']
    df.drop(columns=['Property Location'], inplace=True)
    df.insert(2, 'Property Location (City)', split_cols['Property Location (City)'])
    df.insert(3, 'Property Location (State)', split_cols['Property Location (State)'])

    # Multiprocessing Parse
    # concat(axis=1) aligns rows by index label; no rows have been dropped
    # yet, so df and parsed_df line up one-to-one.
    parsed_df = parallel_parse(df['Property Details'], parse_property_details)
    df = pd.concat([df.drop(columns=['Property Details']), parsed_df], axis=1)

    # Clean Agent: Title Case, with names containing "Sdn Bhd" blanked out.
    df['Property Agent'] = df['Property Agent'].str.title()
    df['Property Agent'] = df['Property Agent'].apply(
        lambda x: None if re.search(r'\bsdn\.?\s*bhd\.?\b', str(x), re.IGNORECASE) else x
    )

    # Drop NaNs in price and Null in property type
    df.dropna(subset=['Property Price (RM)'], inplace=True)
    df.dropna(subset=['Property Type'], inplace=True)

    # Drop illogical property size (lower than 70 sqft); rows with a missing
    # size fail the comparison and are removed as well.
    df = df[df['Property Size (sqft)'] >= 70]

    # Drop duplicates
    df.drop_duplicates(inplace=True)

    # Output
    print(df.head())

    # Performance Tracking
    end_time = time.time()
    elapsed_time = end_time - start_time
    cpu_end = psutil.cpu_times()
    mem_end = process.memory_info().rss / (1024 * 1024)
    # get_traced_memory() returns (current, peak); index 1 is the peak.
    peak_mem = tracemalloc.get_traced_memory()[1] / (1024 * 1024)
    tracemalloc.stop()

    # Rows remaining after cleaning, and rows per second of elapsed time.
    total_records = len(df)
    throughput = total_records / elapsed_time if elapsed_time > 0 else 0

    print("\n📊 Performance Summary")
    print(f"➡️ Total Records Processed: {total_records}")
    print(f"⏱ Total Processing Time: {elapsed_time:.2f} seconds")
    print(f"⚙️ Memory Used (Before → After): {mem_start:.2f} MB → {mem_end:.2f} MB")
    print(f"🔺 Peak Memory Usage: {peak_mem:.2f} MB")
    print(f"📈 Throughput: {throughput:.2f} records/second")


           Area                                 Property Title  \
0  perlis-zop7y  Semi D 2 Tingkat - Taman Jaya Diri - Seriab -   
1  perlis-zop7y    Semi D 1 Tingkat - Taman Nyu Indah 2 - Arau   
2  perlis-zop7y                           Taman Seri Manis Dua   
3  perlis-zop7y   Teres 1 Tingkat - Bandar Baharu Putra Height   
4  perlis-zop7y   Teres 1 Tingkat - Bandar Baharu Putra Height   

  Property Location (City) Property Location (State)  Property Price (RM)  \
0                   Kangar                    Perlis             775776.0   
1                     Arau                    Perlis             398000.0   
2                   Kangar                    Perlis             306000.0   
3                     Arau                    Perlis             185000.0   
4                     Arau                    Perlis             210000.0   

  Property Agent                                         Source URL  \
0         Haneef  https://www.iproperty.com.my/sale/perlis-zop7y...  

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import re
import time
import psutil
import tracemalloc
import os

# Start Spark session
spark = (
    SparkSession.builder
    .appName("iProperty Data Cleaning")
    .getOrCreate()
)

# Baselines for the performance summary.
tracemalloc.start()
start_time = time.time()
process = psutil.Process(os.getpid())
mem_start = process.memory_info().rss / (1024 * 1024)  # MB

# Read CSV (first line is the header; column types are inferred).
df = spark.read.csv("new_iproperty_dataset.csv", header=True, inferSchema=True)

# Property Title: text before the first comma, each word capitalised.
title_head = split(col("Property Title"), ",").getItem(0)
df = df.withColumn("Property Title", initcap(title_head))

# Property Price: remove "RM" and the thousands separators, cast to double,
# and replace the original column with the numeric one.
price_text = regexp_replace(regexp_replace(col("Property Price"), "RM", ""), ",", "")
df = df.withColumn("Property Price (RM)", price_text.cast("double"))
df = df.drop("Property Price")

# Property Location: first piece is the city, second piece is the state.
location_parts = split(col("Property Location"), ", ")
df = (
    df.withColumn("Property Location (City)", location_parts.getItem(0))
    .withColumn("Property Location (State)", location_parts.getItem(1))
    .drop("Property Location")
)

def parse_details(detail):
    """Extract ``(type, size, furnishing)`` from one raw details string.

    Parameters
    ----------
    detail : str or None
        Free-text details of one listing.

    Returns
    -------
    tuple
        ``(property_type, size, furnishing)`` where ``property_type`` is a
        ``str`` or ``None`` when it cannot be found (an empty type counts as
        not found), ``size`` is the built-up area as ``float`` or ``None``,
        and ``furnishing`` is one of ``'Unfurnished'``,
        ``'Partially Furnished'``, ``'Fully Furnished'``, ``'Furnished'`` or
        ``'Unknown'``.
    """
    if detail is None:
        return (None, None, "Unknown")

    # Replace each run of non-ASCII characters with a single space.
    detail = re.sub(r'[^\x00-\x7F]+', ' ', str(detail))

    # Type: everything before "Built-up", cut at the first "|" separator.
    type_match = re.search(r"^(.*?)Built-up", detail, re.IGNORECASE)
    if type_match:
        raw_type = type_match.group(1).strip()
        property_type = re.split(r'\s*\|\s*', raw_type)[0].strip()
    else:
        property_type = None

    # Area: the number after "Built-up" that is followed by "sq ft".
    area_match = re.search(r'Built[-\s]*up[^0-9]*([\d,]+)\s*sq\.?\s*ft', detail, re.IGNORECASE)
    area = area_match.group(1).replace(',', '') if area_match else None

    # Furnishing: default to "Unknown" so text without a keyword is reported
    # the same way as a missing description. The bare "Furnished" pattern is
    # tried last because the more specific phrases contain that word.
    furnishing = "Unknown"
    if re.search(r'\bUnfurnished\b', detail, re.IGNORECASE):
        furnishing = 'Unfurnished'
    elif re.search(r'\bPartially Furnished\b', detail, re.IGNORECASE):
        furnishing = 'Partially Furnished'
    elif re.search(r'\bFully Furnished\b', detail, re.IGNORECASE):
        furnishing = 'Fully Furnished'
    elif re.search(r'\bFurnished\b', detail, re.IGNORECASE):
        furnishing = 'Furnished'

    # ``or None`` turns an empty type into None so a null check on the type
    # also catches descriptions that start directly with "Built-up".
    return (property_type or None, float(area) if area else None, furnishing)

# Define schema for UDF output
# Name and Spark type of each value returned by parse_details, in order.
parsed_fields = (
    ("Property Type", StringType()),
    ("Property Size (sqft)", DoubleType()),
    ("Property Furnishing Status", StringType()),
)

# Struct schema for the UDF output; every field is nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in parsed_fields
])

# Wrap the Python parser as a UDF that returns the struct above.
parse_udf = udf(parse_details, schema)

# Parse into a temporary struct column ...
df = df.withColumn("parsed", parse_udf(col("Property Details")))

# ... promote each struct field to a top-level column ...
for field_name, field_type in parsed_fields:
    df = df.withColumn(field_name, col(f"parsed.{field_name}"))

# ... and drop the raw text together with the temporary struct.
df = df.drop("Property Details", "parsed")

# Clean Property Agent
def clean_agent(agent):
    """Return *agent* in Title Case, or None when it should be blanked.

    ``None`` stays ``None``; a name containing "Sdn Bhd" (any case, with or
    without dots) is also turned into ``None``.
    """
    if agent is not None and not re.search(r'\bsdn\.?\s*bhd\.?\b', agent, re.IGNORECASE):
        return agent.title()
    return None

# Register the UDF for cleaning agent names
# Register the UDF for cleaning agent names
clean_agent_udf = udf(clean_agent, StringType())
df = df.withColumn("Property Agent", clean_agent_udf(col("Property Agent")))

# Keep rows that have a price, a property type and a size of at least 70 sqft
# (a null size fails the comparison and is removed as well).
df = df.filter(col("Property Price (RM)").isNotNull())
df = df.filter(col("Property Type").isNotNull())
df = df.filter(col("Property Size (sqft)") >= 70)

# Drop duplication
df = df.dropDuplicates()

# The transformations above only build a query plan; work happens when an
# action runs. Cache the result and count it while the timer is still
# running, so the whole pipeline executes once, inside the timed window, and
# the later show() reads the cached rows instead of recomputing them.
df = df.cache()
total_records = df.count()

# Output
df.show(5, truncate=False)

# Performance Tracking
end_time = time.time()
elapsed_time = end_time - start_time
# NOTE(review): psutil and tracemalloc observe this Python process only;
# memory used by Spark outside of it is presumably not included - confirm.
mem_end = process.memory_info().rss / (1024 * 1024)  # in MB
# get_traced_memory() returns (current, peak); index 1 is the peak.
peak_mem = tracemalloc.get_traced_memory()[1] / (1024 * 1024)
tracemalloc.stop()

throughput = total_records / elapsed_time if elapsed_time > 0 else 0

print("\n📊 Performance Summary")
print(f"➡️ Total Records Processed: {total_records}")
print(f"⏱ Total Processing Time: {elapsed_time:.2f} seconds")
print(f"⚙️ Memory Used (Before → After): {mem_start:.2f} MB → {mem_end:.2f} MB")
print(f"🔺 Peak Memory Usage: {peak_mem:.2f} MB")
print(f"📈 Throughput: {throughput:.2f} records/second")


+-----------+-------------------+--------------+--------------------------------------------------------------+-------------------+------------------------+-------------------------+------------------------+--------------------+--------------------------+
|Area       |Property Title     |Property Agent|Source URL                                                    |Property Price (RM)|Property Location (City)|Property Location (State)|Property Type           |Property Size (sqft)|Property Furnishing Status|
+-----------+-------------------+--------------+--------------------------------------------------------------+-------------------+------------------------+-------------------------+------------------------+--------------------+--------------------------+
|kedah-x5i6n|Taman Pulasan      |Patricia Lau  |https://www.iproperty.com.my/sale/kedah-x5i6n/all-residential/|899000.0           |Alor Setar              |Kedah                    |Semi-detached House     |1500.0              |Furn