In [None]:
import pandas as pd
import numpy as np
import re
import time
import psutil
import tracemalloc
import os

# Begin tracing Python-level allocations so the peak can be reported at the end
tracemalloc.start()

# Wall-clock reference point for the whole cleaning run
start_time = time.time()

# Snapshot the CPU times reported by psutil and this process's resident memory (MB)
cpu_start = psutil.cpu_times()
process = psutil.Process(os.getpid())
mem_start = process.memory_info().rss / 1024 ** 2

# ---------------------- Clean Data ----------------------

df = pd.read_csv('iproperty_listings_full.csv')

# Property Title: keep only the text before the first comma, then title-case it
df['Property Title'] = df['Property Title'].str.split(',').str[0].str.title()

# Property Price: remove 'RM' and the comma separators, then convert to numbers
# (anything that still fails to parse becomes NaN)
price_text = (
    df['Property Price']
    .str.replace('RM', '', regex=False)
    .str.replace(',', '', regex=False)
)
df['Property Price'] = pd.to_numeric(price_text, errors='coerce')
df = df.rename(columns={'Property Price': 'Property Price (RM)'})

# Property Location: split on ', ' into city and state columns, which take the
# place of the original column at positions 2 and 3
split_cols = df['Property Location'].str.split(', ', expand=True)
split_cols.columns = ['Property Location (City)', 'Property Location (State)']
df = df.drop(columns=['Property Location'])
for col_offset, col_name in enumerate(split_cols.columns):
    df.insert(2 + col_offset, col_name, split_cols[col_name])

def parse_property_details(detail):
    """Split a raw 'Property Details' string into type, size and furnishing.

    Parameters
    ----------
    detail : str or missing value
        Raw details text. Runs of non-ASCII characters (e.g. mis-encoded
        bullet separators) are treated as whitespace.

    Returns
    -------
    pd.Series
        Three positional values:
        0 - property type: the text before the "Built-up" label, or None
        1 - built-up size in sq. ft. as a float, or None
        2 - furnishing status: 'Unfurnished', 'Partially Furnished',
            'Fully Furnished', 'Furnished', or None when nothing matches
        A missing ``detail`` yields three None values.
    """
    if pd.isnull(detail):
        return pd.Series([None, None, None])

    # Replace each run of non-ASCII characters with a single space
    clean_detail = re.sub(r'[^\x00-\x7F]+', ' ', str(detail))

    # Type: everything before the built-up label. The label is matched with the
    # same tolerant spelling as the area pattern below ("Built-up", "Built up",
    # "Builtup"), so a row cannot yield a size while losing its type.
    type_match = re.search(r'^(.*?)Built[-\s]*up', clean_detail, re.IGNORECASE)
    property_type = None
    if type_match:
        # Drop surrounding whitespace and any trailing '|' separator
        property_type = type_match.group(1).strip().rstrip('|').strip() or None

    # Area: the first number (commas allowed) that follows the label and is
    # itself followed by "sq ft" / "sq. ft"
    area_match = re.search(
        r'Built[-\s]*up[^0-9]*([\d,]+)\s*sq\.?\s*ft', clean_detail, re.IGNORECASE
    )
    area = area_match.group(1).replace(',', '') if area_match else None

    # Furnishing: labels are tried in this fixed order and the first whole-word,
    # case-insensitive match wins
    furnishing = None
    for label in ('Unfurnished', 'Partially Furnished', 'Fully Furnished', 'Furnished'):
        if re.search(rf'\b{label}\b', clean_detail, re.IGNORECASE):
            furnishing = label
            break

    return pd.Series([
        property_type,
        float(area) if area else None,
        furnishing
    ])


# Expand every details string into three new columns (type, size, furnishing)
detail_columns = ['Property Type', 'Property Size (sqft)', 'Property Furnishing Status']
df[detail_columns] = df['Property Details'].apply(parse_property_details)

# The raw details column is redundant once it has been parsed
df.drop(columns=['Property Details'], inplace=True)

# Property Agent: title-case the name, then blank out entries that contain
# "Sdn Bhd" (with or without dots, any letter case)
df['Property Agent'] = df['Property Agent'].str.title().apply(
    lambda agent: agent
    if re.search(r'\bsdn\.?\s*bhd\.?\b', str(agent), re.IGNORECASE) is None
    else None
)

# Listings without a usable price are removed
df = df.dropna(subset=['Property Price (RM)'])

# Show a small sample of the cleaned frame
print(df.head())

# ---------------------- END: Performance Tracking ----------------------

# Elapsed wall-clock time since start_time was taken
end_time = time.time()
elapsed_time = end_time - start_time

# Closing CPU snapshot, resident memory (MB) and the peak traced by tracemalloc (MB)
cpu_end = psutil.cpu_times()
mem_end = process.memory_info().rss / 1024 ** 2
current_traced, peak_traced = tracemalloc.get_traced_memory()
peak_mem = peak_traced / 1024 ** 2
tracemalloc.stop()

# Row count of the cleaned frame and the resulting rows-per-second figure
total_records = len(df)
if elapsed_time > 0:
    throughput = total_records / elapsed_time
else:
    throughput = 0

# ---------------------- OUTPUT ----------------------

for summary_line in (
    "\n📊 Performance Summary",
    f"➡️ Total Records Processed: {total_records}",
    f"⏱ Total Processing Time: {elapsed_time:.2f} seconds",
    f"⚙️ Memory Used (Before → After): {mem_start:.2f} MB → {mem_end:.2f} MB",
    f"🔺 Peak Memory Usage: {peak_mem:.2f} MB",
    f"📈 Throughput: {throughput:.2f} records/second",
):
    print(summary_line)


           Area                                 Property Title  \
0  perlis-zop7y  Semi D 2 Tingkat - Taman Jaya Diri - Seriab -   
1  perlis-zop7y    Semi D 1 Tingkat - Taman Nyu Indah 2 - Arau   
2  perlis-zop7y                           Taman Seri Manis Dua   
3  perlis-zop7y   Teres 1 Tingkat - Bandar Baharu Putra Height   
4  perlis-zop7y   Teres 1 Tingkat - Bandar Baharu Putra Height   

  Property Location (City) Property Location (State)  Property Price (RM)  \
0                   Kangar                    Perlis             775776.0   
1                     Arau                    Perlis             398000.0   
2                   Kangar                    Perlis             306000.0   
3                     Arau                    Perlis             185000.0   
4                     Arau                    Perlis             210000.0   

  Property Agent                                         Source URL  \
0         Haneef  https://www.iproperty.com.my/sale/perlis-zop7y...  

In [None]:
# Write the cleaned listings to a CSV file without the DataFrame index column
df.to_csv('iproperty_listings_cleaned.csv', index=False)

In [None]:
# %pip installs into the environment of the running kernel. The versions are
# pinned to the ones this notebook was executed with so a re-run is reproducible;
# the quotes keep the shell from interpreting the [ray] extras brackets.
%pip install "modin[ray]==0.32.0" "ray==2.46.0"

Collecting ray
  Downloading ray-2.46.0-cp311-cp311-manylinux2014_x86_64.whl.metadata (19 kB)
Collecting modin[ray]
  Downloading modin-0.32.0-py3-none-any.whl.metadata (17 kB)
Downloading ray-2.46.0-cp311-cp311-manylinux2014_x86_64.whl (68.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.5/68.5 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading modin-0.32.0-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: modin, ray
Successfully installed modin-0.32.0 ray-2.46.0


In [None]:
import modin.pandas as pd
import pandas
import numpy as np
import re
import time
import psutil
import tracemalloc
import os
import ray

# Start a local Ray runtime (tolerating one that is already initialised) and
# select Ray as the Modin engine through the environment
ray.init(ignore_reinit_error=True)
os.environ["MODIN_ENGINE"] = "ray"

# Begin tracing Python-level allocations so the peak can be reported at the end
tracemalloc.start()

# Wall-clock reference point for the whole cleaning run
start_time = time.time()

# Snapshot the CPU times reported by psutil and this process's resident memory (MB)
cpu_start = psutil.cpu_times()
process = psutil.Process(os.getpid())
mem_start = process.memory_info().rss / 1024 ** 2

# ---------------------- Clean Data ----------------------

df = pd.read_csv('iproperty_listings_full.csv')

# The 'Area' column is renamed to 'Area Code'
df = df.rename(columns={'Area': 'Area Code'})

# Property Title: keep only the text before the first comma, then title-case it
df['Property Title'] = df['Property Title'].str.split(',').str[0].str.title()

# Property Price: remove 'RM' and the comma separators, then convert to numbers
# (anything that still fails to parse becomes NaN)
price_text = (
    df['Property Price']
    .str.replace('RM', '', regex=False)
    .str.replace(',', '', regex=False)
)
df['Property Price'] = pd.to_numeric(price_text, errors='coerce')
df = df.rename(columns={'Property Price': 'Property Price (RM)'})

# Property Location: split on ', ' into city and state columns, which take the
# place of the original column at positions 2 and 3
split_cols = df['Property Location'].str.split(', ', expand=True)
split_cols.columns = ['Property Location (City)', 'Property Location (State)']
df = df.drop(columns=['Property Location'])
for col_offset, col_name in enumerate(split_cols.columns):
    df.insert(2 + col_offset, col_name, split_cols[col_name])

def parse_property_details(detail):
    """Split a raw 'Property Details' string into type, size and furnishing.

    Parameters
    ----------
    detail : str or missing value
        Raw details text. Runs of non-ASCII characters (e.g. mis-encoded
        bullet separators) are treated as whitespace.

    Returns
    -------
    tuple
        ``(property_type, size_sqft, furnishing)`` where
        - property_type is the first '|'-separated segment of the text before
          the "Built-up" label, or None
        - size_sqft is the built-up size as a float, or None
        - furnishing is 'Unfurnished', 'Partially Furnished',
          'Fully Furnished', 'Furnished', or 'Unknown' when nothing matches
        A missing ``detail`` yields ``(None, None, None)``.
    """
    if pd.isnull(detail):
        # Same plain tuple shape as every other path, so callers can rely on a
        # single return type and no Series object is built per missing row
        return (None, None, None)

    # Replace each run of non-ASCII characters with a single space
    clean_detail = re.sub(r'[^\x00-\x7F]+', ' ', str(detail))

    # Type: text before the built-up label. The label is matched with the same
    # tolerant spelling as the area pattern below ("Built-up", "Built up",
    # "Builtup"), so a row cannot yield a size while losing its type.
    type_match = re.search(r'^(.*?)Built[-\s]*up', clean_detail, re.IGNORECASE)
    if type_match:
        raw_type = type_match.group(1).strip()
        # Only the first '|'-separated segment is kept as the type
        property_type = re.split(r'\s*\|\s*', raw_type)[0]
    else:
        property_type = None

    # Area: the first number (commas allowed) that follows the label and is
    # itself followed by "sq ft" / "sq. ft"
    area_match = re.search(
        r'Built[-\s]*up[^0-9]*([\d,]+)\s*sq\.?\s*ft', clean_detail, re.IGNORECASE
    )
    area = area_match.group(1).replace(',', '') if area_match else None

    # Furnishing: labels are tried in this fixed order and the first whole-word,
    # case-insensitive match wins
    furnishing = "Unknown"
    for label in ('Unfurnished', 'Partially Furnished', 'Fully Furnished', 'Furnished'):
        if re.search(rf'\b{label}\b', clean_detail, re.IGNORECASE):
            furnishing = label
            break

    return (
        property_type.strip() if property_type else None,
        float(area) if area else None,
        furnishing
    )

# FALLBACK: the row-by-row regex parsing is run in plain pandas, not Modin
# Step 1: materialise just this one column as a pandas Series
property_details_pandas = df['Property Details']._to_pandas()

# Step 2: parse every row into three values and rebuild them as a frame.
# The source index is reused so the parsed rows stay aligned with `df` when they
# are assigned back, instead of relying on `df` still having a default 0..n-1 index.
parsed_details = property_details_pandas.apply(parse_property_details)
parsed_details = pd.DataFrame(
    parsed_details.tolist(),
    index=property_details_pandas.index,
    columns=['Property Type', 'Property Size (sqft)', 'Property Furnishing Status'],
)

# Step 3: Assign back to the Modin DataFrame
df[['Property Type', 'Property Size (sqft)', 'Property Furnishing Status']] = parsed_details

# The raw details column is redundant once it has been parsed
df = df.drop(columns=['Property Details'])

# Property Agent: title-case the name, then blank out entries that contain
# "Sdn Bhd" (with or without dots, any letter case)
df['Property Agent'] = df['Property Agent'].str.title()
df['Property Agent'] = df['Property Agent'].apply(
    lambda x: None if re.search(r'\bsdn\.?\s*bhd\.?\b', str(x), re.IGNORECASE) else x
)

# Drop rows with a missing price or a missing property type
df.dropna(subset=['Property Price (RM)'], inplace=True)
df.dropna(subset=['Property Type'], inplace=True)

# Keep only sizes of at least 70 sqft. Rows whose size is missing are dropped
# here as well, because a NaN comparison evaluates to False.
df = df[df['Property Size (sqft)'] >= 70]

# Drop exact duplicate rows
df.drop_duplicates(inplace=True)

# Show a small sample of the cleaned frame
print(df.head())

# ---------------------- END: Performance Tracking ----------------------

# Elapsed wall-clock time since start_time was taken
end_time = time.time()
elapsed_time = end_time - start_time

# Closing CPU snapshot, resident memory (MB) and the peak traced by tracemalloc (MB)
cpu_end = psutil.cpu_times()
mem_end = process.memory_info().rss / (1024 * 1024)  # in MB
peak_mem = tracemalloc.get_traced_memory()[1] / (1024 * 1024)  # Peak memory usage in MB
tracemalloc.stop()

# Row count of the frame after all filtering, and rows per second over the run
total_records = len(df)
throughput = total_records / elapsed_time if elapsed_time > 0 else 0

# ---------------------- OUTPUT ----------------------

print("\n📊 Performance Summary")
print(f"➡️ Total Records Processed: {total_records}")
print(f"⏱ Total Processing Time: {elapsed_time:.2f} seconds")
print(f"⚙️ Memory Used (Before → After): {mem_start:.2f} MB → {mem_end:.2f} MB")
print(f"🔺 Peak Memory Usage: {peak_mem:.2f} MB")
print(f"📈 Throughput: {throughput:.2f} records/second")


2025-05-08 10:31:54,650	INFO worker.py:1888 -- Started a local Ray instance.
Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.


      Area Code                                 Property Title  \
0  perlis-zop7y  Semi D 2 Tingkat - Taman Jaya Diri - Seriab -   
1  perlis-zop7y    Semi D 1 Tingkat - Taman Nyu Indah 2 - Arau   
2  perlis-zop7y                           Taman Seri Manis Dua   
3  perlis-zop7y   Teres 1 Tingkat - Bandar Baharu Putra Height   
4  perlis-zop7y   Teres 1 Tingkat - Bandar Baharu Putra Height   

  Property Location (City) Property Location (State)  Property Price (RM)  \
0                   Kangar                    Perlis             775776.0   
1                     Arau                    Perlis             398000.0   
2                   Kangar                    Perlis             306000.0   
3                     Arau                    Perlis             185000.0   
4                     Arau                    Perlis             210000.0   

  Property Agent                                         Source URL  \
0         Haneef  https://www.iproperty.com.my/sale/perlis-zop7y...  

In [None]:
import modin.config as modin_cfg
# Print the execution engine Modin is currently configured to use
print(modin_cfg.Engine.get())

Ray
