In [None]:
# High Performance Data Cleaning - JobLib

!pip install joblib

import time
import psutil
import os

# Baseline snapshot for the performance report: wall-clock start time, a psutil
# handle to this kernel's process, a CPU-percent reading, and the process RSS
# converted from bytes to MB.
start_time = time.time()
process = psutil.Process(os.getpid())
cpu_start = psutil.cpu_percent(interval=None)
rss_bytes_at_start = process.memory_info().rss
mem_before = rss_bytes_at_start / (1024 * 1024)  # bytes -> MB

In [None]:
import sqlite3
import sqlite3
import pandas as pd
import polars as pl

# Load the raw listing CSV with Polars, then hand it to pandas.
raw_df = pl.read_csv("new_iproperty_dataset.csv")
raw_pd_df = raw_df.to_pandas()

# Persist the untouched data as table "raw_data" in a local SQLite file;
# if_exists="replace" overwrites any table left by a previous run.
conn = sqlite3.connect("iproperty.db")
raw_pd_df.to_sql(name="raw_data", con=conn, if_exists="replace", index=False)

# Read the table back so that `df` is sourced from the database copy.
df = pd.read_sql_query(sql="SELECT * FROM raw_data", con=conn)



In [None]:
import time
import psutil
import os
import sqlite3
import pandas as pd
import polars as pl
import numpy as np
import re

from joblib import Parallel, delayed

# --- Performance baseline: wall-clock start and process RSS in MB ---
start_time = time.time()
process = psutil.Process(os.getpid())
mem_before = process.memory_info().rss / (1024 * 1024)

# --- Load: CSV -> Polars -> pandas -> SQLite ("raw_data") -> pandas ---
raw_df = pl.read_csv("new_iproperty_dataset.csv")
raw_pd_df = raw_df.to_pandas()

conn = sqlite3.connect("iproperty.db")
raw_pd_df.to_sql("raw_data", conn, if_exists="replace", index=False)

df = pd.read_sql_query("SELECT * FROM raw_data", conn)

# --- Column: Area -> Area Code ---
df = df.rename(columns={'Area': 'Area Code'})

# --- Column: Property Title ---
# Keep only the text before the first comma, then title-case it.
title_head = df['Property Title'].str.split(',').str[0]
df['Property Title'] = title_head.str.title()

# --- Column: Property Price ---
# Strip the "RM" marker and thousands separators, convert to a number
# (anything unparseable becomes NaN), then rename the column to carry the unit.
price_text = df['Property Price'].str.replace('RM', '', regex=False)
price_text = price_text.str.replace(',', '', regex=False)
df['Property Price'] = pd.to_numeric(price_text, errors='coerce')
df = df.rename(columns={'Property Price': 'Property Price (RM)'})

# --- Column: Property Location ---
# Split on ", " and keep the first two pieces as city and state, inserted at
# column positions 2 and 3 in place of the original combined column.
split_cols = df['Property Location'].str.split(', ', expand=True)
df = df.drop(columns=['Property Location'])
df.insert(2, 'Property Location (City)', split_cols[0])
df.insert(3, 'Property Location (State)', split_cols[1])

# Define parsing function
# Patterns used by parse_property_details, named once so the parsing rules are
# visible in one place.
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
# Same "Built-up" spelling tolerance (hyphen, spaces, or nothing) for both the
# type prefix and the area, so a row cannot yield a size but lose its type.
_TYPE_RE = re.compile(r'^(.*?)Built[-\s]*up', re.IGNORECASE)
# The number must start with a digit (a lone "," is not a size) and may carry
# thousands separators and an optional decimal part.
_AREA_RE = re.compile(
    r'Built[-\s]*up[^0-9]*(\d[\d,]*(?:\.\d+)?)\s*sq\.?\s*ft', re.IGNORECASE
)
# Checked in order: the specific labels must be tested before plain "Furnished".
_FURNISHING_RULES = (
    (re.compile(r'\bUnfurnished\b', re.IGNORECASE), 'Unfurnished'),
    (re.compile(r'\bPartially Furnished\b', re.IGNORECASE), 'Partially Furnished'),
    (re.compile(r'\bFully Furnished\b', re.IGNORECASE), 'Fully Furnished'),
    (re.compile(r'\bFurnished\b', re.IGNORECASE), 'Furnished'),
)


def parse_property_details(detail):
    """Extract property type, built-up size and furnishing status from free text.

    Parameters
    ----------
    detail : str or null-like
        Raw "Property Details" value. Runs of non-ASCII characters are replaced
        by a single space before matching.

    Returns
    -------
    list
        ``[property_type, area, furnishing]`` where

        * ``property_type`` is the text before "Built-up", cut at the first
          ``|``; ``None`` when "Built-up" is absent or nothing precedes it.
        * ``area`` is the built-up size as a ``float`` (thousands separators
          removed); ``None`` when no "<number> sq ft" follows "Built-up".
        * ``furnishing`` is one of ``'Unfurnished'``, ``'Partially Furnished'``,
          ``'Fully Furnished'``, ``'Furnished'`` or ``'Unknown'``.

        A null ``detail`` yields ``[None, None, None]``.
    """
    if pd.isnull(detail):
        return [None, None, None]

    clean_detail = _NON_ASCII_RE.sub(' ', str(detail))

    property_type = None
    type_match = _TYPE_RE.search(clean_detail)
    if type_match:
        leading = re.split(r'\s*\|\s*', type_match.group(1).strip())[0]
        # An empty prefix is "no type", not a type named ''.
        property_type = leading or None

    area = None
    area_match = _AREA_RE.search(clean_detail)
    if area_match:
        area = float(area_match.group(1).replace(',', ''))

    furnishing = "Unknown"
    for pattern, label in _FURNISHING_RULES:
        if pattern.search(clean_detail):
            furnishing = label
            break

    return [property_type, area, furnishing]

# Parallel cleaning: parse each 'Property Details' value with joblib
# (n_jobs=-1 asks joblib to use every available CPU).
results = Parallel(n_jobs=-1)(delayed(parse_property_details)(d) for d in df['Property Details'])

# Build the parsed columns on df's own index so the column assignment below
# lines up row-for-row whatever index df carries, and so an empty df still
# produces three (empty) columns instead of a length-mismatch error.
parsed_cols = ['Property Type', 'Property Size (sqft)', 'Property Furnishing Status']
parsed = pd.DataFrame(results, columns=parsed_cols, index=df.index)
df[parsed_cols] = parsed

# Drop original column
df = df.drop(columns=['Property Details'])

# Clean Agent: title-case the name and blank out entries that look like a
# company ("Sdn Bhd") rather than a person.
df['Property Agent'] = df['Property Agent'].str.title()
df['Property Agent'] = df['Property Agent'].apply(
    lambda x: None if re.search(r'\bsdn\.?\s*bhd\.?\b', str(x), re.IGNORECASE) else x
)

# Final cleaning steps: require a price and a type, keep sizes of at least
# 70 sqft (rows with a missing size fail the comparison and are dropped too),
# then remove exact duplicate rows. Written as reassignments rather than
# inplace=True so no in-place mutation is applied to a filtered slice.
df = df.dropna(subset=['Property Price (RM)', 'Property Type'])
df = df[df['Property Size (sqft)'] >= 70].drop_duplicates()

# Save cleaned data
df.to_sql("cleaned_data", conn, if_exists="replace", index=False)

# End tracking
end_time = time.time()
mem_after = process.memory_info().rss / (1024 * 1024)
total_runtime = end_time - start_time
# NOTE: mem_used is the RSS difference (after - before), not a sampled peak,
# even though the summary line below labels it "Peak Memory Usage".
mem_used = mem_after - mem_before
# NOTE: this counts the rows remaining after cleaning, not the rows read.
total_records = df.shape[0]
throughput = total_records / total_runtime if total_runtime > 0 else 0

# Fancy output
print("📊 Performance Summary")
print(f"📦 Total Records Processed: {total_records}")
print(f"⏱️ Total Processing Time: {total_runtime:.2f} seconds")
print(f"🛠️ Memory Used (Before ➜ After): {mem_before:.2f} MB ➜ {mem_after:.2f} MB")
print(f"🔺 Peak Memory Usage: {mem_used:.2f} MB")
print(f"📈 Throughput: {throughput:.2f} records/second")

# Release the SQLite handle now that the last read/write is done.
conn.close()


📊 Performance Summary
📦 Total Records Processed: 144214
⏱️ Total Processing Time: 20.19 seconds
🛠️ Memory Used (Before ➜ After): 182.28 MB ➜ 567.96 MB
🔺 Peak Memory Usage: 385.68 MB
📈 Throughput: 7141.52 records/second
