In [1]:
from dask.distributed import Client
import dask.dataframe as dd
import pandas as pd
import numpy as np
import time
import psutil
import os

# ── Start Dask client ──
client = Client()

# ── Timer start ──
start_time = time.time()

# ── Load dataset ──
file_path = 'pets_combined.csv'  # Uploaded file
df = dd.read_csv(file_path)

# ── Split composite columns ──
df['Gender'] = df['Profile'].str.split(';').str.get(0)
df['Age'] = df['Profile'].str.split(';').str.get(1)

df['Body Size'] = df['Body'].str.split(';').str.get(0)
df['Fur Length'] = df['Body'].str.split(';').str.get(1)

df['Original Date'] = df['Posted'].str.split(';').str.get(0)
df['Updated Date'] = df['Posted'].str.split(';').str.get(1)
df['Updated Date'] = df['Updated Date'].fillna(df['Original Date'])

# ── Date conversion ──
df['Original Date'] = dd.to_datetime(df['Original Date'], errors='coerce')
df['Updated Date'] = dd.to_datetime(df['Updated Date'], errors='coerce')

# ── Yes/No columns ──
for col in ['Vaccinated', 'Dewormed', 'Spayed']:
    df[col] = df[col].map(lambda x: 1 if str(x).strip().lower() == 'yes' else 0, meta=('x', 'int64'))

# ── Amount column ──
df['Amount'] = df['Amount'].map(lambda x: pd.to_numeric(x, errors='coerce'), meta=('Amount', 'float64'))
df['Amount'] = df['Amount'].fillna(1).astype('int64')

# ── Price cleaning ──
def map_price(x):
    x = str(x).strip().upper()
    if x == 'FREE':
        return 0
    elif x.startswith('RM'):
        try:
            return int(x.replace('RM', '').strip())
        except:
            return np.nan
    else:
        return np.nan

df['Price'] = df['Price'].map(map_price, meta=('Price', 'float64'))

# ── Pet ID cleaning ──
df['Pet ID'] = df['Pet ID'].map(lambda x: pd.to_numeric(x, errors='coerce'), meta=('Pet ID', 'float64')).astype('int64')

# ── Execute computation ──
df_cleaned = df.compute()

# ── Performance Metrics ──
end_time = time.time()
elapsed = end_time - start_time
memory_usage = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)
cpu_usage = psutil.cpu_percent(interval=1)
throughput = len(df_cleaned) / elapsed if elapsed > 0 else 0

# ── Output ──
print("✅ Cleaned dataset shape:", df_cleaned.shape)
print(f"\n⏱️ Time taken: {elapsed:.2f} seconds")
print(f"🧠 Memory usage: {memory_usage:.2f} MB")
print(f"⚙️ CPU usage: {cpu_usage}%")
print(f"📈 Throughput: {throughput:.2f} records/sec")

df_cleaned.head()


INFO:distributed.http.proxy:To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:43937
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:8787/status
INFO:distributed.scheduler:Registering Worker plugin shuffle
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:41991'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:41929'
INFO:distributed.scheduler:Register worker addr: tcp://127.0.0.1:37707 name: 1
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:37707
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:57412
INFO:distributed.scheduler:Register worker addr: tcp://127.0.0.1:39621 name: 0
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:39621
INFO:distributed.core:Starting established connection to tcp://127

✅ Cleaned dataset shape: (13084, 24)

⏱️ Time taken: 4.68 seconds
🧠 Memory usage: 302.16 MB
⚙️ CPU usage: 10.0%
📈 Throughput: 2794.67 records/sec


Unnamed: 0,Pet ID,Name,Type,Species,Profile,Amount,Vaccinated,Dewormed,Spayed,Condition,...,Price,Uploader Type,Uploader Name,Status,Gender,Age,Body Size,Fur Length,Original Date,Updated Date
0,105500,Kitten 1 And Kitten 2,Dog,Mixed Breed,"Female, 4 Years 3 Months",1,0,1,0,Healthy,...,,Pet Merchant,Pet610,For Adoption,"Female, 4 Years 3 Months",,"Small Size, Medium Fur",,NaT,NaT
1,105501,Igor,Cat,Mixed Breed,"Male, 4 Years 3 Months",1,0,1,0,Healthy,...,,Pet Merchant,FurryMe,For Adoption,"Male, 4 Years 3 Months",,"Medium Size, Short Fur",,NaT,NaT
2,105502,PF105502,Cat,Mixed Breed,"Male, 6 Years 1 Month",1,0,0,0,Healthy,...,0.0,Owner,AdoptFriend,For Adoption,"Male, 6 Years 1 Month",,"Medium Size, Short Fur",,2021-03-29,2021-03-29
3,105503,Julie,Rabbit,Mixed Breed,"Female, 5 Years 7 Months",1,1,1,0,Healthy,...,0.0,Rescuer,Paws511,Adopted,"Female, 5 Years 7 Months",,"Medium Size, Medium Fur",,NaT,NaT
4,105504,Cookie,Cat,Mixed Breed,"Female, 4 Years 4 Months",1,0,1,1,Healthy,...,0.0,Rescuer,PawsFriend,Adopted,"Female, 4 Years 4 Months",,"Medium Size, Short Fur",,NaT,NaT
