# Hard 1 — Optimize Pandas Workloads

We'll compare row‑wise `apply` against vectorized column arithmetic, then reduce memory usage by downcasting numeric dtypes.

In [None]:

import pandas as pd
import numpy as np
import time

# Row count: kept modest so the cell executes quickly; scale up locally for
# deeper benchmarking.
n = 100_000

# Fixed seed so every run produces the same synthetic data.
rng = np.random.default_rng(42)

# Columns are drawn in a, b, c order from the single seeded generator:
# "a" and "b" are integers in [0, 1000), "c" is floats in [0, 1).
df = pd.DataFrame(
    dict(
        a=rng.integers(low=0, high=1000, size=n),
        b=rng.integers(low=0, high=1000, size=n),
        c=rng.random(size=n),
    )
)

def timeit(fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)`` once and measure how long the call takes.

    Args:
        fn: Callable to run.
        *args: Positional arguments forwarded to ``fn``.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        A ``(out, elapsed)`` tuple: the value returned by ``fn`` and the
        elapsed time in seconds as a float.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations; time.time() follows the wall clock, which
    # can be adjusted mid-run and may have coarse resolution.
    start = time.perf_counter()
    out = fn(*args, **kwargs)
    return out, time.perf_counter() - start

# Row-wise apply (slow)
def slow_apply(d):
    """Compute ``a * 2 + b - c`` one row at a time with ``DataFrame.apply``.

    This is the deliberately slow baseline: the Python-level helper below is
    invoked once per row (``axis=1``).

    Args:
        d: DataFrame with columns ``"a"``, ``"b"`` and ``"c"``.

    Returns:
        The per-row results as returned by ``d.apply(..., axis=1)``.
    """
    def combine(row):
        return row["a"] * 2 + row["b"] - row["c"]

    return d.apply(combine, axis=1)

# Vectorized (fast)
def fast_vec(d):
    """Compute ``a * 2 + b - c`` with whole-column arithmetic.

    Same formula as the row-wise version, but expressed on entire columns so
    no Python-level function is called per row.

    Args:
        d: DataFrame with columns ``"a"``, ``"b"`` and ``"c"``.

    Returns:
        The element-wise result of ``d["a"] * 2 + d["b"] - d["c"]``.
    """
    doubled_a = d["a"] * 2
    with_b = doubled_a + d["b"]
    return with_b - d["c"]

# Time each implementation once on its own copy of the frame. The copies are
# created while the arguments are evaluated, i.e. before the timer starts, so
# copying is not part of either measurement.
res_apply, t_apply = timeit(slow_apply, df.copy())
res_vec, t_vec = timeit(fast_vec, df.copy())

# A speed comparison is only meaningful if both paths compute the same values,
# so keep the results and check them instead of discarding them.
np.testing.assert_allclose(res_vec.to_numpy(), res_apply.to_numpy())

print(f"Row-wise apply time: {t_apply:.3f}s")
print(f"Vectorized time   : {t_vec:.3f}s")

# Memory optimization via downcasting
# Baseline footprint in bytes, summed over everything memory_usage reports.
before = df.memory_usage(deep=True).sum()

# Downcast on a copy so the original frame keeps its dtypes.
# NOTE(review): downcast="float" may store "c" in a narrower float type,
# trading precision for size — confirm that is acceptable for real data.
df_opt = df.copy()
for col, target in (("a", "unsigned"), ("b", "unsigned"), ("c", "float")):
    df_opt[col] = pd.to_numeric(df_opt[col], downcast=target)

after = df_opt.memory_usage(deep=True).sum()

# Report both footprints in MB (bytes / 1e6) and the relative reduction.
print(f"Memory before: {before/1e6:.2f} MB")
print(f"Memory after : {after/1e6:.2f} MB")
print("Savings      :", f"{(1 - after/before)*100:.1f}%")



**Takeaways:**
- Prefer vectorized arithmetic, `.where`, `.mask`, and `np.select` over row‑wise `apply`.
- Downcast numeric dtypes and convert repeated strings to `category` to cut memory; note that downcasting floats (e.g. `float64` → `float32`) trades precision for size.
- Use `.assign`, `.eval`, `.query` for concise, sometimes faster expressions.
- Chunk large CSVs (`chunksize=`) and consider Parquet for faster IO.
