In [0]:
import pandas as pd

# Hand-written sample of subscription records. Some rows are deliberately
# flawed (see the note next to each row) so the later cleaning cell has
# something to act on.
_FIELDS = ("user_id", "status", "date", "monthly_fee", "age")

_ROWS = (
    ("U001", "Active", "2026-02-01", 16.99, 24),        # valid
    ("U002", "Inactive", "2026-01-15", 27.99, -7),      # invalid age (negative)
    (None, "Active", "2026-02-03", 49.99, 30),          # missing user_id
    ("U001", "ACTIVE", "2026-02-04", 16.99, 28),        # duplicate user_id
    ("U003", "inactive", "2026-01-20", None, 22),       # valid (monthly_fee is None)
    ("U004", "Active", "2026-02-05", 27.99, "twenty"),  # invalid age (string)
    (None, "Inactive", "2026-01-25", 23.99, 35),        # missing user_id
    ("U003", "Active", "2026-02-06", 44.99, 40),        # duplicate user_id
    ("U005", "Inactive", "2026-01-30", 28.99, 27),      # valid
    ("U006", "Active", "2026-02-02", 17.99, 250),       # unrealistic age
)

# One dict per record, keys in the order given by _FIELDS.
raw_data = [dict(zip(_FIELDS, row)) for row in _ROWS]

# Tabular view of the records, plus an on-disk CSV copy (index column omitted).
df = pd.DataFrame(data=raw_data)
df.to_csv("/tmp/subscription.csv", index=False)

 Clean and validate data

In [0]:
# Cleaning pipeline for the subscription records held in `df`.
# Order of operations:
#   1. drop rows without a user_id, keep only the first row per user_id
#   2. normalise status casing and parse the date column
#   3. fill missing monthly_fee with the median of the rows that were kept
#   4. coerce age to numeric and blank out values outside the 0..120 range
# Rows are filtered first so that records which are discarded anyway
# (no id / duplicate id) cannot influence the imputed fee.

# Drop records where user_id is missing or duplicated (first occurrence wins)
df = (
    df.dropna(subset=["user_id"])
    .drop_duplicates(subset=["user_id"], keep="first")
    .copy()  # independent frame for the column assignments below
)

# Standardise status to lowercase
df["status"] = df["status"].str.lower()

# Convert the date column to proper datetime (unparseable values → NaT)
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# Replace missing monthly_fee with the median fee of the retained rows
median_fee = df["monthly_fee"].median()
df["monthly_fee"] = df["monthly_fee"].fillna(median_fee)

# Ensure age is numeric (invalid → NaN), then discard values below 0 or
# above 120; `between` is False for NaN, so already-missing ages stay missing.
df["age"] = pd.to_numeric(df["age"], errors="coerce")
df["age"] = df["age"].where(df["age"].between(0, 120))

Engineer new analytical features and answer business questions

In [0]:
# Feature engineering on `df`. Adds four columns:
#   subscription_length_days, is_active_subscription, months_active,
#   revenue_estimate.

# NOTE(review): `datetime` is not referenced in this cell; the import is kept
# in case other cells rely on it — confirm before removing.
from datetime import datetime

# Hard-coded reference date (not the current date), so the derived values do
# not change from one run to the next.
today = pd.Timestamp("2026-02-06")

# subscription_length_days: whole days between each record's date and `today`.
# A missing date (NaT) produces a missing value here.
df["subscription_length_days"] = (today - df["date"]).dt.days

# is_active_subscription (1/0)
df["is_active_subscription"] = (df["status"] == "active").astype(int)

# months_active: number of completed 30-day periods.
# - Floor division keeps missing day counts as NaN, where a plain
#   `.astype(int)` would raise on them.
# - Day counts are clipped at 0 first, so a date after `today` counts as zero
#   months instead of a negative number (and negative revenue).
whole_months = df["subscription_length_days"].clip(lower=0) // 30
# Nullable integer dtype: whole numbers, with <NA> where the date was missing.
df["months_active"] = whole_months.astype("Int64")

# revenue_estimate = monthly_fee × months_active
# Computed from the plain numeric series so the column stays float64
# (NaN where months_active is missing).
df["revenue_estimate"] = df["monthly_fee"] * whole_months


Integrate an external public API with error handling

Load cleaned data into a relational database