In [0]:
import random
import uuid
import datetime
from faker import Faker

from pyspark.sql import Row
from pyspark.sql import functions as F


In [0]:
# One seed drives both random sources so a Restart & Run All regenerates the
# same values for everything except medicine_id (uuid.uuid4() is not seedable).
SEED = 42

fake = Faker()
# Faker keeps its own generator; seeding only the stdlib `random` module leaves
# fake.word(), fake.bothify() and fake.date_between() different on every run.
Faker.seed(SEED)
random.seed(SEED)

TOTAL_ROWS = 1_000_000  # change to 100_000 for testing

# Single table describing every medicine category:
#   (category name, (min price, max price), symptom it treats)
# Both lookup dicts below are derived from it, so their keys cannot drift apart.
category_specs = [
    ("Painkiller", (20, 120), "Body Pain / Fever"),
    ("Antibiotic", (50, 350), "Infection"),
    ("Antacid", (40, 200), "Gastric Trouble"),
    ("Cough & Cold", (60, 180), "Cold / Cough"),
    ("Enzymes", (80, 250), "Digestion"),
    ("BP", (90, 400), "Hypertension"),
    ("Diabetes", (100, 500), "Diabetes"),
    ("Immune Booster", (70, 300), "Low Immunity"),
    ("Anti-Allergy", (60, 200), "Allergy"),
    ("Anti-Spasmodic", (70, 220), "Stomach Pain"),
]

# category -> (lowest price, highest price)
categories = {name: price_range for name, price_range, _symptom in category_specs}

# category -> symptom label
symptoms_map = {name: symptom for name, _price_range, symptom in category_specs}

# Manufacturers a medicine can be attributed to.
companies = [
    "Aristo",
    "Zydus",
    "Sanofi",
    "Lupin",
    "Sun Pharma",
    "Torrent",
    "Dr Reddy",
    "Glenmark",
]

# Dosage forms a medicine can take.
medicine_forms = [
    "Tablet",
    "Capsule",
    "Syrup",
    "Injection",
    "Drops",
]


In [0]:
def random_mfg_date():
    """Pick a random manufacturing date.

    Delegates to the notebook-level Faker instance, asking it for a date
    between 1 January 2021 and 31 December 2024.

    Returns:
        Whatever ``fake.date_between`` returns for that window.
    """
    return fake.date_between(
        datetime.date(2021, 1, 1),
        datetime.date(2024, 12, 31),
    )

def expiry_from_mfg(mfg_date):
    """Return an expiry date a whole number of calendar years after ``mfg_date``.

    The shelf life is drawn uniformly from 1 to 4 years (one ``random.randint``
    call). The expiry keeps the manufacturing month and day, so a value
    formatted as ``%Y-%m`` always lands in the same month as the manufacturing
    date. Adding ``365 * years`` days instead drifts backwards by one day per
    leap day crossed (e.g. 2024-01-01 + 365 days is 2024-12-31).

    Args:
        mfg_date: A ``datetime.date`` (or ``datetime.datetime``) manufacturing date.

    Returns:
        An object of the same type as ``mfg_date`` with the year advanced by
        1-4. A 29 February manufacturing date maps to 28 February when the
        target year is not a leap year.
    """
    years = random.randint(1, 4)
    target_year = mfg_date.year + years
    try:
        return mfg_date.replace(year=target_year)
    except ValueError:
        # Only reachable for 29 February when the target year has no leap day.
        return mfg_date.replace(year=target_year, day=28)


In [0]:
# Build every synthetic medicine record on the driver as a list of Rows.
category_list = list(categories)
rows = []

for _ in range(TOTAL_ROWS):
    # The random/Faker calls below are made in a fixed order for every record:
    # category, manufacturing date, shelf life, then the fields top to bottom.
    category = random.choice(category_list)
    price_min, price_max = categories[category]

    mfg = random_mfg_date()
    expiry = expiry_from_mfg(mfg)

    # Dict literal values are evaluated top to bottom, and the key order
    # becomes the field order passed to Row.
    record = {
        "medicine_id": str(uuid.uuid4()),
        "medicine_brand_name": fake.word().upper(),
        "medicine_combination": fake.word().capitalize(),
        "manufacturing_company": random.choice(companies),
        "category": category,
        "symptom": symptoms_map[category],
        "medicine_form": random.choice(medicine_forms),
        # Dates are stored at month precision as "YYYY-MM" strings.
        "mfg_date": mfg.strftime("%Y-%m"),
        "expiry_date": expiry.strftime("%Y-%m"),
        # bothify pattern: two "?" placeholders followed by five "#" placeholders.
        "batch_number": fake.bothify("??#####"),
        # Integer price drawn from the category's (min, max) range, both ends included.
        "price_including_gst": random.randint(price_min, price_max),
    }
    rows.append(Row(**record))


In [0]:
# Turn the driver-side list of Rows into a Spark DataFrame; no explicit schema
# is passed, so column types come from the Row values.
df = spark.createDataFrame(data=rows)

# Quick inspection: column names/types, then the first five records.
df.printSchema()
df.show(n=5)


In [0]:
%sql
-- Make sure the volume workspace.default.pharmacy_data exists before the later
-- cells write under /Volumes/workspace/default/pharmacy_data.
-- IF NOT EXISTS keeps this cell safe to re-run.
CREATE VOLUME IF NOT EXISTS workspace.default.pharmacy_data;


In [0]:
# Destination directory for the Delta copy of the generated data.
delta_path = "/Volumes/workspace/default/pharmacy_data/generated_delta"

# "overwrite" replaces whatever a previous run left at delta_path.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save(delta_path)
)


In [0]:
# Destination directory for the CSV export of the generated data.
csv_path = "/Volumes/workspace/default/pharmacy_data/generated_csv"

# coalesce(1) collapses the data to a single partition before writing;
# the export includes a header row and replaces any previous output.
(
    df.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv(csv_path)
)


In [0]:
# Read the Delta output back from delta_path and display how many rows it holds.
spark.read.load(delta_path, format="delta").count()
