In [3]:
#!/usr/bin/env python3
"""
Generate per-campaign distribution lists (CSV) using the *exact* campaign name
from the DB in the output filenames.

Output path pattern (Windows/Linux-safe):
  data/distribution_list_<EXACT CAMPAIGN NAME>.csv

Eligibility rule: include recipients that do NOT have events in
{unsubscribe, complaint, deleted_or_spam}.

Notes
-----
- We preserve the campaign name exactly for the CSV filename. Only characters
  invalid on Windows ("<>:\"/\\|?*") are stripped. Spaces and parentheses are kept.
- By default, suppression is per-campaign. You can enable cross-campaign
  suppression so that any negative event suppresses the email globally.
- If your Streamlit app writes to a path missing ".csv", Pandas will treat the
  *dirname* as the parent directory. Ensure you append ".csv" in your app.
"""

from __future__ import annotations

import os
import re
import sqlite3
from contextlib import closing
from dataclasses import dataclass
from typing import Dict, List, Optional, Set

import pandas as pd


# ──────────────────────────────────────────────────────────────
# Configuration
# ──────────────────────────────────────────────────────────────

# SQLite files opened by read_tables(); they hold the `campaigns`,
# `email_map` and `event_log` tables respectively.
PATH_CAMPAIGNS_DB: str = "campaigns.db"
PATH_EMAIL_MAP_DB: str = "email_map.db"
PATH_EMAIL_EVENTS_DB: str = "email_events.db"

# Negative events to suppress: values of event_log.event_type that
# compute_negative_msg_ids() counts against a message.
NEGATIVE_EVENTS: Set[str] = {"unsubscribe", "complaint", "deleted_or_spam"}

# If True, any email with a negative event in ANY campaign is removed from ALL lists.
# If False, build_lists() skips that global recipient check.
CROSS_CAMPAIGN_SUPPRESSION: bool = False

# Output root folder; write_csvs() creates it if missing and puts every CSV
# directly inside it.
OUTPUT_ROOT: str = "data"

# True: keep CSVs with a single 'email' column.
# False: CSVs carry msg_id, email, variant and send_ts.
EMAIL_ONLY: bool = True


# ──────────────────────────────────────────────────────────────
# Data access
# ──────────────────────────────────────────────────────────────

@dataclass
class Tables:
    """Bundle of the three DataFrames produced by read_tables()."""

    # Columns selected from `campaigns`: campaign_id, name, start_date,
    # end_date, budget.
    campaigns: pd.DataFrame
    # Columns selected from `email_map`: campaign_id, msg_id, recipient,
    # variant, send_ts.
    email_map: pd.DataFrame
    # Columns selected from `event_log`: campaign_id, msg_id, event_type,
    # event_ts.
    event_log: pd.DataFrame


def read_tables() -> Tables:
    """
    Load the three source tables from their SQLite files.

    Reads `campaigns` from PATH_CAMPAIGNS_DB, `email_map` from
    PATH_EMAIL_MAP_DB and `event_log` from PATH_EMAIL_EVENTS_DB, then parses
    the date/timestamp columns. Values that cannot be parsed become NaT
    (errors="coerce") instead of raising.

    Returns
    -------
    Tables
        The three DataFrames with start_date, end_date, send_ts and event_ts
        converted through pd.to_datetime.

    Notes
    -----
    Errors raised by sqlite3 or pandas while opening or querying a database
    are not caught here and propagate to the caller.
    """

    def _load(db_path: str, sql: str) -> pd.DataFrame:
        # sqlite3's own context manager only commits/rolls back a transaction;
        # closing() is what actually releases the connection and file handle.
        with closing(sqlite3.connect(db_path)) as conn:
            return pd.read_sql(sql, conn)

    campaigns = _load(
        PATH_CAMPAIGNS_DB,
        "SELECT campaign_id, name, start_date, end_date, budget FROM campaigns",
    )
    email_map = _load(
        PATH_EMAIL_MAP_DB,
        "SELECT campaign_id, msg_id, recipient, variant, send_ts FROM email_map",
    )
    event_log = _load(
        PATH_EMAIL_EVENTS_DB,
        "SELECT campaign_id, msg_id, event_type, event_ts FROM event_log",
    )

    date_columns = (
        (campaigns, "start_date"),
        (campaigns, "end_date"),
        (email_map, "send_ts"),
        (event_log, "event_ts"),
    )
    for frame, column in date_columns:
        frame[column] = pd.to_datetime(frame[column], errors="coerce")

    return Tables(campaigns=campaigns, email_map=email_map, event_log=event_log)


# ──────────────────────────────────────────────────────────────
# Core
# ──────────────────────────────────────────────────────────────

# Characters Windows forbids in filenames: the reserved punctuation plus the
# ASCII control characters (code points 0-31, e.g. tab and newline).
INVALID_WIN_CHARS = r'[<>:"/\\|?*\x00-\x1f]'


def exact_filename(campaign_name: str) -> str:
    """
    Return 'distribution_list_<EXACT NAME>.csv' preserving the DB name.

    Only characters that Windows rejects in a filename are removed (see
    INVALID_WIN_CHARS); spaces, parentheses and every other character are
    kept as they are. Leading/trailing whitespace left over after the
    removal is trimmed.

    Parameters
    ----------
    campaign_name
        Campaign name as stored in the database.

    Returns
    -------
    str
        A bare filename (no directory part) ending in ".csv".
    """
    safe_name = re.sub(INVALID_WIN_CHARS, "", campaign_name).strip()
    return f"distribution_list_{safe_name}.csv"


def compute_negative_msg_ids(
    event_log: pd.DataFrame,
    negative_events: Optional[Set[str]] = None,
) -> pd.Series:
    """
    Count negative events per message.

    Parameters
    ----------
    event_log
        Event rows; the `msg_id` and `event_type` columns are used.
    negative_events
        Event types that count as negative. Defaults to the module-level
        NEGATIVE_EVENTS when omitted.

    Returns
    -------
    pd.Series
        Integer Series named "neg_count", indexed by msg_id, holding the
        number of negative events for each message that has at least one.
        When there are none the Series is empty but keeps the same name, so
        it can still be merged into a DataFrame.
    """
    if negative_events is None:
        negative_events = NEGATIVE_EVENTS

    neg = event_log[event_log["event_type"].isin(negative_events)]
    if neg.empty:
        # DataFrame.merge refuses an unnamed Series, so the empty result must
        # carry the same name (and index name) as the populated one.
        return pd.Series(
            [],
            index=pd.Index(neg["msg_id"], name="msg_id"),
            dtype="int64",
            name="neg_count",
        )
    return neg.groupby("msg_id")["event_type"].size().rename("neg_count")


def build_lists(t: Tables) -> Dict[str, pd.DataFrame]:
    """
    Build the distribution list of every campaign present in `t.email_map`.

    A recipient is suppressed when ANY message sent to them carries a
    negative event: within the same campaign by default, or in any campaign
    when CROSS_CAMPAIGN_SUPPRESSION is True. Suppression is decided before
    the per-recipient de-duplication, so a clean later send cannot hide an
    earlier unsubscribe/complaint.

    Parameters
    ----------
    t
        Tables as returned by read_tables(); `email_map` and `event_log`
        are used.

    Returns
    -------
    Dict[str, pd.DataFrame]
        campaign_id → list of eligible recipients, one row per recipient
        (the most recent send). With EMAIL_ONLY the frame has a single
        `email` column, otherwise `msg_id`, `email`, `variant`, `send_ts`.
    """
    # rename() guarantees a named Series, which DataFrame.merge requires.
    neg_counts = compute_negative_msg_ids(t.event_log).rename("neg_count")  # msg_id → count
    em = t.email_map.merge(neg_counts, left_on="msg_id", right_index=True, how="left")
    # Messages without negative events have NaN here, and NaN > 0 is False.
    em["has_negative"] = em["neg_count"].gt(0)

    # Scope of a negative event: the recipient everywhere, or the recipient
    # within the campaign the message belongs to.
    if CROSS_CAMPAIGN_SUPPRESSION:
        scope = ["recipient"]
    else:
        scope = ["campaign_id", "recipient"]

    suppressed = em.loc[em["has_negative"], scope].drop_duplicates()
    # `suppressed` is unique on `scope`, so this left merge never duplicates
    # rows; the indicator column tells which rows found a match.
    em = em.merge(suppressed, on=scope, how="left", indicator="_suppression")
    em["eligible"] = em["_suppression"] != "both"
    em = em.drop(columns="_suppression")

    # keep last send per (campaign_id, recipient)
    em = em.sort_values(["campaign_id", "recipient", "send_ts"], ascending=[True, True, False])
    em = em.drop_duplicates(subset=["campaign_id", "recipient"], keep="first")

    out: Dict[str, pd.DataFrame] = {}
    for cid, grp in em.groupby("campaign_id", sort=False):
        elig = grp.loc[grp["eligible"]]
        if EMAIL_ONLY:
            df = elig[["recipient"]].rename(columns={"recipient": "email"}).drop_duplicates()
        else:
            df = elig[["msg_id", "recipient", "variant", "send_ts"]].rename(columns={"recipient": "email"})
        out[cid] = df.reset_index(drop=True)
    return out


def write_csvs(lists: Dict[str, pd.DataFrame], campaigns: pd.DataFrame) -> pd.DataFrame:
    """
    Write one CSV per campaign into OUTPUT_ROOT and return a summary table.

    The filename comes from exact_filename(<campaign name>). A campaign whose
    id is missing from `campaigns`, or whose name is NULL, falls back to its
    id. If two campaigns would produce the same filename (compared
    case-insensitively, since Windows filesystems usually are), the later one
    gets " [<campaign_id>]" appended to the name so that no list overwrites
    another.

    Parameters
    ----------
    lists
        campaign_id → DataFrame to write, as returned by build_lists().
    campaigns
        Campaign table; the `campaign_id` and `name` columns are used.

    Returns
    -------
    pd.DataFrame
        One row per written file with the columns campaign_id,
        campaign_name, file and num_emails (present even when `lists` is
        empty).
    """
    os.makedirs(OUTPUT_ROOT, exist_ok=True)

    # campaign_id → name, built once; the first row wins for a repeated id.
    names = (
        campaigns[["campaign_id", "name"]]
        .drop_duplicates(subset="campaign_id", keep="first")
        .set_index("campaign_id")["name"]
        .to_dict()
    )

    used_names: Set[str] = set()
    rows: List[dict] = []
    for cid, df in lists.items():
        cname = names.get(cid)
        if cname is None or pd.isna(cname):
            cname = cid
        cname = str(cname)

        fname = exact_filename(cname)
        if fname.casefold() in used_names:
            # Same (sanitized) name as an earlier campaign: disambiguate
            # with the id instead of silently overwriting that file.
            fname = exact_filename(f"{cname} [{cid}]")
        used_names.add(fname.casefold())

        # exact_filename() strips path separators, so the file always lands
        # directly in OUTPUT_ROOT, which already exists at this point.
        fpath = os.path.join(OUTPUT_ROOT, fname)
        df.to_csv(fpath, index=False)

        rows.append(
            {"campaign_id": cid, "campaign_name": cname, "file": fpath, "num_emails": int(df.shape[0])}
        )
    return pd.DataFrame(rows, columns=["campaign_id", "campaign_name", "file", "num_emails"])


def run() -> pd.DataFrame:
    """
    Execute the whole pipeline: read the tables, build the per-campaign
    lists, write the CSV files and return the summary produced by
    write_csvs().
    """
    tables = read_tables()
    per_campaign = build_lists(tables)
    return write_csvs(per_campaign, tables.campaigns)


if __name__ == "__main__":
    # Script / notebook-cell entry point: run the pipeline and keep the
    # summary bound to a module-level name so it can be inspected afterwards.
    summary_df = run()
    # Print the summary as plain text without the index column.
    print(summary_df.to_string(index=False))



                         campaign_id                         campaign_name                                                             file  num_emails
0020ae0d-ae0c-4411-8d7e-8a679a693406     Zero-Interest Medical Loan Scheme     data\distribution_list_Zero-Interest Medical Loan Scheme.csv         266
01810b99-4649-4efb-8a63-f36facf9774a             Visa Debit Card (Classic)             data\distribution_list_Visa Debit Card (Classic).csv         311
08597592-a5a1-4930-acd5-2a55f01dd833                Green Financing Scheme                data\distribution_list_Green Financing Scheme.csv         263
0cda546d-c332-4b80-b34d-89b304ba2517             MoneyGrow Savings Account             data\distribution_list_MoneyGrow Savings Account.csv         235
0eb1e7b2-ce22-4426-af69-459f13542104      Savings Account with Cheque Book      data\distribution_list_Savings Account with Cheque Book.csv          63
0fc2b9fe-0a9f-4979-88ff-b98925d11a89                       Savings Account              