# Preparing Data (Portfolio Version)

Purpose: prepare cleaned, analysis-ready CSVs for the revenue/pricing/margin project.

Outputs are written to `data/processed/` by default (override with the `PROCESSED_DIR` or `DATA_DIR` environment variables), which keeps the repository portable.


In [None]:
from pathlib import Path
import os

# Directory layout. PROJECT_DIR is the current working directory; every other
# location defaults to a path beneath it and can be overridden through the
# environment variable of the same name.
PROJECT_DIR = Path.cwd()
DATA_DIR = Path(os.environ.get('DATA_DIR', PROJECT_DIR / 'data'))
RAW_DIR = Path(os.environ.get('RAW_DIR', DATA_DIR / 'raw'))
PROCESSED_DIR = Path(os.environ.get('PROCESSED_DIR', DATA_DIR / 'processed'))

# Create the input and output folders (including missing parents) up front;
# already-existing directories are left alone.
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)

def require_file(path: Path) -> Path:
    """Return ``path`` unchanged when something exists at that location.

    Args:
        path: Location of an expected input file.

    Returns:
        The same ``path`` object that was passed in.

    Raises:
        FileNotFoundError: If nothing exists at ``path``. The message names
            the missing path and the directory where raw CSVs are expected.
    """
    if path.exists():
        return path

    message = (
        f'Missing input file: {path}\n'
        f'Put raw CSVs under {RAW_DIR} (or set RAW_DIR/DATA_DIR env vars).'
    )
    raise FileNotFoundError(message)

def save_csv(df, filename: str) -> Path:
    """Write ``df`` as a CSV named ``filename`` inside ``PROCESSED_DIR``.

    The frame's index is not written to the file.

    Args:
        df: Object exposing a ``to_csv`` method (a pandas DataFrame in this
            notebook).
        filename: File name to create under ``PROCESSED_DIR``.

    Returns:
        The full path of the file that was written.
    """
    destination = PROCESSED_DIR.joinpath(filename)
    df.to_csv(destination, index=False)
    return destination


In [None]:
import pandas as pd

## Accounts

Load accounts and keep analysis-relevant columns.

In [None]:
# Locate and read the raw accounts export; require_file raises if it is absent.
ACCOUNTS_PATH = require_file(RAW_DIR.joinpath("ravenstack_accounts.csv"))
accounts_df = pd.read_csv(ACCOUNTS_PATH)

# Remove three columns from the raw table; every other column is kept as-is.
accounts = accounts_df.drop(
    labels=["account_name", "country", "referral_source"],
    axis="columns",
)

# Persist the trimmed table and report raw vs. processed shapes.
out_path = save_csv(df=accounts, filename="accounts.csv")
print(f"Wrote: {out_path} | raw={accounts_df.shape} processed={accounts.shape}")


## Churn events

Remove columns not required for churn analysis.

In [None]:
# Locate and read the raw churn events export; require_file raises if it is
# absent.
CHURN_EVENTS_PATH = require_file(RAW_DIR / "ravenstack_churn_events.csv")
churn_events_df = pd.read_csv(CHURN_EVENTS_PATH)

# drop() raises KeyError when any listed column is missing, so past this line
# all three columns are known to exist in churn_events_df.
churn_events = churn_events_df.drop(
    columns=["refund_amount_usd", "is_reactivation", "feedback_text"]
)

# Persist the trimmed table and report raw vs. processed shapes.
out_path = save_csv(churn_events, "churn_events.csv")
print(f"Wrote: {out_path} | raw={churn_events_df.shape} processed={churn_events.shape}")

# Summarise the flag from the raw frame, since the processed frame no longer
# carries it. No existence check or fallback Series is needed: the drop()
# above has already proven the column is present.
print("Reactivations:", int(churn_events_df["is_reactivation"].sum()))


## Feature usage

Drop low-signal operational columns to keep the dataset lean.

In [None]:
# Locate and read the raw feature usage export; require_file raises if it is
# absent.
FEATURE_USAGE_PATH = require_file(RAW_DIR.joinpath("ravenstack_feature_usage.csv"))
feature_usage_df = pd.read_csv(FEATURE_USAGE_PATH)

# Remove three columns from the raw table; every other column is kept as-is.
feature_usage = feature_usage_df.drop(
    labels=["usage_duration_secs", "error_count", "is_beta_feature"],
    axis="columns",
)

# Persist the trimmed table and report raw vs. processed shapes.
out_path = save_csv(df=feature_usage, filename="feature_usage.csv")
print(f"Wrote: {out_path} | raw={feature_usage_df.shape} processed={feature_usage.shape}")


## Subscriptions

Load subscriptions, persist a local copy, and compute derived pricing tables used later.

In [None]:
# Locate and read the raw subscriptions export; require_file raises if it is
# absent.
SUBSCRIPTIONS_PATH = require_file(RAW_DIR / "ravenstack_subscriptions.csv")
subscriptions_df = pd.read_csv(SUBSCRIPTIONS_PATH)

# Save canonical processed file (mirrors original behaviour)
out_path = save_csv(subscriptions_df, "subscriptions.csv")
print(f"Wrote: {out_path} | shape={subscriptions_df.shape}")
print("Unique accounts:", subscriptions_df["account_id"].nunique() if "account_id" in subscriptions_df.columns else "n/a")
print("Plan tiers:", subscriptions_df["plan_tier"].unique() if "plan_tier" in subscriptions_df.columns else "n/a")

# Derived reference tables (kept from original)
# NOTE(review): costs are paired with plan tiers purely by position, i.e. by
# the order in which each tier first appears in the raw file. Confirm that
# order matches the intended tier -> cost mapping; a different row order (or a
# tier count other than three) would mis-assign costs or raise.
cost_per_plan = pd.DataFrame({
    "plan_tier": subscriptions_df["plan_tier"].unique(),
    "cost_per_seat": [26.79, 5.38, 3.47],
})

# Per-tier totals, revenue per seat, and gross margin against cost per seat.
average_revenue_per_seat = (
    # as_index=False keeps plan_tier as a regular column so the merge below is
    # an explicit column-on-column join.
    subscriptions_df.groupby("plan_tier", as_index=False)
    .agg(
        total_seats=("seats", "sum"),
        total_mrr=("mrr_amount", "sum"),
    )
    .assign(
        average_revenue_per_seat=lambda x: (x["total_mrr"] / x["total_seats"]).round(2)
    )
    # Join against the cost table defined above (the name `cost_per_plan_df`
    # does not exist anywhere in this notebook).
    .merge(
        cost_per_plan,
        how="left",
        on="plan_tier",
    )
    .assign(
        gross_margin_pct=lambda x: (1 - (x["cost_per_seat"] / x["average_revenue_per_seat"])).round(2)
    )
)

# `display` is not imported in this notebook; presumably it is supplied by the
# IPython/Jupyter runtime.
display(cost_per_plan)
display(average_revenue_per_seat)


## Support tickets

Support tickets are used as an operational signal in analysis.

In [None]:
# Support tickets pass through unchanged: read the raw CSV (require_file
# raises if it is absent) and write it back out under the processed name.
SUPPORT_TICKETS_PATH = require_file(RAW_DIR.joinpath("ravenstack_support_tickets.csv"))
support_tickets_df = pd.read_csv(SUPPORT_TICKETS_PATH)

# Persist the table and report its shape.
out_path = save_csv(df=support_tickets_df, filename="support_tickets.csv")
print(f"Wrote: {out_path} | shape={support_tickets_df.shape}")
