# In [None]:  (Jupyter cell marker left over from the notebook export)
import pandas as pd
import numpy as np

# ------------------------------------------------------------
# Input files (expected in the same directory as this script)
# ------------------------------------------------------------
AMCE_PATH = "amce_all_groups.csv"  # AMCE table: Group, Attribute, Level, Estimate
DATA_PATH = "data.csv"             # observations to be scored

# ------------------------------------------------------------
# Settings
# ------------------------------------------------------------
# Name of the generated identifier column; IDs are numeric, 1..N.
# Ranking convention used further down: highest score = rank 1.
ID_COL = "obs_id"

# Dataset column name -> the AMCE "Attribute" name it corresponds to.
# Scores are accumulated in this order, so keep the order stable.
COL_TO_ATTRIBUTE = dict(
    ArrivalYear="Arrival",
    City="City",
    AdultEdu="Education",
    HouseholdComp="Demographics",
    Disability="Disability",
    MedicalCond="Medical",
    HH_Assets="Assets",
    DebtSituation="Debt",
    Coping="Coping",
    AssistanceHistory="Previous Assistance",
)

# Groups for which scores and rankings are produced.
GROUPS = [
    "Beneficiary",
    "Model",
    "Field",
    "Central",
]

# ------------------------------------------------------------
# Read the AMCE table and index its estimates by group
# ------------------------------------------------------------
amce = pd.read_csv(AMCE_PATH)

# Fail early when a column the scoring relies on is absent.
needed_cols = {"Group", "Attribute", "Level", "Estimate"}
missing_amce_cols = sorted(needed_cols.difference(amce.columns))
if missing_amce_cols:
    raise ValueError(
        f"amce_all_groups.csv is missing columns: {missing_amce_cols}. "
        f"Found columns: {list(amce.columns)}"
    )

# Keep only the relevant columns and work on an independent copy.
amce = amce.loc[:, ["Group", "Attribute", "Level", "Estimate"]].copy()

# The three key columns are compared as whitespace-trimmed strings so that
# numeric-looking levels (e.g. years) and padded text still match.
for key_col in ("Group", "Attribute", "Level"):
    amce[key_col] = amce[key_col].astype(str).str.strip()

# group -> {(attribute, level): estimate}; groups keep their file order.
group_lookup = {}
for g, gdf in amce.groupby("Group", sort=False):
    pairs = zip(gdf["Attribute"], gdf["Level"])
    group_lookup[g] = {
        pair: float(estimate) for pair, estimate in zip(pairs, gdf["Estimate"])
    }

# Every requested group must have at least one row in the AMCE table.
missing_groups = [g for g in GROUPS if g not in group_lookup]
if missing_groups:
    raise ValueError(
        f"These groups are missing in amce_all_groups.csv: {missing_groups}. "
        f"Found groups: {sorted(group_lookup.keys())}"
    )

# ------------------------------------------------------------
# Read the observations and give every row a unique ID
# ------------------------------------------------------------
X = pd.read_csv(DATA_PATH)

# Every mapped dataset column has to be present before scoring can start.
required_cols = list(COL_TO_ATTRIBUTE)
missing_data_cols = [c for c in required_cols if c not in X.columns]
if missing_data_cols:
    raise ValueError(
        f"data.csv is missing columns: {missing_data_cols}\n"
        f"Found columns: {list(X.columns)}"
    )

# Sequential numeric IDs 1..N, assigned in file order.
X = X.copy()
X[ID_COL] = np.arange(1, len(X) + 1, dtype=int)

# Working copy whose mapped columns are trimmed strings, comparable to the
# AMCE "Level" values. Missing-like tokens collapse to "" so that they match
# no level and therefore contribute +0 (treated as control).
missing_like = ["nan", "NaN", "None", ""]
X_norm = X.copy()
for c in required_cols:
    as_text = X_norm[c].astype(str).str.strip()
    X_norm[c] = as_text.mask(as_text.isin(missing_like), "")

# ============================================================
# Scoring
# ============================================================
def _lookup_estimate(lookup: dict, attr: str, level: str) -> float:
    """Return the AMCE estimate for (attr, level), or 0.0 when none exists.

    An exact match always wins. Only if it misses and ``level`` ends in ".0"
    is the lookup retried without that suffix: pandas stores an integer
    column that contains blanks as float64, so a year such as 2015 is
    rendered as "2015.0" and would otherwise silently score 0.
    """
    estimate = lookup.get((attr, level))
    if estimate is None and level.endswith(".0"):
        estimate = lookup.get((attr, level[:-2]))
    return 0.0 if estimate is None else float(estimate)


def score_observations_for_group(
    Xn: pd.DataFrame,
    lookup: dict,
    col_to_attribute: "dict | None" = None,
) -> pd.Series:
    """
    Score each row by summing AMCE estimates for realized (Attribute, Level).

    Parameters
    ----------
    Xn : DataFrame holding one column per key of the column->attribute
        mapping; values are compared as strings.
    lookup : {(attribute, level): estimate} for a single group.
    col_to_attribute : optional {dataset column: AMCE attribute} mapping.
        Defaults to the module-level COL_TO_ATTRIBUTE.

    Returns
    -------
    Series named "score", indexed like ``Xn``.

    If (Attribute, Level) isn't in AMCE => treated as control => +0.
    A level written as a whole-number float ("2015.0") falls back to its
    integer spelling ("2015") when the exact text has no estimate.
    """
    mapping = COL_TO_ATTRIBUTE if col_to_attribute is None else col_to_attribute
    scores = np.zeros(len(Xn), dtype=float)

    for col, attr in mapping.items():
        levels = Xn[col].to_numpy(dtype=str)
        # Resolve each distinct level once, then broadcast back to the rows.
        distinct, inverse = np.unique(levels, return_inverse=True)
        estimates = np.array(
            [_lookup_estimate(lookup, attr, str(lvl)) for lvl in distinct],
            dtype=float,
        )
        scores += estimates[inverse]

    return pd.Series(scores, index=Xn.index, name="score")

# ------------------------------------------------------------
# Score and rank the observations, once per group
# ------------------------------------------------------------
results = {}

for g in GROUPS:
    scores = score_observations_for_group(X_norm, group_lookup[g])

    # obs_id + score, rows in the original (file) order.
    tmp = pd.DataFrame(
        {
            ID_COL: X_norm[ID_COL].to_numpy(),
            "score": scores.to_numpy(),
        }
    )

    # Best score first; equal scores are ordered by ascending obs_id.
    ranked = tmp.sort_values(
        by=["score", ID_COL],
        ascending=[False, True],
        kind="mergesort",
    ).reset_index(drop=True)

    # Explicit 1-based rank following the sorted order (1 = best).
    ranked = ranked.assign(rank=np.arange(1, len(ranked) + 1, dtype=int))

    results[g] = dict(
        scores_table=tmp,                        # obs_id + score (original order)
        ranking_vector=ranked[ID_COL].tolist(),  # obs_id values, best first
        ranking_table=ranked,                    # obs_id + score + rank (sorted)
    )

# ------------------------------------------------------------
# Write the results to disk and print a short summary
# ------------------------------------------------------------
# (1) One CSV per group with obs_id, score and rank, best-ranked row first.
for g in GROUPS:
    ranking_table = results[g]["ranking_table"]
    ranking_table.to_csv(f"ranking_{g.lower()}.csv", index=False)

# (2) One wide CSV: obs_id followed by a score column per group, rows in the
#     original (unsorted) order.
score_columns = {
    f"score_{g.lower()}": results[g]["scores_table"]["score"].values
    for g in GROUPS
}
scores_wide = pd.DataFrame({ID_COL: X_norm[ID_COL].values, **score_columns})
scores_wide.to_csv("scores_all_groups.csv", index=False)

# (3) Convenience: show the ten best-ranked IDs for each group.
for g in GROUPS:
    top_ids = results[g]["ranking_vector"][:10]
    print(f"Top 10 obs_id ({g}): {top_ids}")
