In [16]:
import re
import pandas as pd

def norm_name(name: str) -> str:
    if pd.isna(name): return ""
    s = str(name).lower()
    s = re.sub(r"[^a-z\s]", "", s)   # drop punctuation
    s = (s.replace(" jr","")
           .replace(" ii","")
           .replace(" iii","")
           .replace(" iv",""))
    return re.sub(r"\s+", " ", s).strip()


In [17]:
drafts = pd.read_csv("cleaned_draft_history.csv")
adps   = pd.read_csv("data/2016-2024_slim/fantasypros_adp_slim.csv")
fin    = pd.read_csv("data/2016-2024_slim/fantasypros_leaders_slim.csv")

# normalize names
for df in [drafts, adps, fin]:
    df["player_norm"] = df["Player"].map(norm_name)

In [3]:
import pandas as pd, re

# --- your existing loads (unchanged) ---
drafts = pd.read_csv("cleaned_draft_history.csv")
adps   = pd.read_csv("data/2016-2024_slim/fantasypros_adp_slim.csv")          # Year, Player, POS, Rank
fin    = pd.read_csv("data/2016-2024_slim/fantasypros_leaders_slim.csv")      # #, Player, Pos, Year, AVG, TTL

# ----------------- normalize + aliases -----------------
def norm_name(x: str) -> str:
    if pd.isna(x): return ""
    s = str(x).lower()
    s = re.sub(r"[^a-z\s]", "", s)
    s = (s.replace(" jr","")
         .replace(" sr","")            # <— add SR removal
         .replace(" ii","")
         .replace(" iii","")
         .replace(" iv",""))
    return re.sub(r"\s+"," ", s).strip()

# Map fantasypros/draft variants -> a single canonical key

ALIASES = {
    "aaron jones": "aaron jones sr",
    "kyle pitts": "kyle pitts sr",
    "deebo samuel": "deebo samuel sr",
    "robby anderson": "robbie chosen",
    "will fuller v": "william fuller v",
    "nyheim hines": "nyheim miller-hines",
    "anthony richardson": "anthony richardson sr",
    "robby chosen": "robby anderson",
    "robbie anderson": "robby anderson",
    "lev eon bell": "le veon bell",  
    "steelers dst": "pittsburgh steelers dst",
    "ravens dst": "baltimore ravens dst",
    "cowboys dst": "dallas cowboys dst",
    "49ers dst": "san francisco 49ers dst",
    "bills dst": "buffalo bills dst",
    "broncos dst": "denver broncos dst",
    "saints dst": "new orleans saints dst",
    "dolphins dst": "miami dolphins dst",
    "rams dst": "los angeles rams dst",
    "jaguars dst": "jacksonville jaguars dst",
    "chargers dst": "los angeles chargers dst",
    "patriots dst": "new england patriots dst",
    "buccaneers dst": "tampa bay buccaneers dst",
    "vikings dst": "minnesota vikings dst",
    "texans dst": "houston texans dst",
    "bears dst": "chicago bears dst",
    "colts dst": "indianapolis colts dst",
    "jets dst": "new york jets dst",
    "chiefs dst": "kansas city chiefs dst",
    "browns dst": "cleveland browns dst",
 
}


def apply_alias(n: str) -> str:
    n2 = norm_name(n)
    return ALIASES.get(n2, n2)

# ---- Draft player parsing (unchanged from your latest, abbreviated here) ----
pat = re.compile(r"^(?P<name>.+?)\s+(?P<team>[A-Za-z]{2,4})\s*,\s*(?P<pos>QB|RB|WR|TE|K|DST)$", re.I)
def parse_draft_player(s: str):
    s = str(s).strip()
    m = pat.match(s)
    if m: return m.group("name").strip(), m.group("team").upper(), m.group("pos").upper()
    if "," in s:
        left, pos = s.rsplit(",", 1)
        parts = left.rsplit(" ", 1)
        if len(parts) == 2:
            return parts[0].strip(), parts[1].upper(), pos.strip().upper()
    return s, "", ""

DST_FULL = {
    "ARI":"Arizona Cardinals D/ST","ATL":"Atlanta Falcons D/ST","BAL":"Baltimore Ravens D/ST",
    "BUF":"Buffalo Bills D/ST","CAR":"Carolina Panthers D/ST","CHI":"Chicago Bears D/ST",
    "CIN":"Cincinnati Bengals D/ST","CLE":"Cleveland Browns D/ST","DAL":"Dallas Cowboys D/ST",
    "DEN":"Denver Broncos D/ST","DET":"Detroit Lions D/ST","GB":"Green Bay Packers D/ST","GNB":"Green Bay Packers D/ST",
    "HOU":"Houston Texans D/ST","IND":"Indianapolis Colts D/ST","JAX":"Jacksonville Jaguars D/ST","JAC":"Jacksonville Jaguars D/ST",
    "KC":"Kansas City Chiefs D/ST","LAC":"Los Angeles Chargers D/ST","LAR":"Los Angeles Rams D/ST",
    "LV":"Las Vegas Raiders D/ST","LVR":"Las Vegas Raiders D/ST","MIA":"Miami Dolphins D/ST",
    "MIN":"Minnesota Vikings D/ST","NE":"New England Patriots D/ST","NO":"New Orleans Saints D/ST","NOR":"New Orleans Saints D/ST",
    "NYG":"New York Giants D/ST","NYJ":"New York Jets D/ST","PHI":"Philadelphia Eagles D/ST",
    "PIT":"Pittsburgh Steelers D/ST","SEA":"Seattle Seahawks D/ST","SF":"San Francisco 49ers D/ST","SFO":"San Francisco 49ers D/ST",
    "TB":"Tampa Bay Buccaneers D/ST","TEN":"Tennessee Titans D/ST","WAS":"Washington Commanders D/ST","WSH":"Washington Commanders D/ST",
    "OAK":"Oakland Raiders D/ST","SD":"San Diego Chargers D/ST","STL":"St. Louis Rams D/ST","LA":"Los Angeles Rams D/ST",
}

# draft-side match name (handles DSTs)
name_team_pos = drafts["Player"].map(parse_draft_player)
drafts["player_name_raw"] = name_team_pos.map(lambda t: t[0])
drafts["nfl_team_code"]  = name_team_pos.map(lambda t: t[1])
drafts["pos"]            = name_team_pos.map(lambda t: t[2])

def draft_match_name(row):
    if row["pos"] == "DST" and row["nfl_team_code"] in DST_FULL:
        return DST_FULL[row["nfl_team_code"]]
    # also handle mascot-form “Cowboys D/ST”
    if row["pos"] == "DST":
        raw = str(row["player_name_raw"])
        m = re.match(r"^\s*([A-Za-z0-9' ]+)\s+D/ST\s*$", raw, re.I)
        if m:
            masc = m.group(1).strip().lower()
            # quick map via DST_FULL values
            for full in DST_FULL.values():
                if masc in full.lower():
                    return full
    return row["player_name_raw"]

drafts["player_name"] = drafts.apply(draft_match_name, axis=1)
drafts["player_name"] = drafts["player_name"].fillna(drafts["Player"].astype(str))
drafts["player_norm"] = drafts["player_name"].map(apply_alias)

# ADP / Fin normalizations
adps = adps.rename(columns={"Rank":"ADP_Rank"})
for c in ["ADP_Rank","Year"]: adps[c] = pd.to_numeric(adps[c], errors="coerce")
adps["player_norm"] = adps["Player"].map(apply_alias)

fin  = fin.rename(columns={"#":"FinishRank","Pos":"POS","TTL":"FantasyPoints"})
for c in ["FinishRank","FantasyPoints","Year"]: fin[c] = pd.to_numeric(fin[c], errors="coerce")
fin["player_norm"] = fin["Player"].map(apply_alias)

# dedupe right tables
adps_u = (adps.sort_values(["Year","ADP_Rank"])
              .drop_duplicates(["Year","player_norm"], keep="first"))
fin_u  = (fin.sort_values(["Year","FantasyPoints"], ascending=[True, False])
             .drop_duplicates(["Year","player_norm"], keep="first"))

# merge
df = (drafts
      .merge(adps_u[["Year","player_norm","ADP_Rank"]], on=["Year","player_norm"], how="left")
      .merge(fin_u[["Year","player_norm","FinishRank","FantasyPoints"]], on=["Year","player_norm"], how="left")
)

print("Draft rows before merge:", len(drafts))
print("Rows after merge:", len(df))
print("Missing ADP_Rank:", df["ADP_Rank"].isna().sum())
print("Missing FantasyPoints:", df["FantasyPoints"].isna().sum())

# top 30 unmatched after alias fixes
df["_miss_adp"]  = df["ADP_Rank"].isna()
df["_miss_fin"]  = df["FantasyPoints"].isna()
df["_miss_any"]  = df["_miss_adp"] | df["_miss_fin"]
df["_miss_both"] = df["_miss_adp"] & df["_miss_fin"]

print("\nTop 30 unmatched (missing ADP or Finish):")
print(
    (df.loc[df["_miss_any"]]
       .groupby("player_name", dropna=False)
       .size()
       .sort_values(ascending=False)
       .head(30)
       .reset_index(name="missing_rows"))
    .to_string(index=False)
)

print("\nUnmatched BOTH (ADP & Finish) — top 30:")
print(
    (df.loc[df["_miss_both"]]
       .groupby("player_name", dropna=False)
       .size()
       .sort_values(ascending=False)
       .head(30)
       .reset_index(name="missing_both_rows"))
    .to_string(index=False)
)

# optional: spot-check a few renamed players
for nm in ["Aaron Jones", "Kyle Pitts", "Deebo Samuel", "Robby Anderson", "Will Fuller V", "Nyheim Hines", "Anthony Richardson"]:
    key = apply_alias(nm)
    print(f"\n--- {nm} (key: {key}) ---")
    print("ADP:", adps_u[adps_u["player_norm"]==key][["Year","Player","ADP_Rank"]].sort_values("Year").head(8).to_string(index=False))
    print("FIN:", fin_u [fin_u ["player_norm"]==key][["Year","Player","FinishRank","FantasyPoints"]].sort_values("Year").head(8).to_string(index=False))
    print("DRF:", drafts[drafts["player_norm"]==key][["Year","player_name","OverallPick"]].sort_values("Year").head(8).to_string(index=False))

# drop helper flags
df = df.drop(columns=["_miss_adp","_miss_fin","_miss_any","_miss_both"])


Draft rows before merge: 1448
Rows after merge: 1448
Missing ADP_Rank: 181
Missing FantasyPoints: 189

Top 30 unmatched (missing ADP or Finish):
      player_name  missing_rows
              nan            51
           Ravens             6
         Steelers             6
          Broncos             5
            49ers             5
            Bills             5
          Cowboys             5
             Rams             4
         Patriots             4
     Nyheim Hines             4
          Jaguars             4
           Saints             4
         Chargers             4
         Dolphins             4
       Buccaneers             3
            Bears             3
           Texans             3
          Vikings             3
 Larry Fitzgerald             3
Melvin Gordon III             3
     Le'Veon Bell             3
   Todd Gurley II             3
            Colts             3
           Eagles             2
           Giants             2
   Delanie Walker      