In [None]:
# parameters
# Run-level settings for the notebook.
# NOTE(review): the "parameters" marker suggests these values are meant to be
# overridden by a notebook runner (e.g. papermill) -- confirm.
PROJECT_ID = "nice-proposal-467718-q6"  # presumably the GCP project id -- confirm
REGION = "us-west1"  # presumably the GCP region -- confirm
BRONZE_PATH = "gs://meu-bucket-premier/bronze/"  # GCS prefix for the bronze layer
SILVER_PATH = "gs://meu-bucket-premier/silver/"  # GCS prefix for the silver layer
GOLD_PATH = "gs://meu-bucket-premier/gold/"  # GCS prefix for the gold layer
RUN_TS = None  # no run timestamp by default; presumably supplied at execution time -- TODO confirm


In [1]:
import pandas as pd, hashlib, re
import gcsfs

# ===== CONFIG =====
BUCKET = "gs://meu-bucket-premier"

# Layer roots. The SILVER_PATH / GOLD_PATH values from the parameters cell are
# honored when they exist in the notebook namespace, so overriding them really
# redirects this notebook. When they are undefined or empty (e.g. this cell is
# run on its own), the bucket defaults are used, which yields exactly the
# previously hard-coded locations.
_SILVER_ROOT = str(globals().get("SILVER_PATH") or f"{BUCKET}/silver").rstrip("/")
_GOLD_ROOT = str(globals().get("GOLD_PATH") or f"{BUCKET}/gold").rstrip("/")

# Silver inputs -- adjust to the real locations of your Silver layer:
SILVER_PLAYERS = f"{_SILVER_ROOT}/epl/player_stats_all_time/player_stats_all_time_silver.csv"
SILVER_TEAMS   = f"{_SILVER_ROOT}/epl/teams_players_1992_2024/teams_players_1992_2024_silver.csv"

# Gold outputs:
GOLD_PLAYERS_OUT = f"{_GOLD_ROOT}/epl/players_with_cpf/players_with_cpf.csv"
GOLD_TEAMS_OUT   = f"{_GOLD_ROOT}/epl/teams_with_cnpj/teams_with_cnpj.csv"

# Single GCS filesystem client reused by every fs.open(...) read and write below.
# NOTE(review): token="cloud" presumably selects credentials provided by the
# cloud runtime -- confirm against the gcsfs documentation.
fs = gcsfs.GCSFileSystem(token="cloud")

# ===== Helpers determinísticos =====
def _digits_from_hash(text: str, n: int, salt: str) -> list[int]:
    h = hashlib.sha256((salt + "::" + str(text)).encode()).hexdigest()
    digs = [int(ch) for ch in re.sub(r"\D","",h)]
    # recicla se faltar
    while len(digs) < n: digs += digs
    return digs[:n]

def cpf_from_text(text: str, salt: str = "cpf-salt") -> str:
    """Build a deterministic CPF-formatted string (``DDD.DDD.DDD-DD``) from ``text``.

    Nine base digits come from ``_digits_from_hash(text, 9, salt)``. Two check
    digits are then appended, each computed as ``(weighted_sum * 10) % 11``
    with a result of 10 mapped to 0. The weights run from ``len(digits) + 1``
    down to 2, i.e. 10..2 for the first check digit and 11..2 for the second.
    """
    def _check_digit(digits):
        # Descending weights ending at 2, one per digit.
        weights = range(len(digits) + 1, 1, -1)
        total = sum(d * w for d, w in zip(digits, weights))
        remainder = (total * 10) % 11
        return 0 if remainder == 10 else remainder

    base = _digits_from_hash(text, 9, salt)
    first = _check_digit(base)
    second = _check_digit(base + [first])

    # Every element is a single decimal digit, so slicing the joined string
    # reproduces the grouped layout.
    flat = "".join(str(d) for d in base + [first, second])
    return f"{flat[0:3]}.{flat[3:6]}.{flat[6:9]}-{flat[9:11]}"

def cnpj_from_text(text: str, salt: str = "cnpj-salt") -> str:
    """Build a deterministic CNPJ-formatted string (``DD.DDD.DDD/DDDD-DD``) from ``text``.

    Twelve base digits come from ``_digits_from_hash(text, 12, salt)``. Each of
    the two check digits is ``11 - (weighted_sum % 11)``, with any result of 10
    or more mapped to 0. The second check digit uses the first weight list
    prefixed with 6 and includes the first check digit in its input.
    """
    first_weights = [5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2]
    second_weights = [6] + first_weights

    def _check_digit(digits, weights):
        total = sum(d * w for d, w in zip(digits, weights))
        candidate = 11 - (total % 11)
        return candidate if candidate < 10 else 0

    base = _digits_from_hash(text, 12, salt)
    first = _check_digit(base, first_weights)
    second = _check_digit(base + [first], second_weights)

    # Every element is a single decimal digit, so slicing the joined string
    # reproduces the grouped layout.
    flat = "".join(str(d) for d in base + [first, second])
    return f"{flat[0:2]}.{flat[2:5]}.{flat[5:8]}/{flat[8:12]}-{flat[12:14]}"

def mask_left(value: str, keep_right: int, mask_char="*") -> str:
    """Mask every character of ``value`` except the last ``keep_right`` ones.

    Args:
        value: Text to mask. Non-string values (e.g. NaN/None) are returned
            unchanged.
        keep_right: Number of trailing characters left visible. Values below 0
            are treated as 0 and values above ``len(value)`` as ``len(value)``.
        mask_char: Replacement used for each hidden character.

    Returns:
        The masked string. All characters count, including separators such as
        ``.``, ``-`` and ``/``.
    """
    if not isinstance(value, str):
        return value
    # Clamp so keep_right=0 masks everything: value[-0:] would be the whole
    # string and leak the unmasked value after the mask.
    visible_len = max(0, min(keep_right, len(value)))
    hidden_len = len(value) - visible_len
    return mask_char * hidden_len + value[hidden_len:]

# ===== Detectores de coluna (jogador/time) =====
def pick(cols, *cands):
    """Return the first column label matching one of the candidate names.

    Matching is case-insensitive and runs in two passes:
      1. exact match, trying the candidates in the order given;
      2. substring match (candidate contained in the label), again in
         candidate order, returning the first label that contains it.

    Labels and candidates are compared via ``str()``, so non-string column
    labels (e.g. integers) no longer raise ``AttributeError``.

    Args:
        cols: Iterable of column labels (e.g. ``DataFrame.columns``).
        *cands: Candidate names, in priority order.

    Returns:
        The original label (unchanged case/type), or ``None`` if nothing matches.
    """
    labels = list(cols)
    lowered = [str(label).lower() for label in labels]
    wanted = [str(cand).lower() for cand in cands]

    # Pass 1: exact matches win over any substring match.
    for target in wanted:
        if target in lowered:
            return labels[lowered.index(target)]

    # Pass 2: fall back to substring containment.
    for target in wanted:
        for label, low in zip(labels, lowered):
            if target in low:
                return label
    return None

# ===== 1) Players: generate CPF and mask it =====
# Context managers close the GCS handles deterministically instead of leaving
# them to garbage collection.
with fs.open(SILVER_PLAYERS, "rb") as players_src:
    players = pd.read_csv(players_src)
col_player = pick(players.columns, "player", "player_name", "name", "athlete")
if col_player is None:
    raise ValueError(f"Não achei coluna de jogador em {SILVER_PLAYERS}. Colunas: {players.columns.tolist()}")

# NOTE(review): astype(str) turns missing names into the literal "nan", so all
# rows with a missing player name share one CPF -- confirm this is acceptable.
players["cpf"] = players[col_player].astype(str).apply(cpf_from_text)
# Only the last 4 characters stay visible; separators are masked as well,
# e.g. "123.456.789-09" -> "**********9-09".
players["cpf_mascarado"] = players["cpf"].apply(lambda x: mask_left(x, keep_right=4))
# Closing the write handle is what flushes the buffered data and completes the
# upload, so the success message is only printed after the object is written.
with fs.open(GOLD_PLAYERS_OUT, "wb") as players_dst:
    players.to_csv(players_dst, index=False)
print("✅ Players com CPF salvo em:", GOLD_PLAYERS_OUT)

# ===== 2) Clubs: generate CNPJ and mask it =====
# Context managers close the GCS handles deterministically instead of leaving
# them to garbage collection.
with fs.open(SILVER_TEAMS, "rb") as teams_src:
    teams_df = pd.read_csv(teams_src)
col_team = pick(teams_df.columns, "team", "team_name", "club", "squad")
if col_team is None:
    raise ValueError(f"Não achei coluna de time em {SILVER_TEAMS}. Colunas: {teams_df.columns.tolist()}")

# NOTE(review): astype(str) turns missing names into the literal "nan", so all
# rows with a missing team name share one CNPJ -- confirm this is acceptable.
teams_df["cnpj"] = teams_df[col_team].astype(str).apply(cnpj_from_text)
# Only the last 4 characters stay visible; separators are masked as well,
# e.g. "12.345.678/0001-95" -> "**************1-95".
teams_df["cnpj_mascarado"] = teams_df["cnpj"].apply(lambda x: mask_left(x, keep_right=4))
# Closing the write handle is what flushes the buffered data and completes the
# upload, so the success message is only printed after the object is written.
with fs.open(GOLD_TEAMS_OUT, "wb") as teams_dst:
    teams_df.to_csv(teams_dst, index=False)
print("✅ Times com CNPJ salvo em:", GOLD_TEAMS_OUT)


✅ Players com CPF salvo em: gs://meu-bucket-premier/gold/epl/players_with_cpf/players_with_cpf.csv
✅ Times com CNPJ salvo em: gs://meu-bucket-premier/gold/epl/teams_with_cnpj/teams_with_cnpj.csv
