In [None]:
# parameters
# Run-level settings for this notebook (presumably the cell an orchestrator such as
# papermill overrides — TODO confirm how the notebook is launched).
PROJECT_ID = "nice-proposal-467718-q6"  # presumably the GCP project id — confirm
REGION = "us-west1"  # presumably the GCP region — confirm
BRONZE_PATH = "gs://meu-bucket-premier/bronze/"  # GCS prefix of the bronze layer
SILVER_PATH = "gs://meu-bucket-premier/silver/"  # GCS prefix of the silver layer
GOLD_PATH = "gs://meu-bucket-premier/gold/"  # GCS prefix of the gold layer
RUN_TS = None  # presumably an optional run timestamp; defaults to None


## Camada Gold — teams_players_1992_2024

**Objetivo:** publicar uma versão final da base de times/jogadores com **mascaramento de CNPJ** por clube, pronta para consumo.

**Passos:**
1. Leitura da Silver (`teams_players_1992_2024_silver.csv`).
2. Normalização do nome do time.
3. Geração de **CNPJ sintético determinístico** por clube (com dígitos verificadores).
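4. Criação de **`cnpj_mascarado`** (mantém visíveis apenas os 4 últimos caracteres do CNPJ formatado, p. ex. `**************6-13`; os demais caracteres, inclusive a pontuação, são substituídos por `*`).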
5. Publicação de dois artefatos:
   - **Fato enriquecido:** `gold/epl/teams_players_1992_2024/teams_players_1992_2024_gold.csv`
   - **Dimensão de clubes:** `gold/epl/dim_teams/dim_teams.csv`

> Observação: CNPJ é **sintético**, gerado a partir do nome do clube (sem PII real), cumprindo a prática de mascaramento/privacidade.


In [20]:
# ===== Gold: teams_players_1992_2024 (synthetic CNPJ + masked) =====
# Installs the third-party packages imported by this notebook into the kernel's environment.
# NOTE(review): versions are unpinned — consider pinning them for reproducible runs.
%pip install -q pandas gcsfs

Note: you may need to restart the kernel to use updated packages.


In [21]:
import hashlib
import re
from datetime import datetime, timezone

import gcsfs
import pandas as pd

# Single GCS filesystem handle shared by the read and write cells of this notebook.
# NOTE(review): token="cloud" presumably resolves credentials from the runtime's
# metadata service — confirm against the gcsfs documentation.
fs = gcsfs.GCSFileSystem(token="cloud")

In [22]:
# Kept for backward compatibility with any cell that still refers to the bucket root.
BUCKET = "gs://meu-bucket-premier"

# Layer roots come from the notebook parameters (SILVER_PATH / GOLD_PATH) so that
# overriding those parameters actually redirects this notebook's input and outputs.
# Trailing slashes are trimmed to avoid a doubled "//" inside the object names.
# With the default parameter values the resulting paths are unchanged.
_SILVER_ROOT = SILVER_PATH.rstrip("/")
_GOLD_ROOT = GOLD_PATH.rstrip("/")

# Input: Silver CSV for the teams/players dataset.
SILVER = f"{_SILVER_ROOT}/epl/teams_players_1992_2024/teams_players_1992_2024_silver.csv"

# Outputs: enriched fact table and the per-club dimension.
GOLD_FACT = f"{_GOLD_ROOT}/epl/teams_players_1992_2024/teams_players_1992_2024_gold.csv"
GOLD_DIM  = f"{_GOLD_ROOT}/epl/dim_teams/dim_teams.csv"

In [23]:
# ---------- helpers ----------
def pick(cols, *cands):
    """Return the first column in ``cols`` that matches one of ``cands``.

    Matching is case-insensitive and done in two passes over the candidates,
    in the order given:

    1. exact match of the whole column name;
    2. candidate contained anywhere inside a column name.

    Args:
        cols: indexable sequence of column names.
        *cands: candidate names, highest priority first.

    Returns:
        The matching entry of ``cols`` with its original casing, or ``None``
        when no candidate matches in either pass.
    """
    lowered = [name.lower() for name in cols]

    # Pass 1: whole-name equality.
    for cand in cands:
        needle = cand.lower()
        if needle in lowered:
            return cols[lowered.index(needle)]

    # Pass 2: substring containment, first column that contains the candidate.
    for cand in cands:
        needle = cand.lower()
        hit = next((pos for pos, name in enumerate(lowered) if needle in name), None)
        if hit is not None:
            return cols[hit]

    return None

def _digits_from_hash(text: str, n: int, salt: str) -> list[int]:
    h = hashlib.sha256((salt + "::" + str(text)).encode()).hexdigest()
    digs = [int(ch) for ch in re.sub(r"\D","", h)]
    while len(digs) < n:
        digs += digs
    return digs[:n]

def cnpj_from_text(text: str, salt: str = "cnpj-salt") -> str:
    """Build a deterministic synthetic CNPJ, with valid check digits, from ``text``.

    Twelve base digits are taken from ``_digits_from_hash(text, 12, salt)``; the
    two check digits are then computed with the weighted modulo-11 rule and the
    fourteen digits are formatted as ``NN.NNN.NNN/NNNN-NN``.

    Args:
        text: seed value (the club name in this notebook).
        salt: salt forwarded to the digit derivation.

    Returns:
        The formatted CNPJ string.
    """
    base_weights = (5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2)

    def _check_digit(digits, weights):
        # 11 minus the weighted sum modulo 11; results of 10 or 11 collapse to 0.
        candidate = 11 - sum(d * w for d, w in zip(digits, weights)) % 11
        return candidate if candidate < 10 else 0

    digits = list(_digits_from_hash(text, 12, salt))
    digits.append(_check_digit(digits, base_weights))
    # The second check digit covers the first one too, hence the extra leading weight.
    digits.append(_check_digit(digits, (6,) + base_weights))

    flat = "".join(str(d) for d in digits)
    return f"{flat[0:2]}.{flat[2:5]}.{flat[5:8]}/{flat[8:12]}-{flat[12:14]}"

def mask_left(value: str, keep_right: int, mask_char="*") -> str:
    """Mask every character of ``value`` except the last ``keep_right`` ones.

    All characters count, punctuation included, so the result has the same
    length as the input when ``mask_char`` is a single character.

    Args:
        value: string to mask; non-string values are returned untouched.
        keep_right: number of trailing characters left visible. Values below
            zero are treated as zero and values above ``len(value)`` leave the
            string fully visible.
        mask_char: replacement used for each hidden character.

    Returns:
        The masked string, or ``value`` itself when it is not a ``str``.
    """
    if not isinstance(value, str):
        return value

    # Clamp first: with keep_right == 0 the slice value[-0:] is the whole string,
    # which would append the unmasked value after the mask.
    visible_len = max(0, min(keep_right, len(value)))
    hidden_len = len(value) - visible_len
    return mask_char * hidden_len + value[hidden_len:]

In [24]:
# ---------- read Silver ----------
# Open through a context manager so the GCS file handle is released as soon as
# the CSV has been parsed (it was previously left open).
with fs.open(SILVER, "rb") as silver_file:
    df = pd.read_csv(silver_file)
cols = list(df.columns)

# locate the team-name column: exact name first, substring match as fallback
col_team = pick(cols, "Team", "Team Name", "team", "team_name", "club", "squad")
if col_team is None:
    raise ValueError(f"Não achei coluna de time na Silver. Colunas: {cols}")

# light cleanup of the name: cast to str, trim surrounding whitespace, underscores -> spaces
# NOTE(review): astype(str) presumably turns missing names into the literal "nan" — confirm
# the Silver layer guarantees the column is populated.
df[col_team] = df[col_team].astype(str).str.strip().str.replace("_", " ", regex=False)

In [25]:
# ---------- derive deterministic CNPJ + masked variant ----------
# One synthetic CNPJ per row, computed from the cleaned team name.
df["cnpj"] = [cnpj_from_text(team) for team in df[col_team]]
# Masked copy: only the last 4 characters of the formatted CNPJ stay visible.
df["cnpj_mascarado"] = [mask_left(raw, keep_right=4) for raw in df["cnpj"]]

# metadata: UTC timestamp (second precision) of this processing run
# NOTE(review): the RUN_TS notebook parameter is not consulted here.
df["_processing_ts"] = datetime.now(tz=timezone.utc).isoformat(timespec="seconds")

In [26]:
# ---------- persist the enriched FACT table ----------
# Writes every column of df (row index omitted) as CSV to the Gold fact object.
# NOTE(review): this includes the unmasked "cnpj" column alongside "cnpj_mascarado" —
# confirm that publishing both is intended.
with fs.open(GOLD_FACT, mode="wb") as fact_sink:
    df.to_csv(fact_sink, index=False)
print("Gold (fato) salva em:", GOLD_FACT)

Gold (fato) salva em: gs://meu-bucket-premier/gold/epl/teams_players_1992_2024/teams_players_1992_2024_gold.csv


In [27]:
# ---------- persist DIM_teams: distinct (team, cnpj_mascarado) pairs ----------
# Keep only the two dimension columns, drop repeated pairs, then order by team
# name with a fresh 0..n-1 index.
dim = df.loc[:, [col_team, "cnpj_mascarado"]].drop_duplicates()
dim = dim.sort_values(by=col_team, ignore_index=True)

with fs.open(GOLD_DIM, mode="wb") as dim_sink:
    dim.to_csv(dim_sink, index=False)
print(" Gold (dim_teams) salva em:", GOLD_DIM)

 Gold (dim_teams) salva em: gs://meu-bucket-premier/gold/epl/dim_teams/dim_teams.csv


In [30]:
# Preview the first rows of the enriched frame as the cell's rich output.
df.head()

Unnamed: 0,Team Name,Team ID,Season_1992,Season_1993,Season_1994,Season_1995,Season_1996,Season_1997,Season_1998,Season_1999,...,Season_2019,Season_2020,Season_2021,Season_2022,Season_2023,Season_2024,_dataset,cnpj,cnpj_mascarado,_processing_ts
0,Middlesbrough FC,641.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,teams_players_1992_2024,51.277.397/5746-13,**************6-13,2025-08-13T18:55:55+00:00
1,Manchester City,281.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,teams_players_1992_2024,36.729.400/6840-20,**************0-20,2025-08-13T18:55:55+00:00
2,Wimbledon FC ( ),114309.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,teams_players_1992_2024,05.118.211/7568-53,**************8-53,2025-08-13T18:55:55+00:00
3,Arsenal FC,11.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,teams_players_1992_2024,35.111.489/0146-18,**************6-18,2025-08-13T18:55:55+00:00
4,Nottingham Forest,703.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,teams_players_1992_2024,29.865.471/9779-46,**************9-46,2025-08-13T18:55:55+00:00
