In [None]:
# parameters
# Run-level settings, declared up front so they can be changed in one place.
# NOTE(review): the "parameters" marker suggests this cell is meant to be
# overridden by a notebook runner (e.g. papermill) — confirm.
PROJECT_ID = "nice-proposal-467718-q6"  # presumably the cloud project id — confirm
REGION = "us-west1"  # presumably the cloud region — confirm
BRONZE_PATH = "gs://meu-bucket-premier/bronze/"  # root of the Bronze layer
SILVER_PATH = "gs://meu-bucket-premier/silver/"  # root of the Silver layer
GOLD_PATH = "gs://meu-bucket-premier/gold/"  # root of the Gold layer
RUN_TS = None  # NOTE(review): looks like an optional run-timestamp override — confirm


## Camada Gold — player_stats_all_time

**Objetivo:** gerar versão final da base de estatísticas de jogadores com **CPF sintético** por jogador, cumprindo requisitos de mascaramento de dados.

**Passos:**
1. Leitura da Silver (`player_stats_all_time_silver.csv`).
2. Normalização do nome do jogador (remoção de espaços no início e no fim).
3. Geração de **CPF sintético determinístico** (mesma entrada → mesmo CPF, com dígitos verificadores válidos).
4. Criação de **`cpf_mascarado`** (mantém apenas os 3 últimos caracteres do CPF formatado — o hífen e os 2 dígitos verificadores, p. ex. `***********-42`).
5. Publicação de dois artefatos:
   - **Fato enriquecido:** `gold/epl/player_stats_all_time/player_stats_all_time_gold.csv`
   - **Dimensão de jogadores:** `gold/epl/dim_players/dim_players.csv`

> Observação: CPF é sintético, não corresponde a dados pessoais reais.


In [13]:
# ===== Gold: player_stats_all_time (synthetic CPF + masked CPF) =====
# Installs the third-party packages imported by this notebook into the
# kernel's environment. NOTE(review): versions are not pinned, so a fresh run
# may resolve different releases than the one that produced the saved outputs.
%pip install -q pandas gcsfs

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd, hashlib, re, gcsfs
from datetime import datetime, timezone

# Shared GCS filesystem client used for the Silver read and the Gold writes.
# NOTE(review): token="cloud" presumably makes gcsfs obtain credentials from
# the cloud runtime's metadata service — confirm against the gcsfs docs.
fs = gcsfs.GCSFileSystem(token="cloud")

In [3]:
# ---------- artifact locations ----------
# BUCKET is kept so that any cell still reading it keeps working; the paths
# below no longer depend on it.
BUCKET = "gs://meu-bucket-premier"

# Derive the locations from the SILVER_PATH / GOLD_PATH notebook parameters so
# that overriding them for a run actually redirects the reads and writes.
# rstrip("/") accepts the layer roots with or without a trailing slash.
# Requires the parameters cell to have been executed first.
_SILVER_ROOT = SILVER_PATH.rstrip("/")
_GOLD_ROOT = GOLD_PATH.rstrip("/")

SILVER = f"{_SILVER_ROOT}/epl/player_stats_all_time/player_stats_all_time_silver.csv"

GOLD_FACT = f"{_GOLD_ROOT}/epl/player_stats_all_time/player_stats_all_time_gold.csv"
GOLD_DIM = f"{_GOLD_ROOT}/epl/dim_players/dim_players.csv"

In [4]:
# ---------- helpers ----------
def pick(cols, *cands):
    """Return the first column name in ``cols`` that matches a candidate.

    Matching is case-insensitive and done in two passes over ``cands`` (in the
    order given):

    1. exact match against a column name;
    2. candidate appearing as a substring of a column name.

    The column is returned with its original casing; ``None`` is returned when
    nothing matches.
    """
    lowered = [name.lower() for name in cols]

    # Pass 1: exact, case-insensitive match.
    for wanted in cands:
        key = wanted.lower()
        if key in lowered:
            return cols[lowered.index(key)]

    # Pass 2: fall back to substring containment, first hit per candidate.
    for wanted in cands:
        key = wanted.lower()
        hit = next((pos for pos, name in enumerate(lowered) if key in name), None)
        if hit is not None:
            return cols[hit]

    return None

def _digits_from_hash(text: str, n: int, salt: str) -> list[int]:
    """Derive ``n`` decimal digits deterministically from ``text`` and ``salt``.

    The SHA-256 hex digest of ``salt + "::" + str(text)`` is stripped of its
    non-digit characters; the remaining digits are repeated until at least
    ``n`` are available, and the first ``n`` are returned.
    """
    payload = salt + "::" + str(text)
    digest = hashlib.sha256(payload.encode()).hexdigest()
    pool = list(map(int, re.sub(r"\D", "", digest)))
    # Double the pool until it is long enough to slice n digits from it.
    while len(pool) < n:
        pool = pool * 2
    return pool[:n]

def cpf_from_text(text: str, salt: str = "cpf-salt") -> str:
    """Build a deterministic synthetic CPF with valid check digits.

    Nine base digits are derived from ``text`` and ``salt``; the two check
    digits are then computed with the mod-11 weighting (weights 10..2 for the
    first, 11..2 for the second). The result is formatted as
    ``XXX.XXX.XXX-XX``.
    """
    digits = _digits_from_hash(text, 9, salt)

    # Each round appends one check digit computed over all digits so far.
    for top_weight in (10, 11):
        weights = range(top_weight, 1, -1)
        check = 11 - sum(d * w for d, w in zip(digits, weights)) % 11
        digits = digits + [check if check < 10 else 0]

    joined = "".join(map(str, digits))
    return f"{joined[0:3]}.{joined[3:6]}.{joined[6:9]}-{joined[9:11]}"

def mask_left(value: str, keep_right: int, mask_char="*") -> str:
    """Mask every character of ``value`` except the last ``keep_right`` ones.

    Args:
        value: Text to mask. Non-string values are returned unchanged.
        keep_right: Number of trailing *characters* (not digits — separators
            count too) left visible. Values below 0 are treated as 0 and
            values above ``len(value)`` leave the whole string visible.
        mask_char: Replacement written once per hidden character.

    Returns:
        The masked string, or ``value`` itself when it is not a string.
    """
    if not isinstance(value, str):
        return value
    # Clamp the count: with keep_right == 0 a plain value[-0:] slice would
    # return the whole string and leak the unmasked value after the mask.
    visible = max(0, min(keep_right, len(value)))
    hidden = len(value) - visible
    return mask_char * hidden + value[hidden:]

In [5]:
# ---------- read Silver ----------
# Open through a context manager so the GCS handle is closed as soon as the
# CSV has been parsed, the same way the Gold writes manage their handles.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# looks like a saved index carried over from Silver — confirm whether Gold
# should keep it.
with fs.open(SILVER, "rb") as silver_file:
    df = pd.read_csv(silver_file)
cols = list(df.columns)

In [6]:
# Preview the first rows of the Silver frame as loaded.
df.head()

Unnamed: 0,Name,Jersey Number,Club,Position,Nationality,Age,Appearances,Wins,Losses,Goals,...,Catches,Sweeper clearances,Throw outs,Goal Kicks,Yellow cards,Red cards,Fouls,Offsides,_ingestion_ts,_dataset
0,Bernd Leno,1.0,Arsenal,Goalkeeper,Germany,28.0,64,28,16,0,...,17.0,28.0,375.0,489.0,2,0,0,0.0,2025-08-13T14:39:34+00:00,player_stats_all_time
1,Matt Macey,33.0,Arsenal,Goalkeeper,England,26.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0.0,2025-08-13T14:39:34+00:00,player_stats_all_time
2,Rúnar Alex Rúnarsson,13.0,Arsenal,Goalkeeper,Iceland,25.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0.0,2025-08-13T14:39:34+00:00,player_stats_all_time
3,Héctor Bellerín,2.0,Arsenal,Defender,Spain,25.0,160,90,37,7,...,0.0,0.0,0.0,0.0,23,0,125,8.0,2025-08-13T14:39:34+00:00,player_stats_all_time
4,Kieran Tierney,3.0,Arsenal,Defender,Scotland,23.0,16,7,5,1,...,0.0,0.0,0.0,0.0,2,0,9,0.0,2025-08-13T14:39:34+00:00,player_stats_all_time


In [7]:
# find which Silver column holds the player name (candidates in priority order)
player_candidates = ("Player", "Name", "Full Name", "player_name")
col_player = pick(cols, *player_candidates)
if col_player is None:
    raise ValueError(f"Não achei coluna de jogador na Silver. Colunas: {cols}")

In [8]:
# light cleanup: cast the player column to text, then trim surrounding whitespace
# NOTE(review): the cast presumably turns missing names into literal text
# (e.g. "nan"), which would then receive a CPF like any other name — confirm
# that Silver guarantees non-null player names.
player_names = df[col_player].astype(str)
df[col_player] = player_names.str.strip()

In [9]:
# ---------- generate deterministic CPF + masked CPF ----------
# One CPF per row, derived from the player name only.
cpf_series = df[col_player].apply(cpf_from_text)
df["cpf"] = cpf_series
# keep_right counts characters of the formatted CPF, so the visible tail is
# the hyphen plus the two check digits.
df["cpf_mascarado"] = cpf_series.apply(lambda cpf: mask_left(cpf, keep_right=3))

In [10]:
# metadata: stamp every row with the UTC time of this run (second precision)
processing_ts = datetime.now(tz=timezone.utc).isoformat(timespec="seconds")
df["_processing_ts"] = processing_ts

In [11]:
# ---------- save the enriched FACT table ----------
# Written without the pandas index.
with fs.open(GOLD_FACT, "wb") as fact_file:
    df.to_csv(fact_file, index=False)
print("Gold (fato) salva em:", GOLD_FACT)

Gold (fato) salva em: gs://meu-bucket-premier/gold/epl/player_stats_all_time/player_stats_all_time_gold.csv


In [12]:
# ---------- save the unique DIM_players table (player, cpf_mascarado) ----------
# One row per distinct (player, masked CPF) pair, ordered by player name.
dim_columns = [col_player, "cpf_mascarado"]
dim = df[dim_columns].drop_duplicates()
dim = dim.sort_values(col_player)
dim = dim.reset_index(drop=True)
with fs.open(GOLD_DIM, "wb") as dim_file:
    dim.to_csv(dim_file, index=False)
print("Gold (dim_players) salva em:", GOLD_DIM)

Gold (dim_players) salva em: gs://meu-bucket-premier/gold/epl/dim_players/dim_players.csv


In [14]:
# Preview the enriched frame; the output shows the added cpf, cpf_mascarado
# and _processing_ts columns.
df.head()

Unnamed: 0,Name,Jersey Number,Club,Position,Nationality,Age,Appearances,Wins,Losses,Goals,...,Goal Kicks,Yellow cards,Red cards,Fouls,Offsides,_ingestion_ts,_dataset,cpf,cpf_mascarado,_processing_ts
0,Bernd Leno,1.0,Arsenal,Goalkeeper,Germany,28.0,64,28,16,0,...,489.0,2,0,0,0.0,2025-08-13T14:39:34+00:00,player_stats_all_time,211.407.129-42,***********-42,2025-08-13T18:59:59+00:00
1,Matt Macey,33.0,Arsenal,Goalkeeper,England,26.0,0,0,0,0,...,0.0,0,0,0,0.0,2025-08-13T14:39:34+00:00,player_stats_all_time,784.169.237-44,***********-44,2025-08-13T18:59:59+00:00
2,Rúnar Alex Rúnarsson,13.0,Arsenal,Goalkeeper,Iceland,25.0,0,0,0,0,...,0.0,0,0,0,0.0,2025-08-13T14:39:34+00:00,player_stats_all_time,458.986.891-17,***********-17,2025-08-13T18:59:59+00:00
3,Héctor Bellerín,2.0,Arsenal,Defender,Spain,25.0,160,90,37,7,...,0.0,23,0,125,8.0,2025-08-13T14:39:34+00:00,player_stats_all_time,231.620.348-50,***********-50,2025-08-13T18:59:59+00:00
4,Kieran Tierney,3.0,Arsenal,Defender,Scotland,23.0,16,7,5,1,...,0.0,2,0,9,0.0,2025-08-13T14:39:34+00:00,player_stats_all_time,226.387.468-27,***********-27,2025-08-13T18:59:59+00:00
