# Example 1: Query a table with SQL and magic commands

In [1]:
# Install dependencies if needed:
# !pip install google-cloud-bigquery pandas

from google.cloud import bigquery
import pandas as pd
from typing import List, Optional
import numpy as np

# Initialize the BigQuery client bound to the "olist-etl-pipeline" project.
# Assumes credentials are discoverable from the environment
# (e.g. GOOGLE_APPLICATION_CREDENTIALS is set) — no key is passed explicitly.
client = bigquery.Client(project="olist-etl-pipeline")

def profile_df(df: pd.DataFrame,
               name: str = "",
               pk: Optional[List[str]] = None,
               sample_uniques: int = 20) -> None:
    """
    Print a quick exploratory profile (EDA) of one table.

    The report is emitted in this order:
      1. header with the table name and shape, followed by the first rows;
      2. column dtypes, sorted by column label;
      3. missing-value count and percentage per column, worst first;
      4. ``describe()`` summary of numeric columns (skipped if there are none);
      5. distinct-value counts of object-dtype columns (skipped if there are
         none), highest cardinality first;
      6. number of rows that duplicate an earlier row on ``pk`` (only if
         ``pk`` is given).

    Args:
        df: Table to profile.
        name: Label shown in the report header.
        pk: Column names that are expected to identify a row uniquely. When
            provided (and non-empty), duplicated key rows are counted.
        sample_uniques: Maximum number of columns listed in the unique-counts
            section.

    Returns:
        None. The report is written with ``display`` when running under
        IPython/Jupyter and with ``print`` otherwise.
    """
    # `display` is injected by IPython/Jupyter only; fall back to `print` so
    # the function is also usable from plain scripts.
    try:
        show = display  # noqa: F821 - provided by the notebook runtime
    except NameError:
        show = print

    print(f"\n{'='*80}\nTABLE: {name} — shape {df.shape}\n{'='*80}")
    show(df.head())

    # ⇢ dtypes
    print("\n--- column dtypes -----------------------------------")
    print(df.dtypes.sort_index())

    # ⇢ missing values
    print("\n--- missing values ----------------------------------")
    na = df.isna().sum().to_frame("missing_cnt")
    na["missing_pct"] = (na["missing_cnt"] / len(df) * 100).round(2)
    show(na.sort_values("missing_cnt", ascending=False))

    # ⇢ numeric summary
    # Test for emptiness rather than `.any()`: `.any()` evaluates the
    # truthiness of the column *labels*, so a lone column named 0 or ""
    # would wrongly skip the section.
    num_cols = df.select_dtypes(include=np.number).columns
    if not num_cols.empty:
        print("\n--- numeric summary --------------------------------")
        show(df[num_cols].describe().T)

    # ⇢ unique counts (categoricals / ids)
    cat_cols = df.select_dtypes(include="object").columns
    if not cat_cols.empty:
        print("\n--- unique counts ----------------------------------")
        uniq = df[cat_cols].nunique().sort_values(ascending=False)
        # Honour the caller-supplied limit instead of a hard-coded 20.
        show(uniq.to_frame("n_unique").head(sample_uniques))

    # ⇢ duplicates on PK
    if pk:
        # `duplicated` flags every occurrence after the first, so this is the
        # number of rows that would have to be dropped to make `pk` unique.
        dup_cnt = df.duplicated(subset=pk).sum()
        print(f"\n--- duplicate check on {pk} ------------------------")
        print(f"Rows violating PK: {dup_cnt}")

# Fully-qualified BigQuery table (project.dataset.table) to profile.
TABLE = "olist-etl-pipeline.staging.customers"

# Pull the whole table into a pandas DataFrame, then print its EDA report,
# checking that customer_id behaves as a primary key.
query = "SELECT * FROM `{}`".format(TABLE)
query_job = client.query(query)
df_customers = query_job.to_dataframe()
profile_df(
    df_customers,
    name="customers",
    pk=["customer_id"],
)


TABLE: customers — shape (99441, 5)


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,2201362e68992f654942dc0067c1b716,f7d7fc0a59ef4363fdce6e3aa069d498,69900,rio branco,AC
1,31dbc13addc753e210692eacaea065e4,5dbba6c01268a8ad43f79157bf4454a0,69900,rio branco,AC
2,dad907e170748a35ef4e92238b7308f3,36b1c0516f123351ffa87430416dcae5,69900,rio branco,AC
3,888d2ebe1af2a8c93c75dae5dfc23719,721d1092e1a6460c67e6a0e691d899a3,69900,rio branco,AC
4,8a0108267d9258a0ec9f74381bc9b0de,7a2dc4682890550ebe3b8befcea3d55c,69900,rio branco,AC



--- column dtypes -----------------------------------
customer_city               object
customer_id                 object
customer_state              object
customer_unique_id          object
customer_zip_code_prefix     Int64
dtype: object

--- missing values ----------------------------------


Unnamed: 0,missing_cnt,missing_pct
customer_id,0,0.0
customer_unique_id,0,0.0
customer_zip_code_prefix,0,0.0
customer_city,0,0.0
customer_state,0,0.0



--- numeric summary --------------------------------


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer_zip_code_prefix,99441.0,35137.474583,29797.938996,1003.0,11347.0,24416.0,58900.0,99990.0



--- unique counts ----------------------------------


Unnamed: 0,n_unique
customer_id,99441
customer_unique_id,96096
customer_city,4119
customer_state,27



--- duplicate check on ['customer_id'] ------------------------
Rows violating PK: 0
