In [3]:
# Config & BigQuery
from google.cloud import bigquery

# --- Configuration: where the data lives in BigQuery ---
PROJECT_ID = "infinite-mantra-480821-v7"
DATASET_ID = "telco_churn_ds"

SOURCE_TABLE = "customers_native"
FEATURE_VIEW = "v_features_with_split"

# Client bound to the configured project; the query cells below reuse it.
bq = bigquery.Client(project=PROJECT_ID)

# Fully qualified `project.dataset.object` identifiers for the source table
# and for the feature view, built the same way for both.
src, feat_view = (
    ".".join((PROJECT_ID, DATASET_ID, object_name))
    for object_name in (SOURCE_TABLE, FEATURE_VIEW)
)

# Echo both identifiers as the cell output.
src, feat_view

('infinite-mantra-480821-v7.telco_churn_ds.customers_native',
 'infinite-mantra-480821-v7.telco_churn_ds.v_features_with_split')

In [4]:
# Create the feature view: label, numeric features, one-hot encoded
# categoricals and a train/val/test split.
#
# The view is built in three CTEs:
#   base    - label, SAFE_CAST numeric columns, upper-cased categoricals and
#             0/1 flags. The flags use `CASE WHEN <col> THEN`, so Partner,
#             PaperlessBilling and PhoneService must be BOOL in the source.
#   one_hot - one 0/1 indicator column per known category value.
#   split   - bucket in [0, 100) from FARM_FINGERPRINT(customerID), so a given
#             customerID always lands in the same split.
# The final SELECT maps buckets to 'train' (<70), 'val' (<85) and 'test'
# (the rest); the `bucket` column is kept in the view alongside `split`.
#
# Label: the previous expression used `CASE WHEN Churn THEN ...`, which only
# type-checks when Churn is BOOL, and compared the string form to 'YES', which
# a BOOL never produces. Comparing the normalised string form once is valid
# for both BOOL and 'Yes'/'No' STRING columns. NULL or unrecognised values
# still map to label 0.
sql = f"""
CREATE OR REPLACE VIEW `{feat_view}` AS
WITH base AS (
  SELECT
    customerID,

    --  Label (works for a BOOL column or a 'Yes'/'No' STRING column)
    CASE
      WHEN UPPER(TRIM(CAST(Churn AS STRING))) IN ('YES', 'TRUE') THEN 1
      ELSE 0
    END AS label,

    --  Numeric features
    SAFE_CAST(tenure AS INT64) AS tenure,
    SAFE_CAST(MonthlyCharges AS FLOAT64) AS MonthlyCharges,
    SAFE_CAST(TotalCharges AS FLOAT64) AS TotalCharges,

    -- Normalize categoricals
    UPPER(CAST(Contract AS STRING)) AS Contract,
    UPPER(CAST(InternetService AS STRING)) AS InternetService,
    UPPER(CAST(PaymentMethod AS STRING)) AS PaymentMethod,

    -- Booleans â†’ 0/1
    CASE WHEN Partner THEN 1 ELSE 0 END AS Partner_1,
    CASE WHEN PaperlessBilling THEN 1 ELSE 0 END AS Paperless_1,
    CASE WHEN PhoneService THEN 1 ELSE 0 END AS PhoneService_1
  FROM `{src}`
),

one_hot AS (
  SELECT
    customerID,
    label,
    tenure,
    MonthlyCharges,
    TotalCharges,

    --  Contract
    CASE WHEN Contract = 'MONTH-TO-MONTH' THEN 1 ELSE 0 END AS Contract_mtm,
    CASE WHEN Contract = 'ONE YEAR' THEN 1 ELSE 0 END AS Contract_1yr,
    CASE WHEN Contract = 'TWO YEAR' THEN 1 ELSE 0 END AS Contract_2yr,

    --  Internet service
    CASE WHEN InternetService = 'FIBER OPTIC' THEN 1 ELSE 0 END AS Internet_Fiber,
    CASE WHEN InternetService = 'DSL' THEN 1 ELSE 0 END AS Internet_DSL,
    CASE WHEN InternetService = 'NO' THEN 1 ELSE 0 END AS Internet_None,

    --  Payment method
    CASE WHEN PaymentMethod LIKE '%ELECTRONIC CHECK%' THEN 1 ELSE 0 END AS Pay_ElectronicCheck,
    CASE WHEN PaymentMethod LIKE '%MAILED CHECK%' THEN 1 ELSE 0 END AS Pay_MailedCheck,
    CASE WHEN PaymentMethod LIKE '%CREDIT CARD%' THEN 1 ELSE 0 END AS Pay_CreditCard,
    CASE WHEN PaymentMethod LIKE '%BANK TRANSFER%' THEN 1 ELSE 0 END AS Pay_BankTransfer,

    Partner_1,
    Paperless_1,
    PhoneService_1
  FROM base
),

split AS (
  SELECT
    *,
    MOD(ABS(FARM_FINGERPRINT(CAST(customerID AS STRING))), 100) AS bucket
  FROM one_hot
)

SELECT
  *,
  CASE
    WHEN bucket < 70 THEN 'train'
    WHEN bucket < 85 THEN 'val'
    ELSE 'test'
  END AS split
FROM split;
"""
# .result() blocks until the DDL job has finished, so the view exists before
# the confirmation is printed.
bq.query(sql).result()
print(" Feature view created:", feat_view)

 Feature view created: infinite-mantra-480821-v7.telco_churn_ds.v_features_with_split


In [5]:
# Sanity-check the split: number of rows in each of train / val / test.
split_counts_sql = f"""
SELECT split, COUNT(*) AS n
FROM `{feat_view}`
GROUP BY split
ORDER BY split
"""
split_counts_job = bq.query(split_counts_sql)
split_counts = split_counts_job.to_dataframe()

# Last expression: display the counts as the cell output.
split_counts

Unnamed: 0,split,n
0,test,1115
1,train,4949
2,val,979


In [8]:
# Label distribution (check class imbalance): number of rows per label value.
# ORDER BY keeps the row order of the displayed frame stable between runs;
# without it BigQuery does not guarantee any particular order.
bq.query(f"""
SELECT label, COUNT(*) AS n
FROM `{feat_view}`
GROUP BY label
ORDER BY label
""").to_dataframe()

Unnamed: 0,label,n
0,0,5174
1,1,1869
