Challenge lab - 1

In [11]:
%%bash
PROJECT_ID=$(gcloud config get-value project)
DATASET="challenge1"

bq --location=US mk -d \
    --description "fraud lab dataset" \
    "${PROJECT_ID}:${DATASET}" || true

BigQuery error in mk operation: Dataset 'qwiklabs-
gcp-00-46c4d2064c57:challenge1' already exists.


In [None]:
import os
PROJECT_ID = "qwiklabs-gcp-00-46c4d2064c57"
DATASET = "challenge1"
os.environ['PROJECT_ID'] = PROJECT_ID
os.environ['DATASET'] = DATASET

In [12]:
!bq load \
  --source_format=CSV \
  --skip_leading_rows=1 \
  --autodetect \
  {PROJECT_ID}:{DATASET}.fraud_raw \
  gs://labs.roitraining.com/data-to-ai-workshop/fraud_data_raw.csv

Waiting on bqjob_r43cc6b47a51eee66_0000019bc233579c_1 ... (2s) Current status: DONE   


In [27]:
from google.cloud import bigquery

client = bigquery.Client(project=PROJECT_ID)

q1 = f"""
SELECT DISTINCT Employment_Status
FROM `{PROJECT_ID}.{DATASET}.fraud_raw`
ORDER BY 1
"""
print(client.query(q1).to_dataframe())

  Employment_Status
0          Employed
1     Self-Employed
2        Unemployed


In [None]:
q2 = f"""
SELECT DISTINCT Device_Type
FROM `{PROJECT_ID}.{DATASET}.fraud_raw`
ORDER BY 1
"""
print(client.query(q2).to_dataframe())

  Device_Type
0     Desktop
1      Mobile
2      Tablet


In [None]:

#get distinct categories for Employment_Status and Device_Type
emp_query = f"""
SELECT DISTINCT Employment_Status
FROM `{PROJECT_ID}.{DATASET}.fraud_raw`
WHERE Employment_Status IS NOT NULL
"""
employment_statuses = [row.Employment_Status for row in client.query(emp_query)]

dev_query = f"""
SELECT DISTINCT Device_Type
FROM `{PROJECT_ID}.{DATASET}.fraud_raw`
WHERE Device_Type IS NOT NULL
"""
device_types = [row.Device_Type for row in client.query(dev_query)]

#for sql with dynamic one-hot encoding values, sanitize name
emp_sql = ",\n  ".join([
    f"IF(Employment_Status = '{val}', 1, 0) AS Employment_Status_{val.lower().replace('-', '_').replace(' ', '_')}"
    for val in employment_statuses
])

dev_sql = ",\n  ".join([
    f"IF(Device_Type = '{val}', 1, 0) AS Device_Type_{val.lower().replace('-', '_').replace(' ', '_')}"
    for val in device_types
])

#final sql payload
final_query = f"""
CREATE OR REPLACE TABLE `{PROJECT_ID}.{DATASET}.fraud_training_data` AS
SELECT
  * EXCEPT(Employment_Status, Device_Type, Previous_Assistance_Received, Supporting_Doc_Verified),

  -- (a) dynamic one-hot
  {emp_sql},
  {dev_sql},

  -- (b) age bins
  IF(Age BETWEEN 18 AND 24, 1, 0) AS age_18_24,
  IF(Age BETWEEN 25 AND 34, 1, 0) AS age_25_34,
  IF(Age BETWEEN 35 AND 44, 1, 0) AS age_35_44,
  IF(Age BETWEEN 45 AND 54, 1, 0) AS age_45_54,
  IF(Age BETWEEN 55 AND 64, 1, 0) AS age_55_64,
  IF(Age >= 65, 1, 0) AS age_65_plus,

  -- (c) ratio
  SAFE_DIVIDE(Income, NULLIF(Amount_Requested, 0)) AS Income_to_Amount_Requested,

  -- (d) time since previous assistance (days)
  IF(
    Previous_Assistance_Received IS TRUE AND Previous_Assistance_Date IS NOT NULL,
    DATE_DIFF(Application_Date, Previous_Assistance_Date, DAY),
    NULL
  ) AS Time_Since_Previous_Assistance,

  -- (e) boolean -> 0/1, overwrite the original field
  CAST(Previous_Assistance_Received AS INT64) AS Previous_Assistance_Received,
  CAST(Supporting_Doc_Verified AS INT64) AS Supporting_Doc_Verified

FROM `{PROJECT_ID}.{DATASET}.fraud_raw`;
"""

client.query(final_query).result()
print("fraud_training_data created")


fraud_training_data created
