# Running BigQuery ML with Transformations

### Importing Libraries

In [None]:
# Third-party libraries made available to the rest of the notebook.
# NOTE(review): none of np / pd / sb is referenced by name in the cells visible
# here, and the %%bigquery magic is used below without a visible %load_ext
# line -- presumably the environment pre-registers it; confirm on a fresh kernel.
import numpy as np
import pandas as pd
import seaborn as sb
# Called with no arguments, so seaborn's default theme settings are applied.
sb.set()

### The Data

In [1]:
%%bigquery earnings_data
-- Fetch every column of the earnings table; the result is stored in the
-- notebook variable earnings_data.
SELECT
  *
FROM
  `crazy-hippo-01.clv.earnings_per_year`

In [2]:
# Preview the first five rows of the query result.
earnings_data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,Private,297847,9th,5,Married-civ-spouse,Other-service,Wife,Black,Female,3411,0,34,United-States,<=50K
1,72,Private,74141,9th,5,Married-civ-spouse,Exec-managerial,Wife,Asian-Pac-Islander,Female,0,0,48,United-States,>50K
2,45,Private,178215,9th,5,Married-civ-spouse,Machine-op-inspct,Wife,White,Female,0,0,40,United-States,>50K
3,31,Private,86958,9th,5,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
4,55,Private,176012,9th,5,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,23,United-States,<=50K


### Training a model in BQML

#### Using Logistic Regression and adding a transformation to the data.

In [4]:
%%bigquery
-- Train a logistic-regression classifier with income as the label.
-- The model id is fully qualified so the model is created in the same project
-- that the ML.EVALUATE and ML.PREDICT cells read it from, regardless of the
-- default project configured for the BigQuery client.
CREATE OR REPLACE MODEL `crazy-hippo-01.clv.earnings_model`
TRANSFORM(
  -- Crossed feature built from marital_status and relationship.
  ML.FEATURE_CROSS(STRUCT(marital_status, relationship)) AS cross_relationship,
  -- All remaining columns (label included) pass through, except fnlwgt.
  * EXCEPT(fnlwgt)
)
OPTIONS(
  model_type = 'logistic_reg',
  input_label_cols = ['income']
)
AS
SELECT
  *
FROM
  `crazy-hippo-01.clv.earnings_per_year`

#### Use the <b>ML.EVALUATE</b> function to evaluate model metrics. 

In [5]:
%%bigquery
-- Return the evaluation metrics for the trained model
-- (precision, recall, accuracy, f1_score, log_loss, roc_auc).
SELECT * FROM ML.EVALUATE(MODEL `crazy-hippo-01.clv.earnings_model`)

Unnamed: 0,precision,recall,accuracy,f1_score,log_loss,roc_auc
0,0.702218,0.541091,0.83661,0.611214,0.348612,0.888611


#### Let us generate some samples to predict on

In [None]:
%%bigquery
-- Build a small table of rows to score with the model.
-- Rows are selected by hashing their content instead of calling RAND(), so a
-- rerun of the notebook picks the same rows while the source table is unchanged.
-- The expected sampling rate is still 5 in 10,000 (0.0005); exact duplicate rows
-- hash identically and are therefore kept or dropped together.
-- The table id is fully qualified to match the cells that read it back.
CREATE OR REPLACE TABLE `crazy-hippo-01.clv.prediction_sample`
AS (
  SELECT *
  FROM `crazy-hippo-01.clv.earnings_per_year` AS src
  -- MOD is applied before ABS so the minimum INT64 hash value cannot overflow.
  WHERE ABS(MOD(FARM_FINGERPRINT(TO_JSON_STRING(src)), 10000)) < 5
)

#### Load the sample from BQ and see how the data looks.

In [6]:
%%bigquery prediction_data
-- Read the sampled rows back; the result is stored in the notebook
-- variable prediction_data.
SELECT
  *
FROM
  `crazy-hippo-01.clv.prediction_sample`

In [7]:
# Display the rows loaded from the prediction_sample table.
prediction_data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,27,Private,160786,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,1902,40,United-States,>50K
1,46,Private,273629,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
2,36,Local-gov,61778,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,15024,0,40,United-States,>50K
3,54,Private,215990,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
4,43,Private,397280,Assoc-acdm,12,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,50,United-States,>50K
5,26,Private,103700,Some-college,10,Never-married,Tech-support,Own-child,White,Female,0,0,40,United-States,<=50K
6,48,Private,232840,Some-college,10,Widowed,Adm-clerical,Unmarried,White,Female,0,0,43,United-States,<=50K
7,54,Private,240542,Some-college,10,Divorced,Sales,Unmarried,White,Female,0,0,48,United-States,<=50K
8,32,?,30499,Bachelors,13,Divorced,?,Unmarried,White,Female,0,0,32,United-States,<=50K
9,41,Private,106627,Assoc-acdm,12,Divorced,Exec-managerial,Unmarried,Black,Female,0,0,50,United-States,<=50K


#### Batch Predictions

In [None]:
%%bigquery
-- Score every row of the sample table with the trained model.
SELECT
  *
FROM
  ML.PREDICT(
    MODEL `crazy-hippo-01.clv.earnings_model`,
    TABLE `crazy-hippo-01.clv.prediction_sample`
  )