# Running BigQuery ML with Transformations

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
sb.set()

### The Data

In [4]:
%load_ext google.cloud.bigquery

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


In [9]:
%%bigquery earnings_data
SELECT *
FROM `crazy-hippo-01.clv.earnings_per_year`

AttributeError: 'BigQueryReadGrpcTransport' object has no attribute 'channel'

In [21]:
earnings_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,Private,297847,9th,5,Married-civ-spouse,Other-service,Wife,Black,Female,3411,0,34,United-States,<=50K
1,72,Private,74141,9th,5,Married-civ-spouse,Exec-managerial,Wife,Asian-Pac-Islander,Female,0,0,48,United-States,>50K
2,45,Private,178215,9th,5,Married-civ-spouse,Machine-op-inspct,Wife,White,Female,0,0,40,United-States,>50K
3,31,Private,86958,9th,5,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
4,55,Private,176012,9th,5,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,23,United-States,<=50K


### Training a model in BQML

#### Using Logistic Regression and adding a transformation to the data.

In [22]:
%%bigquery 
CREATE OR REPLACE MODEL clv.earnings_model
TRANSFORM(ML.FEATURE_CROSS(STRUCT(marital_status, relationship)) as cross_relationship,
            * EXCEPT(fnlwgt))
OPTIONS(input_label_cols=['income'], model_type='logistic_reg')
AS
SELECT *
FROM
  `crazy-hippo-01.clv.earnings_per_year`

The transformation step is stored within the model and will be applied to the data when doing predictions.

#### Use the <b>ML.EVALUATE</b> function to evaluate model metrics. 

In [18]:
%%bigquery
CREATE OR REPLACE TABLE clv.earnings_evaluation AS (
SELECT
  *
FROM
  ML.EVALUATE(MODEL `crazy-hippo-01.clv.earnings_model`)
)

#### Let us generate som samples to predit on

In [13]:
%%bigquery
CREATE or REPLACE TABLE `clv.prediction_sample`
AS (
SELECT *
FROM `crazy-hippo-01.clv.earnings_per_year`
WHERE RAND() < 0.0005 
)

#### Loading data from BQ and see how the data looks. 

In [14]:
%%bigquery prediction_data
SELECT * 
FROM `crazy-hippo-01.clv.prediction_sample`

In [15]:
prediction_data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,32,Private,259719,Some-college,10,Divorced,Handlers-cleaners,Unmarried,Black,Male,0,0,40,Nicaragua,<=50K
1,42,Private,222011,Some-college,10,Divorced,Sales,Unmarried,White,Female,0,0,43,United-States,<=50K
2,40,Private,106698,Assoc-acdm,12,Divorced,Transport-moving,Unmarried,White,Female,0,0,40,United-States,<=50K
3,40,Private,141537,10th,6,Divorced,Machine-op-inspct,Not-in-family,Black,Female,0,0,40,United-States,<=50K
4,40,Local-gov,183765,Assoc-acdm,12,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,>50K
5,20,Self-emp-inc,95997,HS-grad,9,Never-married,Farming-fishing,Own-child,White,Male,0,0,70,United-States,<=50K
6,22,Private,139190,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,0,0,50,United-States,<=50K
7,19,Private,87497,11th,7,Never-married,Transport-moving,Other-relative,White,Male,0,0,10,United-States,<=50K
8,20,Private,375698,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,15,United-States,<=50K
9,24,Private,112854,Some-college,10,Never-married,Tech-support,Not-in-family,White,Female,0,0,16,United-States,<=50K


#### Batch Predictions

In [12]:
%%bigquery
CREATE TABLE clv.earnings_predictions (
SELECT *
FROM
  ML.PREDICT(MODEL `crazy-hippo-01.clv.earnings_model`,
    (
    SELECT
      *
    FROM
      `crazy-hippo-01.clv.prediction_sample`))
)

Unnamed: 0,predicted_income,predicted_income_probs,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,<=50K,"[{'label': ' >50K', 'prob': 0.0064679070900027...",18,?,113185,11th,7,Never-married,?,Own-child,White,Male,0,0,25,United-States,<=50K
1,<=50K,"[{'label': ' >50K', 'prob': 0.0130025858367740...",22,Private,267174,HS-grad,9,Never-married,Handlers-cleaners,Own-child,Black,Male,0,0,40,United-States,<=50K
2,<=50K,"[{'label': ' >50K', 'prob': 0.0352982720717686...",27,Self-emp-not-inc,107846,HS-grad,9,Never-married,Protective-serv,Not-in-family,White,Male,0,0,30,United-States,<=50K
3,>50K,"[{'label': ' >50K', 'prob': 0.519380971495562}...",81,Self-emp-not-inc,136063,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,30,United-States,<=50K
4,<=50K,"[{'label': ' >50K', 'prob': 0.1626030121338418...",58,Private,234213,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Male,14344,0,48,United-States,>50K
5,>50K,"[{'label': ' >50K', 'prob': 0.8122983535348138...",47,Local-gov,36169,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,60,United-States,>50K
6,<=50K,"[{'label': ' >50K', 'prob': 0.1808120551673187...",27,Private,128730,Assoc-voc,11,Never-married,Exec-managerial,Not-in-family,White,Male,10520,0,65,Greece,>50K
7,<=50K,"[{'label': ' >50K', 'prob': 0.1679548620335508...",43,Federal-gov,92775,Assoc-voc,11,Divorced,Craft-repair,Not-in-family,White,Male,0,0,40,United-States,<=50K
8,<=50K,"[{'label': ' >50K', 'prob': 0.0547894170687177...",69,?,180187,Assoc-acdm,12,Widowed,?,Not-in-family,White,Female,0,0,6,Italy,<=50K
9,<=50K,"[{'label': ' >50K', 'prob': 0.4647326728675621...",62,Private,159908,Some-college,10,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,38,United-States,>50K
