# Running BigQuery ML

### Importing Libraries

In [1]:
# Third-party libraries made available to the rest of the notebook.
import numpy as np
import pandas as pd
import seaborn as sb
# Activate seaborn's default plotting theme.
sb.set()

In [None]:
!pip install --upgrade google-cloud-bigquery

### The Data

In [1]:
%%bigquery earnings_data
-- Load the full census table into the DataFrame `earnings_data`.
-- The table path is backtick-quoted, like every other query in this notebook,
-- because the project ID contains hyphens.
SELECT
  *
FROM
  `crazy-hippo-01.earnings_ml.census_data`

Query complete after 0.01s: 100%|██████████| 1/1 [00:00<00:00, 732.76query/s] 
Downloading: 100%|██████████| 32461/32461 [00:02<00:00, 15712.32rows/s]


In [3]:
# Preview the first five rows of the downloaded census data.
earnings_data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,65,?,293385,Preschool,1,Married-civ-spouse,?,Husband,Black,Male,0,0,30,United-States,<=50K
1,64,?,140237,Preschool,1,Married-civ-spouse,?,Husband,White,Male,0,0,40,United-States,<=50K
2,52,?,248113,Preschool,1,Married-spouse-absent,?,Other-relative,White,Male,0,0,40,Mexico,<=50K
3,54,?,148657,Preschool,1,Married-civ-spouse,?,Wife,White,Female,0,0,40,Mexico,<=50K
4,39,?,362685,Preschool,1,Widowed,?,Not-in-family,White,Female,0,0,20,El-Salvador,<=50K


### Training a model in BigQuery ML

#### Using Logistic Regression for this example

In [22]:
%%bigquery
-- Train a logistic-regression classifier that predicts `income` from every
-- other column of the census table.
-- The model name is fully qualified so that it is created in the same project
-- the later ML.* queries read it from, regardless of the client's default
-- project.
CREATE OR REPLACE MODEL `crazy-hippo-01.earnings_ml.earnings_model`
OPTIONS(input_label_cols=['income'], model_type='logistic_reg')
AS
SELECT
  *
FROM
  `crazy-hippo-01.earnings_ml.census_data`

Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1270.62query/s]                        


#### The <b>ML.FEATURE_INFO</b> function allows you to see information about the input features used to train a model.

In [6]:
%%bigquery
-- List the model's input features together with their summary statistics.
SELECT
  *
FROM
  ML.FEATURE_INFO(MODEL `crazy-hippo-01.earnings_ml.earnings_model`)

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 705.22query/s]                         
Downloading: 100%|██████████| 14/14 [00:01<00:00,  8.77rows/s]


Unnamed: 0,input,min,max,mean,median,stddev,category_count,null_count
0,age,17.0,90.0,38.58001,37.0,13.667871,,0.0
1,workclass,,,,,,9.0,
2,fnlwgt,13769.0,1484705.0,189622.921468,177305.0,105574.934549,,0.0
3,education,,,,,,16.0,
4,education_num,1.0,16.0,10.074694,10.0,2.574967,,0.0
5,marital_status,,,,,,7.0,
6,occupation,,,,,,15.0,
7,relationship,,,,,,6.0,
8,race,,,,,,5.0,
9,sex,,,,,,2.0,


#### Use the <b>ML.EVALUATE</b> function to evaluate model metrics. 

In [23]:
%%bigquery
-- Report the classification metrics BigQuery ML computes for the model.
SELECT *
FROM ML.EVALUATE(MODEL `crazy-hippo-01.earnings_ml.earnings_model`)

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 388.11query/s]                          
Downloading: 100%|██████████| 1/1 [00:01<00:00,  1.55s/rows]


Unnamed: 0,precision,recall,accuracy,f1_score,log_loss,roc_auc
0,0.709845,0.540434,0.838483,0.613662,0.347407,0.8898


#### Use the <b>ML.CONFUSION_MATRIX</b> function to return a confusion matrix for the given logistic regression model and input data. 

In [24]:
%%bigquery
-- Confusion matrix of the model over the whole census table.
-- NOTE(review): this is the same table the model was trained from, so the
-- counts include training rows; confirm that is intended.
SELECT *
FROM ML.CONFUSION_MATRIX(
  MODEL `crazy-hippo-01.earnings_ml.earnings_model`,
  TABLE `crazy-hippo-01.earnings_ml.census_data`
)

Query complete after 0.00s: 100%|██████████| 4/4 [00:00<00:00, 1295.74query/s]                        
Downloading: 100%|██████████| 2/2 [00:01<00:00,  1.26rows/s]


Unnamed: 0,expected_label,___50K,__50K
0,<=50K,23053,1588
1,>50K,3522,4298


#### The <b>ML.WEIGHTS</b> function allows you to see the underlying weights used by a model during prediction.

In [10]:
%%bigquery
-- Show the weight learned for each feature; categorical features carry one
-- weight per category in `category_weights`.
SELECT
  *
FROM
  ML.WEIGHTS(MODEL `crazy-hippo-01.earnings_ml.earnings_model`)

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 538.01query/s]                          
Downloading: 100%|██████████| 15/15 [00:01<00:00,  9.48rows/s]


Unnamed: 0,processed_input,weight,category_weights
0,age,0.01989957,[]
1,workclass,,"[{'category': ' Local-gov', 'weight': -0.22264..."
2,fnlwgt,3.201723e-07,[]
3,education,,"[{'category': ' Assoc-acdm', 'weight': -0.1959..."
4,education_num,0.123992,[]
5,marital_status,,"[{'category': ' Separated', 'weight': -0.60850..."
6,occupation,,"[{'category': ' ?', 'weight': -0.4776070552043..."
7,relationship,,"[{'category': ' Wife', 'weight': 0.92879077865..."
8,race,,"[{'category': ' White', 'weight': -0.166867719..."
9,sex,,"[{'category': ' Male', 'weight': -0.0538525515..."


## Batch Predictions 

#### Let us generate some samples to predict on

In [15]:
%%bigquery
-- Build a small table of unlabeled rows (the `income` label is dropped) to run
-- batch predictions on.
-- The table name is fully qualified so it matches the path the later cells
-- read from, regardless of the client's default project.
CREATE OR REPLACE TABLE `crazy-hippo-01.earnings_ml.prediction_sample`
AS (
SELECT * EXCEPT(income)
FROM `crazy-hippo-01.earnings_ml.census_data` AS census
-- Keep roughly 1 row in 2000 (0.05%). Rows are chosen by a hash of their
-- contents rather than RAND(), which cannot be seeded, so re-running the
-- notebook reproduces the same sample.
WHERE MOD(FARM_FINGERPRINT(TO_JSON_STRING(census)), 2000) = 0
)

Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1064.36query/s]                        


#### Load the data from BigQuery and see how it looks.

In [25]:
%%bigquery prediction_data
-- Download the sampled rows into the DataFrame `prediction_data`.
SELECT
  *
FROM
  `crazy-hippo-01.earnings_ml.prediction_sample`


Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 922.84query/s] 
Downloading: 100%|██████████| 13/13 [00:01<00:00,  7.83rows/s]


In [26]:
# Show the rows downloaded from the prediction_sample table.
prediction_data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,31,Private,339482,1st-4th,2,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,Mexico
1,43,Private,193672,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
2,50,Private,228238,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
3,59,Self-emp-inc,200453,Masters,14,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States
4,36,Local-gov,241998,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1672,50,United-States
5,28,Local-gov,180271,Bachelors,13,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,65,United-States
6,30,Private,54608,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,48,United-States
7,20,Private,190227,Masters,14,Never-married,Exec-managerial,Own-child,White,Male,0,0,25,United-States
8,40,Private,111829,Masters,14,Never-married,Adm-clerical,Own-child,White,Female,0,0,40,United-States
9,26,Private,60722,Bachelors,13,Never-married,Prof-specialty,Own-child,Asian-Pac-Islander,Female,0,0,40,United-States


#### Batch Predictions

In [27]:
%%bigquery
-- Score every row of the sample table. The result carries the predicted label
-- and per-class probabilities alongside the original input columns.
SELECT *
FROM ML.PREDICT(
  MODEL `crazy-hippo-01.earnings_ml.earnings_model`,
  TABLE `crazy-hippo-01.earnings_ml.prediction_sample`
)

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 1044.66query/s]                        
Downloading: 100%|██████████| 13/13 [00:01<00:00,  9.78rows/s]


Unnamed: 0,predicted_income,predicted_income_probs,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,<=50K,"[{'label': ' >50K', 'prob': 0.0456681564716456...",31,Private,339482,1st-4th,2,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,Mexico
1,<=50K,"[{'label': ' >50K', 'prob': 0.2325395975025235...",43,Private,193672,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
2,<=50K,"[{'label': ' >50K', 'prob': 0.3166020444596405...",50,Private,228238,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
3,>50K,"[{'label': ' >50K', 'prob': 0.827978478840053}...",59,Self-emp-inc,200453,Masters,14,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States
4,>50K,"[{'label': ' >50K', 'prob': 0.800726724717813}...",36,Local-gov,241998,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1672,50,United-States
5,<=50K,"[{'label': ' >50K', 'prob': 0.4655124567469922...",28,Local-gov,180271,Bachelors,13,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,65,United-States
6,<=50K,"[{'label': ' >50K', 'prob': 0.2578465300021020...",30,Private,54608,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,48,United-States
7,<=50K,"[{'label': ' >50K', 'prob': 0.1599302900674899...",20,Private,190227,Masters,14,Never-married,Exec-managerial,Own-child,White,Male,0,0,25,United-States
8,<=50K,"[{'label': ' >50K', 'prob': 0.0941935476392710...",40,Private,111829,Masters,14,Never-married,Adm-clerical,Own-child,White,Female,0,0,40,United-States
9,<=50K,"[{'label': ' >50K', 'prob': 0.0754038692332926...",26,Private,60722,Bachelors,13,Never-married,Prof-specialty,Own-child,Asian-Pac-Islander,Female,0,0,40,United-States
