## Machine Learning Models
We build a model that predicts which visitors, that visited its website on a certain day, are more likely to convert (buy something) in the next 30 days. We use the AI Notebook service of the Google Cloud Platform to develop the model.


## Deployment and getting predictions
We deploy our model so that it can be used to get predictions. We host the model in our Google Cloud Platform project and deploy our model using the AI Platform prediction service.

In [None]:
! pip install xgboost

In [21]:
# !pip install google-cloud-bigquery --user --use-feature=2020-resolver

In [24]:
# Enable the required APIs
# In order to use AI Platform, confirm that the required APIs are enabled:

!gcloud services enable ml.googleapis.com
!gcloud services enable compute.googleapis.com

In [25]:
# Create a storage bucket
BUCKET_NAME ='winged-tower-295515-bucket'

!gsutil mb gs://$BUCKET_NAME/

Creating gs://winged-tower-295515-bucket/...


In [26]:
!mkdir crystal_training

mkdir: cannot create directory ‘crystal_training’: File exists


In [27]:
!touch ./crystal_training/__init__.py

## BigQuery API Client Libraries
Setting up authentication for BigQuery

In [None]:
#Create the service account.
# https://cloud.google.com/bigquery/docs/reference/libraries#command-line

# !gcloud iam service-accounts create crystal_service_acc

In [None]:
# Grant permissions to the service account
# !gcloud projects add-iam-policy-binding project winged-tower-295515 --member="serviceAccount:crystal_service_acc@winged-tower-295515.iam.gserviceaccount.com" --role="owner"

In [None]:
# Generate the key file. 

# !gcloud iam service-accounts keys create key_file_bq_crystal.json --iam-account=crystal_service_acc@winged-tower-295515.iam.gserviceaccount.com

# Creating the Training Job
Reading the data from BigQuery and Exploring Dataset.


In [224]:
%%writefile ./crystal_training/train.py
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import  recall_score, precision_score,accuracy_score, average_precision_score

import google.cloud.bigquery.magics
google.cloud.bigquery.magics.context.use_bqstorage_api = True

# from xgboost import XGBClassifier

import pickle
from google.cloud import bigquery
from google.cloud import storage


#create the BigQuery client.
client_bq = bigquery.Client()

# Running queries

dataset_query="""
SELECT
  fullVisitorId, date, 
  visitStartTime, 
  visitNumber,
  IF(totals.transactions IS NULL, 0, 1) AS purchase,  
  
  COUNTIF(IF(totals.transactions IS NULL, FALSE, TRUE))
  OVER (
    PARTITION BY fullVisitorId 
    ORDER BY visitStartTime
    RANGE BETWEEN CURRENT ROW AND 2592000 FOLLOWING
  ) AS total_purchases_next30days,
  
  IFNULL(totals.bounces, 0) AS bounces,
  IFNULL(totals.hits, 0) AS hits,
  IFNULL(totals.pageviews, 0) AS pageviews,
  IFNULL(totals.timeOnSite, 0) AS time_on_site,
  IFNULL(totals.visits, 0) AS visits_interaction,

  device.isMobile AS is_mobile,
  trafficSource.isTrueDirect AS is_source_direct,
  IFNULL(geoNetwork.country, "") AS country,
  channelGrouping


FROM
  `bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
  _TABLE_SUFFIX BETWEEN '20160801' AND '20170731'"""


# Make an API request.
query_job = client_bq.query(dataset_query)
records = [dict(row) for row in query_job]
sample_df= pd.DataFrame(records)

# Preprocess
label= sample_df['total_purchases_next30days']>0
sample_df.insert(7,'label',label)

sample_df.is_source_direct.replace(to_replace= np.nan, value= False, inplace=True)

sample_df = pd.get_dummies(sample_df, columns=['channelGrouping'], prefix='channelGrouping_type')
sample_df.drop(columns=['channelGrouping_type_(Other)'], inplace=True)

#Split the data to trainset and testset
y= sample_df['label']
X= sample_df.drop(columns=['label','purchase','total_purchases_next30days','fullVisitorId','visitStartTime','date'], inplace=False)
X_trainVal, X_test, y_trainVal, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle= True, stratify= y)


# balance the trainingset with manual UnderSampling

Xy_trainVal_unblc = pd.concat([X_trainVal, y_trainVal], axis = 1) 
Xy_trainVal_unblc.reset_index(inplace=True, drop=True) 

pos_index = Xy_trainVal_unblc[Xy_trainVal_unblc['label']==1].index
positive_class= Xy_trainVal_unblc.iloc[pos_index].copy()
pos_len = len(positive_class)
#remove positives
Xy_trainVal_unblc.drop(pos_index, inplace=True)
neg_len = len(Xy_trainVal_unblc)

neg_coefficient = 1 

#Return a random sample of items from an axis of object.
negative_class = Xy_trainVal_unblc.sample(n= pos_len, replace='False', random_state = 42)#.reset_index(drop=True)
# balanced_df= pd.concat([positive_class, negative_class],ignore_index=True)

Xy_trainVal_blc= pd.concat([positive_class, negative_class],ignore_index=True)

# Just shuffling
Xy_trainVal_blc = shuffle(Xy_trainVal_blc,  random_state=42)
Xy_trainVal_blc = shuffle(Xy_trainVal_blc,  random_state=1)
Xy_trainVal_blc = shuffle(Xy_trainVal_blc,  random_state=5)
Xy_trainVal_blc = shuffle(Xy_trainVal_blc,  random_state=10)
Xy_trainVal_blc.reset_index(inplace=True, drop=True) 

y_trainVal_blc = Xy_trainVal_blc['label'].copy()
y_trainVal_blc = pd.DataFrame(y_trainVal_blc, columns=['label'])
X_trainVal_blc = Xy_trainVal_blc.drop(columns=['label'], inplace=False)
#end of balancing

#list of all countries in the training set
all_countries_array= X_trainVal_blc['country'].unique()
all_countries = set(all_countries_array)

# find the list of counrtries based on the number of users who have made a purchase in the next 30 days.
purchased_countries = X_trainVal_blc[y_trainVal_blc['label']==1]['country'].value_counts()
# purchased_countries is a seri type. we need its indices = countries' names 

# just top 10% countries: round(0.1*len(all_countries))
top_countries_series = purchased_countries.index[0:round(0.1*len(all_countries))]
top_countries = set(top_countries_series)

#replace other countries in trainset
other_countries_in_trainset = list(all_countries - top_countries)

X_trainVal_blc.replace(to_replace= other_countries_in_trainset, value='others', inplace=True)

#replace other countries in trainset
all_countries_array= X_test['country'].unique()
all_countries_test = set(all_countries_array)
other_countries_in_testset = list(all_countries_test - top_countries)

X_test.replace(to_replace= other_countries_in_testset, value='others', inplace=True)

X_trainVal_blc = pd.get_dummies(X_trainVal_blc, columns=['country'], prefix='country_is')
X_trainVal_blc.drop(columns=['country_is_others'], inplace=True)

X_test = pd.get_dummies(X_test, columns=['country'], prefix='country_is')
X_test.drop(columns=['country_is_others'], inplace=True)


# ML Model

y_trainVal_blc = y_trainVal_blc.astype('int')
y_test = y_test.astype('int')


#Random Forest
param_grid = {'n_estimators':[1000], 'criterion': ['gini'], 'min_samples_leaf':[10],
              'min_samples_split': [100], 'max_features':['sqrt'],
              'class_weight':['balanced'], 'random_state':[0], 'bootstrap':[True], 'oob_score':[True]}


cv_scoring = {'average_precision_PR_AUC':'average_precision', 'AUC': 'roc_auc','recall':'recall', 'precision':'precision','accuracy':'accuracy'}


grid = GridSearchCV(RandomForestClassifier(), param_grid= param_grid, cv=5, scoring= cv_scoring, 
                    refit='average_precision_PR_AUC', return_train_score=True, n_jobs= -1, verbose=1)

grid_result = grid.fit(X_trainVal_blc, y_trainVal_blc)
                         
rf_refit = grid.best_estimator_.fit(X_trainVal_blc,y_trainVal_blc)

#-----------------------------
# XGBClassifier
# param_grid = {'max_depth': [3], 'n_estimators': [1200], 'learning_rate': [ 0.1],
#               'gamma': [ 1],'min_child_weight': [1], 'subsample': [0.6],
#               'colsample_bytree': [0.6],'random_state':[0]}

# cv_scoring = {'average_precision_PR_AUC':'average_precision', 'AUC': 'roc_auc','recall':'recall', 'precision':'precision','accuracy':'accuracy'}

# xgb = XGBClassifier(objective= 'binary:logistic',seed=0)

# grid_x = GridSearchCV(xgb, param_grid= param_grid, cv=3, scoring= cv_scoring, 
#                     refit='average_precision_PR_AUC', return_train_score=True, n_jobs= -1, verbose=1)

# grid_result = grid_x.fit(X_trainVal_blc, y_trainVal_blc)
                         
# xgb_refit = grid_x.best_estimator_.fit(X_trainVal_blc,y_trainVal_blc)
#---------------------------

# Create the model file
# It is required to name the model file "model.pkl" if you are using pickle
model_filename = "model.pkl"
with open(model_filename, "wb") as model_file:
    pickle.dump(rf_refit, model_file)

# Upload the model to Cloud Storage
bucket_name = 'winged-tower-295515-bucket'
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(model_filename)
blob.upload_from_filename(model_filename)


Overwriting ./crystal_training/train.py


# Submit the training job

In [225]:
import time

# Define a timestamped job name
JOB_NAME = "crystal_training_{}".format(int(time.time()))

# JOB_NAME = "job_training_{}".format(1)

In [226]:
# Submit the training job:
!gcloud ai-platform jobs submit training $JOB_NAME \
  --job-dir gs://$BUCKET_NAME/crystal_job_dir \
  --package-path ./crystal_training \
  --module-name crystal_training.train \
  --region us-central1 \
  --runtime-version=2.3 \
  --python-version=3.5 \
  --scale-tier BASIC \
  --stream-logs \
  -- \
  --bucket-name $BUCKET_NAME

Job [crystal_training_1608014653] submitted successfully.
INFO	2020-12-15 06:44:29 +0000	service		Validating job requirements...
INFO	2020-12-15 06:44:29 +0000	service		Job creation request has been successfully validated.
INFO	2020-12-15 06:44:30 +0000	service		Job crystal_training_1608014653 is queued.
INFO	2020-12-15 06:44:31 +0000	service		Waiting for job to be provisioned.
INFO	2020-12-15 06:47:12 +0000	service		Waiting for training program to start.
INFO	2020-12-15 06:47:12 +0000	service		Job is preparing.
INFO	2020-12-15 06:52:21 +0000	master-replica-0		Running task with arguments: --cluster={"chief": ["cmle-training-master-34828f3dd9-0:2222"]} --task={"type": "chief", "index": 0} --job={
INFO	2020-12-15 06:52:21 +0000	master-replica-0		  "package_uris": ["gs://winged-tower-295515-bucket/crystal_job_dir/packages/3589a21b146763b8de17ee39e7e1033387e95bc49eff0d066f5088d4cf76ca0a/crystal_training-0.0.0.tar.gz"],
INFO	2020-12-15 06:52:21 +0000	master-replica-0		  "python_module": "cr

# Verify model file in Cloud Storage

In [227]:
!gsutil ls gs://$BUCKET_NAME/

gs://winged-tower-295515-bucket/imbalanced_learn-0.7.0-py3-none-any.whl
gs://winged-tower-295515-bucket/model.pkl
gs://winged-tower-295515-bucket/crystal_job_dir/


# Serve the model

In [230]:
MODEL_NAME = "CrystalPredictorModel"
VERSION_NAME = "crystal_predictor_{}".format(int(time.time()))

In [231]:
# Create the model in AI Platform:

!gcloud ai-platform models create $MODEL_NAME --regions us-central1

Using endpoint [https://ml.googleapis.com/]
Created ml engine model [projects/winged-tower-295515/models/CrystalPredictorModel].


In [235]:
# Create a version that points to your model file in Cloud Storage:

!gcloud ai-platform versions create $VERSION_NAME \
  --model=$MODEL_NAME \
  --framework=scikit-learn \
  --origin=gs://$BUCKET_NAME/ \
  --python-version=3.7 \
  --runtime-version=2.1


Using endpoint [https://ml.googleapis.com/]
Creating version (this might take a few minutes)......done.                    


# Make predictions
## Format data for prediction

In [236]:
# Define a name for the input file
INPUT_FILE = "./crystal_training/input.json"

In [293]:
%%writefile $INPUT_FILE
{
    "instances": [
        {"visitNumber":1,"bounces":1,"hits":1,"pageviews":1,"time_on_site":0,"visits_interaction":1,"is_mobile":true,"is_source_direct":false,"channelGrouping_type_Affiliates":0,"channelGrouping_type_Direct":0,"channelGrouping_type_Display":0,"channelGrouping_type_Organic Search":1,"channelGrouping_type_Paid Search":0,"channelGrouping_type_Referral":0,"channelGrouping_type_Social":0,"country_is_Canada":0,"country_is_Chile":0,"country_is_Hong Kong":0,"country_is_Indonesia":0,"country_is_Japan":0,"country_is_Singapore":0,"country_is_Switzerland":0,"country_is_Thailand":0,"country_is_United Kingdom":0,"country_is_United States":1,"country_is_Venezuela":0},
        {"visitNumber":1,"bounces":0,"hits":7,"pageviews":7,"time_on_site":128,"visits_interaction":1,"is_mobile":true,"is_source_direct":true,"channelGrouping_type_Affiliates":0,"channelGrouping_type_Direct":1,"channelGrouping_type_Display":0,"channelGrouping_type_Organic Search":0,"channelGrouping_type_Paid Search":0,"channelGrouping_type_Referral":0,"channelGrouping_type_Social":0,"country_is_Canada":0,"country_is_Chile":0,"country_is_Hong Kong":0,"country_is_Indonesia":0,"country_is_Japan":0,"country_is_Singapore":0,"country_is_Switzerland":0,"country_is_Thailand":0,"country_is_United Kingdom":1,"country_is_United States":0,"country_is_Venezuela":0}
    ]
}

Overwriting ./crystal_training/input.json


In [270]:
# %%writefile $INPUT_FILE
# {
#     "instances": [
#         [1, 0, 16, 11, 246, 1, True, False, 0, 1, 0, 0 ,0, 0, 0 ,0 ,0, 0 ,0 ,0, 0, 0, 0, 0, 1, 0]
#     ]
# }

Overwriting ./crystal_training/input.json


# Send the online prediction request

In [294]:
!gcloud ai-platform predict --model $MODEL_NAME --version \
  $VERSION_NAME --json-request $INPUT_FILE

# !gcloud ai-platform local predict --model-dir 'gs://winged-tower-bucket/job_dir/packages/c85e323f4610e652849111fe246050ac8246c0a3358073f7f2a2352eb76776de/Training-0.0.0.tar.gz'  \
#   --json-instances $INPUT_FILE

Using endpoint [https://ml.googleapis.com/]
{
  "error": "Prediction failed: Exception during sklearn prediction: float() argument must be a string or a number, not 'dict'"
}


In [255]:
!gcloud ai-platform predict --help

[m[1mNAME[m
    gcloud ai-platform predict - run AI Platform online prediction

[m[1mSYNOPSIS[m
    [1mgcloud ai-platform predict[m [1m--model[m=[4mMODEL[m
        ([1m--json-instances[m=[4mJSON_INSTANCES[m | [1m--json-request[m=[4mJSON_REQUEST[m
          | [1m--text-instances[m=[4mTEXT_INSTANCES[m) [[1m--region[m=[4mREGION[m]
        [[1m--signature-name[m=[4mSIGNATURE_NAME[m] [[1m--version[m=[4mVERSION[m]
        [[4mGCLOUD_WIDE_FLAG ...[m]

[m[1mDESCRIPTION[m
    [1mgcloud ai-platform predict[m sends a prediction request to AI Platform for
    the given instances. This command will read up to 100 instances, though the
    service itself will accept instances up to the payload limit size
    (currently, 1.5MB). If you are predicting on more instances, you should use
    batch prediction via

        $ gcloud ai-platform jobs submit prediction.

[m[1mREQUIRED FLAGS[m
     [1m--model[m=[4mMODEL[m
        Name of the model.

     Exactl

# Batch prediction
read the last day of data as a batch

In [239]:
#create the BigQuery client.
client_bq = bigquery.Client()

# Running queries

dataset_query="""SELECT
  fullVisitorId, date, 
  visitStartTime, 
  visitNumber,
  IF(totals.transactions IS NULL, 0, 1) AS purchase,  
  
  COUNTIF(IF(totals.transactions IS NULL, FALSE, TRUE))
  OVER (
    PARTITION BY fullVisitorId 
    ORDER BY visitStartTime
    RANGE BETWEEN CURRENT ROW AND 2592000 FOLLOWING
  ) AS total_purchases_next30days,
  
  IFNULL(totals.bounces, 0) AS bounces,
  IFNULL(totals.hits, 0) AS hits,
  IFNULL(totals.pageviews, 0) AS pageviews,
  IFNULL(totals.timeOnSite, 0) AS time_on_site,
  IFNULL(totals.visits, 0) AS visits_interaction,

  device.isMobile AS is_mobile,
  trafficSource.isTrueDirect AS is_source_direct,
  IFNULL(geoNetwork.country, "") AS country,
  channelGrouping
  
FROM
  `bigquery-public-data.google_analytics_sample.ga_sessions_20170801`"""


 # Make an API request.
query_job = client_bq.query(dataset_query)
records = [dict(row) for row in query_job]
batch_df= pd.DataFrame(records)


In [240]:
batch_df.shape

(2556, 15)

# Preprocess Batch Data

In [241]:
label= batch_df['total_purchases_next30days']>0
batch_df.insert(7,'label',label)

# Replace NAN values with False
batch_df.is_source_direct.replace(to_replace= np.nan, value= False, inplace=True)
batch_df.is_source_direct.unique()

array([False,  True])

In [242]:
#Convert categorical columns to dummies
batch_df = pd.get_dummies(batch_df, columns=['channelGrouping'], prefix='channelGrouping_type')
# batch_df.drop(columns=['channelGrouping_type_(Other)'], inplace=True)

In [243]:
y_batch= batch_df['label']
y_batch = y_batch.astype('int')

X_batch= batch_df.drop(columns=['label','purchase','total_purchases_next30days','fullVisitorId','visitStartTime','date'], inplace=False)

In [244]:
#Convert categorical 'country' column to dummies
all_countries_array= X_batch['country'].unique()
all_countries_test = set(all_countries_array)

top_countries= {'Venezuela','Japan','United Kingdom','Canada','Indonesia','Thailand','Singapore','Chile','United States','Switzerland','Hong Kong'}
other_countries_in_testset = list(all_countries_test - top_countries)

X_batch.replace(to_replace= other_countries_in_testset, value='others', inplace=True)

X_batch = pd.get_dummies(X_batch, columns=['country'], prefix='country_is')
X_batch.drop(columns=['country_is_others'], inplace=True)

In [245]:
X_batch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 26 columns):
 #   Column                               Non-Null Count  Dtype
---  ------                               --------------  -----
 0   visitNumber                          2556 non-null   int64
 1   bounces                              2556 non-null   int64
 2   hits                                 2556 non-null   int64
 3   pageviews                            2556 non-null   int64
 4   time_on_site                         2556 non-null   int64
 5   visits_interaction                   2556 non-null   int64
 6   is_mobile                            2556 non-null   bool 
 7   is_source_direct                     2556 non-null   bool 
 8   channelGrouping_type_Affiliates      2556 non-null   uint8
 9   channelGrouping_type_Direct          2556 non-null   uint8
 10  channelGrouping_type_Display         2556 non-null   uint8
 11  channelGrouping_type_Organic Search  2556 non-null   uin

In [281]:
X_batch[0:3].to_json(orient='index')

'{"0":{"visitNumber":1,"bounces":1,"hits":1,"pageviews":1,"time_on_site":0,"visits_interaction":1,"is_mobile":true,"is_source_direct":false,"channelGrouping_type_Affiliates":0,"channelGrouping_type_Direct":0,"channelGrouping_type_Display":0,"channelGrouping_type_Organic Search":1,"channelGrouping_type_Paid Search":0,"channelGrouping_type_Referral":0,"channelGrouping_type_Social":0,"country_is_Canada":0,"country_is_Chile":0,"country_is_Hong Kong":0,"country_is_Indonesia":0,"country_is_Japan":0,"country_is_Singapore":0,"country_is_Switzerland":0,"country_is_Thailand":0,"country_is_United Kingdom":0,"country_is_United States":1,"country_is_Venezuela":0},"1":{"visitNumber":1,"bounces":0,"hits":7,"pageviews":7,"time_on_site":128,"visits_interaction":1,"is_mobile":true,"is_source_direct":true,"channelGrouping_type_Affiliates":0,"channelGrouping_type_Direct":1,"channelGrouping_type_Display":0,"channelGrouping_type_Organic Search":0,"channelGrouping_type_Paid Search":0,"channelGrouping_type_Re