In [2]:
from oauth2client.client import GoogleCredentials
from googleapiclient import discovery
from googleapiclient import errors
import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd
import numpy as np
import json
import os

from tqdm.auto import tqdm
tqdm.pandas(leave=False)

# Training

## configuration

In [3]:
PROJECT                  = "mlteam-ml-specialization-2021"
TRAIN_PATH               = "gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/train.csv"
EVAL_PATH                = "gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/test/evalset.csv"
TEST_PATH                = "gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/test/testset.csv"
MODEL_NAME               = "010_hptuning_input_l1"
BATCH_SIZE               = 30000 #1 epoch -> 3 steps
NUM_EPOCHS               = 100
SCANN_NUM_NEIGHBORS      = 100
#LEARNING_RATE            = 0.01
EMBEDDING_DIM            = 128
USER_FEATURES            = ["Gender", "Age", "Occupation", "City_Category", "Stay_In_Current_City_Years", "Marital_Status"]
USER_INPUT_EMBEDDING_DIM = 8
FORCE                    = True

In [10]:
%%writefile config.yaml
trainingInput:
  hyperparameters:
    goal: MINIMIZE
    hyperparameterMetricTag: epoch_total_loss
    maxTrials: 2
    maxParallelTrials: 2
    enableTrialEarlyStopping: False
    params:
    - parameterName: learning-rate
      type: DISCRETE
      discreteValues:
      - 0.001
    - parameterName: product-input-embedding-dim
      type: DISCRETE
      discreteValues:
      - 128
    - parameterName: embedding-dim
      type: DISCRETE
      discreteValues:
      - 128
    - parameterName: common-layers
      type: CATEGORICAL
      categoricalValues:
      - '[256]'
    - parameterName: product-features
      type: CATEGORICAL
      categoricalValues:
      - '["Product_ID", "Product_Category_1"]'
    - parameterName: common-input-embedding-l1
      type: DISCRETE
      discreteValues:
      - 0.0001
      - 0.01

Overwriting config.yaml


In [11]:
MODEL_PATH = os.path.join("gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs", MODEL_NAME)
if tf.io.gfile.exists(MODEL_PATH):
    if FORCE:
        tf.io.gfile.rmtree(MODEL_PATH)
    else:
        assert not tf.io.gfile.exists(MODEL_PATH), f"{MODEL_PATH} already exists, set FORCE=True to overwrite"


os.environ["TRAIN_PATH"]               = str(TRAIN_PATH)
os.environ["EVAL_PATH"]                = str(EVAL_PATH)
os.environ["TEST_PATH"]                = str(TEST_PATH)
os.environ["MODEL_NAME"]               = str(MODEL_NAME)
os.environ["MODEL_PATH"]               = str(MODEL_PATH)
os.environ["BATCH_SIZE"]               = str(BATCH_SIZE)
os.environ["NUM_EPOCHS"]               = str(NUM_EPOCHS)
os.environ["SCANN_NUM_NEIGHBORS"]      = str(SCANN_NUM_NEIGHBORS)
#os.environ["LEARNING_RATE"]            = str(LEARNING_RATE)
os.environ["EMBEDDING_DIM"]            = str(EMBEDDING_DIM)
os.environ["USER_INPUT_EMBEDDING_DIM"] = str(USER_INPUT_EMBEDDING_DIM)
os.environ["USER_FEATURES"]            = json.dumps(USER_FEATURES)

## launch

In [12]:
%%bash

# JOB_NAME: the name of your job running on AI Platform.
JOB_NAME=bf_${MODEL_NAME}_$(date +%Y%m%d_%H%M%S)

# REGION: select a region from https://cloud.google.com/ai-platform/training/docs/regions
# or use the default '`us-central1`'. The region is where the model will be deployed.
REGION=europe-west1
PYTHON_VERSION=3.7
RUNTIME_VERSION=2.4

current_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
cd ${current_dir}"/../../.."

config_file="${current_dir}/config.yaml"

gcloud ai-platform jobs submit training "${JOB_NAME}" \
  --package-path trainer/ \
  --module-name trainer.task \
  --region ${REGION} \
  --python-version ${PYTHON_VERSION} \
  --runtime-version ${RUNTIME_VERSION} \
  --job-dir "${MODEL_PATH}" \
  --config "${config_file}" \
  -- \
  --train-path="${TRAIN_PATH}" \
  --eval-path="${EVAL_PATH}" \
  --job-dir="${MODEL_PATH}" \
  --num-epochs=${NUM_EPOCHS} \
  --batch-size=${BATCH_SIZE} \
  --scann-num-neighbors=${SCANN_NUM_NEIGHBORS} \
  --user-features="${USER_FEATURES}" \
  --embedding-dim=${EMBEDDING_DIM} \
  --user-input-embedding-dim=${USER_INPUT_EMBEDDING_DIM}

#  --learning-rate=${LEARNING_RATE} \

gcloud ai-platform jobs describe ${JOB_NAME}

jobId: bf_010_hptuning_input_l1_20210603_145721
state: QUEUED
createTime: '2021-06-03T14:57:23Z'
etag: VQsbW6OZ54I=
jobId: bf_010_hptuning_input_l1_20210603_145721
state: PREPARING
trainingInput:
  args:
  - --train-path=gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/train.csv
  - --eval-path=gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/test/evalset.csv
  - --job-dir=gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/010_hptuning_input_l1
  - --num-epochs=100
  - --batch-size=30000
  - --scann-num-neighbors=100
  - --user-features=["Gender", "Age", "Occupation", "City_Category", "Stay_In_Current_City_Years",
    "Marital_Status"]
  - --embedding-dim=128
  - --user-input-embedding-dim=8
  hyperparameters:
    goal: MINIMIZE
    hyperparameterMetricTag: epoch_total_loss
    maxParallelTrials: 2
    maxTrials: 2
    params:
    - discreteValues:
      - 0.001
      parameterName: learning-rate
      type: DISCRETE
    -

Job [bf_010_hptuning_input_l1_20210603_145721] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe bf_010_hptuning_input_l1_20210603_145721

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs bf_010_hptuning_input_l1_20210603_145721

View job in the Cloud Console at:
https://console.cloud.google.com/mlengine/jobs/bf_010_hptuning_input_l1_20210603_145721?project=mlteam-ml-specialization-2021

View logs at:
https://console.cloud.google.com/logs?resource=ml_job%2Fjob_id%2Fbf_010_hptuning_input_l1_20210603_145721&project=mlteam-ml-specialization-2021


In [6]:
!rm -rf config.yaml

In [4]:
metadata_path = os.path.join("gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs", MODEL_NAME, "metadata.json")

ml               = discovery.build('ml','v1')
projectId        = 'projects/{}'.format(PROJECT)
ai_platform_data = ml.projects().jobs().list(parent=projectId).execute()
jobs             = ai_platform_data["jobs"]
latest_job       = sorted([j for j in jobs if j['jobId'].startswith(f"bf_{MODEL_NAME}")], key=lambda x: x["jobId"])[-1]
if latest_job["trainingOutput"].get('isHyperparameterTuningJob',None) is not None:
    trials = latest_job["trainingOutput"]["trials"]
    json.dump(trials, tf.io.gfile.GFile(metadata_path, "w"))
else:
    metadata = [{
        'trialId'        : '1',
        'hyperparameters': {},
        'startTime'      : latest_job["startTime"],
        'endTime'        : latest_job["endTime"],
        'state'          : latest_job["state"],
    }]
    json.dump(metadata, tf.io.gfile.GFile(metadata_path, "w"))

# Evaluation

## ground truth

In [5]:
ALL_USER_FEATURES   = ["Gender", "Age", "Occupation", "City_Category", "Stay_In_Current_City_Years", "Marital_Status"] 

df = pd.read_csv(TEST_PATH)

## baseline

In [6]:
class BlackFridayBaseline:
    def __init__(self, trainset_location, user_features_list, build_inplace=False):
        self.trainset_location, self.user_features_list = trainset_location, user_features_list
        if build_inplace:
            self.build()
        
    def build(self):
        self.df = pd.read_csv(self.trainset_location).groupby(
            self.user_features_list+["Product_ID"])["User_ID"].count().reset_index().rename(
            columns={"User_ID":"count_product"})
        
    def predict(self, user_features, n_products):
        mask = None
        for k,v in user_features.items():
            if mask is None:
                mask = self.df[k]==v
            else:
                mask = mask&(self.df[k]==v)
        return self.df[mask].sort_values(self.user_features_list+["count_product"])["Product_ID"].values[:n_products]
        

## model(s)

In [7]:
class PredictionModel:
    def __init__(self, model_path, build_inplace=False):
        self.model_path=model_path
        if build_inplace:
            self.build()
    
    def build(self):
        self.model=tf.keras.models.load_model(self.model_path)
        
    def predict_batch(self, model_input, n_products):
        scores, products = self.model({k:tf.constant(v) for k,v in model_input.items()})
        return products.numpy()[:,:n_products]
    
    def predict_single(self, model_input, n_products):
        scores, products = self.model({k:tf.constant([v]) for k,v in model_input.items()})
        return products.numpy()[0,:n_products]
    


    

## stats

In [8]:
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
df_grouped = pd.read_csv(TEST_PATH).groupby(ALL_USER_FEATURES)["Product_ID"].apply(list).apply(np.array).reset_index()
top_k = [1, 10, 50, 100]
results = dict()

baseline = BlackFridayBaseline(TRAIN_PATH, ALL_USER_FEATURES, True)
for k in tqdm(top_k, leave=False):
    metrics_id = f"baseline_top{k}"
    df_grouped[metrics_id]         = df_grouped[ALL_USER_FEATURES].progress_apply(lambda x: baseline.predict(x, k), axis=1)
    baseline_true_positives        = df_grouped.progress_apply(lambda x: np.intersect1d(x["Product_ID"],x[metrics_id]).shape[0], axis=1).sum()
    baseline_false_negatives       = df.shape[0] - baseline_true_positives
    baseline_false_positives       = df_grouped[metrics_id].apply(lambda x: x.shape[0]).sum() - baseline_true_positives
    baseline_reach                 = df_grouped.progress_apply(lambda x: min(x["Product_ID"].shape[0],x[metrics_id].shape[0]), axis=1).sum()
    results[f"baseline_top{k}"]    = {
        "top"  : k,
        "model": "baseline",
        "tp"   : baseline_true_positives,
        "fp"   : baseline_false_positives,
        "fn"   : baseline_false_negatives,
        "reach": baseline_reach #portata
    }
    

metadata_path = os.path.join("gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs", MODEL_NAME, "metadata.json")
metadata = json.load(tf.io.gfile.GFile(metadata_path, "r"))
for j in tqdm(metadata, leave=False):
    model = PredictionModel(os.path.join("gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs", MODEL_NAME, j['trialId'], 'Scann'), True)
    hps = json.dumps(j["hyperparameters"])
    model_id = f"trial_{j['trialId']}"
    for k in tqdm(top_k, leave=False):
        metrics_id=f"trial_{j['trialId']}_top{k}"
        df_grouped[metrics_id] = df_grouped[USER_FEATURES].progress_apply(lambda x: model.predict_single(x.astype(str), k).astype(str), axis=1)
        tp = df_grouped.progress_apply(lambda x: np.intersect1d(x["Product_ID"],x[metrics_id]).shape[0], axis=1).sum()
        fn = df.shape[0] - tp
        fp = df_grouped[metrics_id].apply(lambda x: x.shape[0]).sum() - tp
        reach = df_grouped.progress_apply(lambda x: min(len(x["Product_ID"]),x[metrics_id].shape[0]), axis=1).sum() 
        results[metrics_id] = {
            "top"  : k,
            "model": model_id,
            "tp"   : tp,
            "fp"   : fp,
            "fn"   : fn,
            "reach": reach
        }
    
df_results = pd.DataFrame(results).T
df_results["precision"] = df_results["tp"]/(df_results["tp"]+df_results["fp"])
df_results["recall"] = df_results["tp"]/(df_results["tp"]+df_results["fn"])
df_results["tp_over_reach"] = df_results["tp"]/df_results["reach"]
df_results

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

Unnamed: 0,top,model,tp,fp,fn,reach,precision,recall,tp_over_reach
baseline_top1,1,baseline,637,1754,273647,2391,0.266416,0.002322,0.266416
baseline_top10,10,baseline,5184,14437,269100,19618,0.264207,0.0189,0.264247
baseline_top50,50,baseline,15198,39057,259086,54252,0.280122,0.05541,0.280137
baseline_top100,100,baseline,19780,47500,254504,67277,0.293995,0.072115,0.294008
trial_2_top1,1,trial_2,35,2395,274249,2430,0.014403,0.000128,0.014403
trial_2_top10,10,trial_2,333,23967,273951,23612,0.013704,0.001214,0.014103
trial_2_top50,50,trial_2,2065,119435,272219,91899,0.016996,0.007529,0.02247
trial_2_top100,100,trial_2,5019,237981,269265,146329,0.020654,0.018299,0.034299
trial_1_top1,1,trial_1,37,2393,274247,2430,0.015226,0.000135,0.015226
trial_1_top10,10,trial_1,339,23961,273945,23612,0.013951,0.001236,0.014357


In [9]:
df_results.pivot("model","top","precision")

top,1,10,50,100
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baseline,0.266416,0.264207,0.280122,0.293995
trial_1,0.015226,0.013951,0.017292,0.020309
trial_2,0.014403,0.013704,0.016996,0.020654


In [10]:
df_results.pivot("model","top","recall")

top,1,10,50,100
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baseline,0.002322,0.0189,0.05541,0.072115
trial_1,0.000135,0.001236,0.00766,0.017992
trial_2,0.000128,0.001214,0.007529,0.018299


In [11]:
df_results.pivot("model","top","tp_over_reach")

top,1,10,50,100
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baseline,0.266416,0.264247,0.280137,0.294008
trial_1,0.015226,0.014357,0.022862,0.033726
trial_2,0.014403,0.014103,0.02247,0.034299
