In [1]:
from oauth2client.client import GoogleCredentials
from googleapiclient import discovery
from googleapiclient import errors
import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd
import numpy as np
import json
import os

from tqdm.auto import tqdm
tqdm.pandas(leave=False)

# Training

## configuration

In [2]:
GCS_ROOT     = "gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs"
TRIAL_NUMBER = 1
RESUME_FROM  = "021_longer_product_id_and_even_more_categories"

In [3]:
PROJECT                  = "mlteam-ml-specialization-2021"
TRAIN_PATH               = "gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/preprocessed_category_train_.csv"
EVAL_PATH                = "gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/test/preprocessed_category_evalset.csv"
TEST_PATH                = "gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/test/preprocessed_category_testset.csv"
MODEL_NAME               = "021b_longer_product_id_and_even_more_categories"
BATCH_SIZE               = 30000
NUM_EPOCHS               = 1
SCANN_NUM_NEIGHBORS      = 100
LEARNING_RATE            = 0.01
EMBEDDING_DIM            = 128
USER_FEATURES            = ["Gender", "Age", "Occupation", "City_Category", "Stay_In_Current_City_Years", "Marital_Status"]
USER_INPUT_EMBEDDING_DIM = 8
COMMON_LAYERS            = [512, 256, 128]
PRODUCT_FEATURES         = ["Product_ID", "Product_Category_1", "Product_Category_1.2"]
PRODUCT_INPUT_EMBEDDING_DIM = 128
FORCE                    = False

In [4]:
import os 
MODEL_PATH = os.path.join(GCS_ROOT, MODEL_NAME)
os.environ["GCS_ROOT"]                 = str(GCS_ROOT)
os.environ["RESUME_FROM"]              = str(RESUME_FROM)
os.environ["TRIAL_NUMBER"]             = str(TRIAL_NUMBER)
os.environ["TRAIN_PATH"]               = str(TRAIN_PATH)
os.environ["EVAL_PATH"]                = str(EVAL_PATH)
os.environ["TEST_PATH"]                = str(TEST_PATH)
os.environ["MODEL_NAME"]               = str(MODEL_NAME)
os.environ["MODEL_PATH"]               = str(MODEL_PATH)
os.environ["BATCH_SIZE"]               = str(BATCH_SIZE)
os.environ["NUM_EPOCHS"]               = str(NUM_EPOCHS)
os.environ["SCANN_NUM_NEIGHBORS"]      = str(SCANN_NUM_NEIGHBORS)
os.environ["LEARNING_RATE"]            = str(LEARNING_RATE)
os.environ["EMBEDDING_DIM"]            = str(EMBEDDING_DIM)
os.environ["USER_INPUT_EMBEDDING_DIM"] = str(USER_INPUT_EMBEDDING_DIM)
os.environ["USER_FEATURES"]            = json.dumps(USER_FEATURES)
os.environ["COMMON_LAYERS"]            = json.dumps(COMMON_LAYERS)
os.environ["PRODUCT_FEATURES"]         = json.dumps(PRODUCT_FEATURES)
os.environ["PRODUCT_INPUT_EMBEDDING_DIM"] = str(PRODUCT_INPUT_EMBEDDING_DIM)

## launch

In [5]:
%%bash
gsutil -m cp -r ${GCS_ROOT}/${RESUME_FROM}/${TRIAL_NUMBER} ${GCS_ROOT}/${MODEL_NAME}/

Copying gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/021_longer_product_id_and_even_more_categories/1/Scann/saved_model.pb...
Copying gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/021_longer_product_id_and_even_more_categories/1/Scann/variables/variables.data-00000-of-00001...
Copying gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/021_longer_product_id_and_even_more_categories/1/Scann/variables/variables.index...
Copying gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/021_longer_product_id_and_even_more_categories/1/candidate/variables/variables.index...
Copying gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/021_longer_product_id_and_even_more_categories/1/candidate/saved_model.pb...
Copying gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/021_longer_product_id_and_even_more_categories/1/model_checkpoints.data-00000-of-00001...
Copying gs://mlteam-ml-specialization-2021-blackfriday/aiplatfor

In [6]:
%%bash

# JOB_NAME: the name of your job running on AI Platform.
JOB_NAME=bf_${MODEL_NAME}_$(date +%Y%m%d_%H%M%S)

# REGION: select a region from https://cloud.google.com/ai-platform/training/docs/regions
# or use the default '`us-central1`'. The region is where the model will be deployed.
REGION=europe-west1
PYTHON_VERSION=3.7
RUNTIME_VERSION=2.4

current_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
cd ${current_dir}"/../../.."

gcloud ai-platform jobs submit training "${JOB_NAME}" \
  --package-path trainer/ \
  --module-name trainer.task \
  --region ${REGION} \
  --python-version ${PYTHON_VERSION} \
  --runtime-version ${RUNTIME_VERSION} \
  --job-dir "${MODEL_PATH}" \
  -- \
  --train-path="${TRAIN_PATH}" \
  --eval-path="${EVAL_PATH}" \
  --job-dir="${MODEL_PATH}" \
  --batch-size=${BATCH_SIZE} \
  --scann-num-neighbors=${SCANN_NUM_NEIGHBORS} \
  --user-features="${USER_FEATURES}" \
  --num-epochs=${NUM_EPOCHS} \
  --user-input-embedding-dim=${USER_INPUT_EMBEDDING_DIM} \
  --learning-rate=${LEARNING_RATE} \
  --embedding-dim=${EMBEDDING_DIM} \
  --product-input-embedding-dim=${PRODUCT_INPUT_EMBEDDING_DIM} \
  --common-layers="${COMMON_LAYERS}" \
  --product-features="${PRODUCT_FEATURES}" \
  --trial=${TRIAL_NUMBER} 

gcloud ai-platform jobs describe ${JOB_NAME}

jobId: bf_021b_longer_product_id_and_even_more_categories_20210628_092004
state: QUEUED
createTime: '2021-06-28T09:20:07Z'
etag: yDEy6TZNjEw=
jobId: bf_021b_longer_product_id_and_even_more_categories_20210628_092004
state: PREPARING
trainingInput:
  args:
  - --train-path=gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/preprocessed_category_train_.csv
  - --eval-path=gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/test/preprocessed_category_evalset.csv
  - --job-dir=gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/021b_longer_product_id_and_even_more_categories
  - --batch-size=30000
  - --scann-num-neighbors=100
  - --user-features=["Gender", "Age", "Occupation", "City_Category", "Stay_In_Current_City_Years",
    "Marital_Status"]
  - --num-epochs=1
  - --user-input-embedding-dim=8
  - --learning-rate=0.01
  - --embedding-dim=128
  - --product-input-embedding-dim=128
  - --common-layers=[512, 256, 128]
  - --product-f

Job [bf_021b_longer_product_id_and_even_more_categories_20210628_092004] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe bf_021b_longer_product_id_and_even_more_categories_20210628_092004

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs bf_021b_longer_product_id_and_even_more_categories_20210628_092004

View job in the Cloud Console at:
https://console.cloud.google.com/mlengine/jobs/bf_021b_longer_product_id_and_even_more_categories_20210628_092004?project=mlteam-ml-specialization-2021

View logs at:
https://console.cloud.google.com/logs?resource=ml_job%2Fjob_id%2Fbf_021b_longer_product_id_and_even_more_categories_20210628_092004&project=mlteam-ml-specialization-2021


In [7]:
metadata_path = os.path.join("gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs", MODEL_NAME, "metadata.json")

ml               = discovery.build('ml','v1')
projectId        = 'projects/{}'.format(PROJECT)
ai_platform_data = ml.projects().jobs().list(parent=projectId).execute()
jobs             = ai_platform_data["jobs"]
latest_job       = sorted([j for j in jobs if j['jobId'].startswith(f"bf_{MODEL_NAME}")], key=lambda x: x["jobId"])[-1]

metadata = [{
    'trialId'        : TRIAL_NUMBER,
    'hyperparameters': {},
    'startTime'      : latest_job["startTime"],
    'endTime'        : latest_job["endTime"],
    'state'          : latest_job["state"],
}]
json.dump(metadata, tf.io.gfile.GFile(metadata_path, "w"))

# Evaluation

## ground truth

In [8]:
ALL_USER_FEATURES   = ["Gender", "Age", "Occupation", "City_Category", "Stay_In_Current_City_Years", "Marital_Status"] 

df = pd.read_csv(TEST_PATH)

## baseline

In [9]:
class BlackFridayBaseline:
    def __init__(self, trainset_location, user_features_list, build_inplace=False):
        self.trainset_location, self.user_features_list = trainset_location, user_features_list
        if build_inplace:
            self.build()
        
    def build(self):
        self.df = pd.read_csv(self.trainset_location).groupby(
            self.user_features_list+["Product_ID"])["User_ID"].count().reset_index().rename(
            columns={"User_ID":"count_product"})
        
    def predict(self, user_features, n_products):
        mask = None
        for k,v in user_features.items():
            if mask is None:
                mask = self.df[k]==v
            else:
                mask = mask&(self.df[k]==v)
        return self.df[mask].sort_values(self.user_features_list+["count_product"])["Product_ID"].values[:n_products]
        

## model(s)

In [10]:
class PredictionModel:
    def __init__(self, model_path, build_inplace=False):
        self.model_path=model_path
        if build_inplace:
            self.build()
    
    def build(self):
        self.model=tf.keras.models.load_model(self.model_path)
        
    def predict_batch(self, model_input, n_products):
        scores, products = self.model({k:tf.constant(v) for k,v in model_input.items()})
        return products.numpy()[:,:n_products]
    
    def predict_single(self, model_input, n_products):
        scores, products = self.model({k:tf.constant([v]) for k,v in model_input.items()})
        return products.numpy()[0,:n_products]
    


    

## stats

In [11]:
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
df_grouped = pd.read_csv(TEST_PATH).groupby(ALL_USER_FEATURES)["Product_ID"].apply(list).apply(np.array).reset_index()
top_k = [1, 10, 50, 100]
results = dict()

baseline = BlackFridayBaseline(TRAIN_PATH, ALL_USER_FEATURES, True)
for k in tqdm(top_k, leave=False):
    metrics_id = f"baseline_top{k}"
    df_grouped[metrics_id]         = df_grouped[ALL_USER_FEATURES].progress_apply(lambda x: baseline.predict(x, k), axis=1)
    baseline_true_positives        = df_grouped.progress_apply(lambda x: np.intersect1d(x["Product_ID"],x[metrics_id]).shape[0], axis=1).sum()
    baseline_false_negatives       = df.shape[0] - baseline_true_positives
    baseline_false_positives       = df_grouped[metrics_id].apply(lambda x: x.shape[0]).sum() - baseline_true_positives
    baseline_reach                 = df_grouped.progress_apply(lambda x: min(x["Product_ID"].shape[0],x[metrics_id].shape[0]), axis=1).sum()
    results[f"baseline_top{k}"]    = {
        "top"  : k,
        "model": "baseline",
        "tp"   : baseline_true_positives,
        "fp"   : baseline_false_positives,
        "fn"   : baseline_false_negatives,
        "reach": baseline_reach #portata
    }
    

metadata_path = os.path.join("gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs", MODEL_NAME, "metadata.json")
metadata = json.load(tf.io.gfile.GFile(metadata_path, "r"))


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

In [12]:
for j in tqdm(metadata, leave=False):
    model = PredictionModel(os.path.join("gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs", MODEL_NAME, str(j['trialId']), 'Scann'), True)
    hps = json.dumps(j["hyperparameters"])
    model_id = f"trial_{j['trialId']}"
    for k in tqdm(top_k, leave=False):
        metrics_id=f"trial_{j['trialId']}_top{k}"
        df_grouped[metrics_id] = df_grouped[USER_FEATURES].progress_apply(lambda x: model.predict_single(x.astype(str), k).astype(str), axis=1)
        tp = df_grouped.progress_apply(lambda x: np.intersect1d(x["Product_ID"],x[metrics_id]).shape[0], axis=1).sum()
        fn = df.shape[0] - tp
        fp = df_grouped[metrics_id].apply(lambda x: x.shape[0]).sum() - tp
        reach = df_grouped.progress_apply(lambda x: min(len(x["Product_ID"]),x[metrics_id].shape[0]), axis=1).sum() 
        results[metrics_id] = {
            "top"  : k,
            "model": model_id,
            "tp"   : tp,
            "fp"   : fp,
            "fn"   : fn,
            "reach": reach
        }
    
df_results = pd.DataFrame(results).T
df_results["precision"] = df_results["tp"]/(df_results["tp"]+df_results["fp"])
df_results["recall"] = df_results["tp"]/(df_results["tp"]+df_results["fn"])
df_results["tp_over_reach"] = df_results["tp"]/df_results["reach"]
df_results

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

  0%|          | 0/2430 [00:00<?, ?it/s]

Unnamed: 0,top,model,tp,fp,fn,reach,precision,recall,tp_over_reach
baseline_top1,1,baseline,637,1754,273647,2391,0.266416,0.002322,0.266416
baseline_top10,10,baseline,5184,14437,269100,19618,0.264207,0.0189,0.264247
baseline_top50,50,baseline,15198,39057,259086,54252,0.280122,0.05541,0.280137
baseline_top100,100,baseline,19780,47500,254504,67277,0.293995,0.072115,0.294008
trial_1_top1,1,trial_1,349,2081,273935,2430,0.143621,0.001272,0.143621
trial_1_top10,10,trial_1,3012,21288,271272,23612,0.123951,0.010981,0.127562
trial_1_top50,50,trial_1,13846,107654,260438,91899,0.113959,0.050481,0.150665
trial_1_top100,100,trial_1,26898,216102,247386,146329,0.110691,0.098066,0.183819


In [13]:
df_results.pivot("model","top","precision")

top,1,10,50,100
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baseline,0.266416,0.264207,0.280122,0.293995
trial_1,0.143621,0.123951,0.113959,0.110691


In [14]:
df_results.pivot("model","top","recall")

top,1,10,50,100
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baseline,0.002322,0.0189,0.05541,0.072115
trial_1,0.001272,0.010981,0.050481,0.098066


In [15]:
df_results.pivot("model","top","tp_over_reach")

top,1,10,50,100
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baseline,0.266416,0.264247,0.280137,0.294008
trial_1,0.143621,0.127562,0.150665,0.183819
