In [1]:
from oauth2client.client import GoogleCredentials
from googleapiclient import discovery
from googleapiclient import errors
import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd
import numpy as np
import json
import os

from tqdm.auto import tqdm
# Register tqdm's pandas integration so the `progress_apply` calls in the
# evaluation cells are available; `leave=False` is passed through as a tqdm
# option (presumably so finished bars are removed — confirm in the tqdm docs).
tqdm.pandas(leave=False)

# Training

## configuration

In [2]:
# --- Google Cloud project and storage locations -----------------------------
PROJECT = "mlteam-ml-specialization-2021"
GCS_ROOT = "gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs"

# --- Dataset splits (CSV files on GCS) --------------------------------------
TRAIN_PATH = "gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/train.csv"
EVAL_PATH = "gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/test/evalset.csv"
TEST_PATH = "gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/test/testset.csv"

# --- Run identity -----------------------------------------------------------
# RESUME_FROM / TRIAL_NUMBER select the job directory and trial that the launch
# section copies under MODEL_NAME before submitting the new training job.
RESUME_FROM = "012_hptuning_deep"
TRIAL_NUMBER = "1"
MODEL_NAME = "12_resume_training_sample"

# --- Training hyper-parameters ----------------------------------------------
BATCH_SIZE = 30000  # 1 epoch -> 3 steps
NUM_EPOCHS = 500
SCANN_NUM_NEIGHBORS = 100
EMBEDDING_DIM = 128
USER_FEATURES = [
    "Gender",
    "Age",
    "Occupation",
    "City_Category",
    "Stay_In_Current_City_Years",
    "Marital_Status",
]

# Currently disabled settings (kept for reference):
# LEARNING_RATE = 0.01
# USER_INPUT_EMBEDDING_DIM = 8

FORCE = True

In [None]:
# Export the notebook configuration as environment variables so that the
# shell cells of the launch section can read it.
MODEL_PATH = os.path.join(GCS_ROOT, MODEL_NAME)

# Settings that are always defined by the configuration cell; each one is
# exported under its own name, converted with str().
_required_settings = {
    "GCS_ROOT": GCS_ROOT,
    "RESUME_FROM": RESUME_FROM,
    "TRIAL_NUMBER": TRIAL_NUMBER,
    "TRAIN_PATH": TRAIN_PATH,
    "EVAL_PATH": EVAL_PATH,
    "TEST_PATH": TEST_PATH,
    "MODEL_NAME": MODEL_NAME,
    "MODEL_PATH": MODEL_PATH,
    "BATCH_SIZE": BATCH_SIZE,
    "NUM_EPOCHS": NUM_EPOCHS,
    "SCANN_NUM_NEIGHBORS": SCANN_NUM_NEIGHBORS,
    "EMBEDDING_DIM": EMBEDDING_DIM,
}
for _name, _value in _required_settings.items():
    os.environ[_name] = str(_value)

# The feature list is exported as a JSON array rather than a Python repr.
os.environ["USER_FEATURES"] = json.dumps(USER_FEATURES)

# LEARNING_RATE and USER_INPUT_EMBEDDING_DIM are commented out in the
# configuration cell. Referencing them unconditionally raised NameError and
# aborted this cell before EMBEDDING_DIM and USER_FEATURES were exported.
# They are exported only when defined; otherwise any value left over from an
# earlier run in the same kernel is removed.
for _name in ("LEARNING_RATE", "USER_INPUT_EMBEDDING_DIM"):
    if _name in globals():
        os.environ[_name] = str(globals()[_name])
    else:
        os.environ.pop(_name, None)

## launch

In [None]:
!gsutil -m cp -r ${GCS_ROOT}/${RESUME_FROM}/${TRIAL_NUMBER} ${GCS_ROOT}/${MODEL_NAME}/

In [None]:
%%bash
# Submit the trainer package to AI Platform Training, using the settings
# exported as environment variables by the configuration section.

# Stop at the first failing command, unset variable or pipeline error, so that
# a failed `cd` or a missing setting never results in a job being submitted.
set -euo pipefail

# Every setting interpolated below must be present and non-empty.
for required in MODEL_NAME MODEL_PATH TRAIN_PATH EVAL_PATH BATCH_SIZE \
                SCANN_NUM_NEIGHBORS USER_FEATURES NUM_EPOCHS EMBEDDING_DIM TRIAL_NUMBER; do
  if [[ -z "${!required:-}" ]]; then
    echo "Environment variable ${required} is not set; run the configuration cells first." >&2
    exit 1
  fi
done

# JOB_NAME: the name of the job running on AI Platform. The `bf_<model>_` prefix
# is what the metadata cell uses to find the job again.
JOB_NAME="bf_${MODEL_NAME}_$(date +%Y%m%d_%H%M%S)"

# REGION: select a region from https://cloud.google.com/ai-platform/training/docs/regions
# (the default is `us-central1`).
REGION=europe-west1
PYTHON_VERSION=3.7
RUNTIME_VERSION=2.4

# Move three levels up from the current location, where the `trainer/` package
# is expected. BASH_SOURCE may be empty when the script is read from stdin, so
# it is given an empty default, which resolves to the working directory.
current_dir="$(cd "$(dirname "${BASH_SOURCE[0]:-}")" >/dev/null 2>&1 && pwd)"
cd "${current_dir}/../../.."

# Arguments before the bare `--` configure the submission; the ones after it
# are forwarded to trainer.task.
gcloud ai-platform jobs submit training "${JOB_NAME}" \
  --package-path trainer/ \
  --module-name trainer.task \
  --region "${REGION}" \
  --python-version "${PYTHON_VERSION}" \
  --runtime-version "${RUNTIME_VERSION}" \
  --job-dir "${MODEL_PATH}" \
  -- \
  --train-path="${TRAIN_PATH}" \
  --eval-path="${EVAL_PATH}" \
  --job-dir="${MODEL_PATH}" \
  --batch-size="${BATCH_SIZE}" \
  --scann-num-neighbors="${SCANN_NUM_NEIGHBORS}" \
  --user-features="${USER_FEATURES}" \
  --num-epochs="${NUM_EPOCHS}" \
  --embedding-dim="${EMBEDDING_DIM}" \
  --trial="${TRIAL_NUMBER}"

gcloud ai-platform jobs describe "${JOB_NAME}"

In [None]:
# Write a one-entry metadata.json for this run, describing the most recent
# AI Platform job submitted for MODEL_NAME. The evaluation section reads this
# file to know which trial directories to load.
metadata_path = os.path.join(GCS_ROOT, MODEL_NAME, "metadata.json")

ml        = discovery.build('ml', 'v1')
projectId = 'projects/{}'.format(PROJECT)

# jobs.list is paginated: follow list_next until it returns None so the job
# is found even when it is not on the first page of results.
jobs    = []
request = ml.projects().jobs().list(parent=projectId)
while request is not None:
    response = request.execute()
    jobs.extend(response.get("jobs", []))
    request = ml.projects().jobs().list_next(previous_request=request, previous_response=response)

# Job ids are "bf_<MODEL_NAME>_<YYYYmmdd_HHMMSS>", so the lexicographically
# greatest id is the latest submission. The trailing underscore keeps models
# whose name merely starts with MODEL_NAME out of the match.
job_prefix    = f"bf_{MODEL_NAME}_"
matching_jobs = [j for j in jobs if j['jobId'].startswith(job_prefix)]
if not matching_jobs:
    raise RuntimeError(f"No AI Platform job with prefix '{job_prefix}' found in {projectId}")
latest_job = max(matching_jobs, key=lambda job: job["jobId"])

metadata = [{
    'trialId'        : TRIAL_NUMBER,
    'hyperparameters': {},
    # A job that has not started or finished yet has no start/end time;
    # store null instead of failing with KeyError.
    'startTime'      : latest_job.get("startTime"),
    'endTime'        : latest_job.get("endTime"),
    'state'          : latest_job["state"],
}]

# Close the GCS handle explicitly so the content is flushed to the bucket.
with tf.io.gfile.GFile(metadata_path, "w") as metadata_file:
    json.dump(metadata, metadata_file)

# Evaluation

## ground truth

In [None]:
# User-level columns of the dataset; the stats section groups the test rows
# by these columns and passes them to the baseline.
ALL_USER_FEATURES = [
    "Gender",
    "Age",
    "Occupation",
    "City_Category",
    "Stay_In_Current_City_Years",
    "Marital_Status",
]

# Test set rows, loaded from TEST_PATH; used as ground truth by the stats section.
df = pd.read_csv(TEST_PATH)

## baseline

In [None]:
class BlackFridayBaseline:
    """Popularity baseline: recommends the products most often bought by
    training-set users that share the requested feature values.

    Args:
        trainset_location: path of the training CSV (anything `pd.read_csv` accepts).
        user_features_list: names of the user-feature columns to group by.
        build_inplace: when True, `build()` is called by the constructor.
    """

    def __init__(self, trainset_location, user_features_list, build_inplace=False):
        self.trainset_location, self.user_features_list = trainset_location, user_features_list
        if build_inplace:
            self.build()

    def build(self):
        """Load the training set and count rows per (user features, Product_ID).

        The result is stored in `self.df`, with the count in `count_product`.
        """
        self.df = pd.read_csv(self.trainset_location).groupby(
            self.user_features_list+["Product_ID"])["User_ID"].count().reset_index().rename(
            columns={"User_ID":"count_product"})

    def predict(self, user_features, n_products):
        """Return up to `n_products` product ids, most purchased first.

        Args:
            user_features: mapping (dict or pandas Series) of column name to the
                value it must equal. An empty mapping selects every row.
            n_products: maximum number of product ids to return.

        Returns:
            Array of Product_ID values ordered by decreasing purchase count.
        """
        # Start from "all rows" so that an empty feature mapping is valid.
        mask = pd.Series(True, index=self.df.index)
        for column, value in user_features.items():
            mask &= self.df[column] == value

        # Sum the counts per product: when only a subset of the feature columns
        # is constrained, the same product appears in several groups.
        totals = self.df[mask].groupby("Product_ID")["count_product"].sum()

        # Descending order: the previous ascending sort returned the *least*
        # purchased products. mergesort is pandas' stable sort.
        ranked = totals.sort_values(ascending=False, kind="mergesort")
        return ranked.index.values[:n_products]
        

## model(s)

In [None]:
class PredictionModel:
    """Thin wrapper around a saved Keras model that is called with a dict of
    feature tensors and returns a `(scores, products)` pair.

    Args:
        model_path: location passed to `tf.keras.models.load_model`.
        build_inplace: when True, the model is loaded by the constructor.
    """

    def __init__(self, model_path, build_inplace=False):
        self.model_path = model_path
        if not build_inplace:
            return
        self.build()

    def build(self):
        """Load the saved model from `self.model_path` into `self.model`."""
        self.model = tf.keras.models.load_model(self.model_path)

    def predict_batch(self, model_input, n_products):
        """Return the first `n_products` products for every row of the batch.

        `model_input` maps feature names to sequences of values; each sequence
        is converted with `tf.constant` before the model is called.
        """
        tensors = {}
        for feature_name, feature_values in model_input.items():
            tensors[feature_name] = tf.constant(feature_values)
        _, products = self.model(tensors)
        return products.numpy()[:, :n_products]

    def predict_single(self, model_input, n_products):
        """Return the first `n_products` products for one example.

        `model_input` maps feature names to single values; each is wrapped in a
        one-element list so the example is scored as a batch of size one.
        """
        one_row_batch = {name: [value] for name, value in model_input.items()}
        return self.predict_batch(one_row_batch, n_products)[0]
    


    

## stats

In [None]:
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)


def _score_predictions(frame, predictions_col, n_truth_rows):
    """Count hits and misses of one predictions column against the ground truth.

    Args:
        frame: one row per user profile, with the purchased products in
            "Product_ID" and the recommended ones in `predictions_col`
            (both array-like).
        predictions_col: name of the column holding the recommendations.
        n_truth_rows: number of rows of the ungrouped test set.

    Returns:
        dict with "tp" (products both purchased and recommended), "fp"
        (recommended minus tp), "fn" (`n_truth_rows` minus tp) and "reach"
        (per profile, the smaller of purchased and recommended counts, summed).
    """
    tp = frame.progress_apply(
        lambda row: np.intersect1d(row["Product_ID"], row[predictions_col]).shape[0], axis=1).sum()
    n_recommended = frame[predictions_col].apply(len).sum()
    reach = frame.progress_apply(
        lambda row: min(len(row["Product_ID"]), len(row[predictions_col])), axis=1).sum()
    return {"tp": tp, "fp": n_recommended - tp, "fn": n_truth_rows - tp, "reach": reach}


# One row per distinct user profile with the array of products bought in the
# test set. `df` already holds the test set, so it is not read a second time.
df_grouped = df.groupby(ALL_USER_FEATURES)["Product_ID"].apply(list).apply(np.array).reset_index()
top_k = [1, 10, 50, 100]
results = dict()

# --- baseline ---------------------------------------------------------------
baseline = BlackFridayBaseline(TRAIN_PATH, ALL_USER_FEATURES, True)
for k in tqdm(top_k, leave=False):
    metrics_id = f"baseline_top{k}"
    df_grouped[metrics_id] = df_grouped[ALL_USER_FEATURES].progress_apply(lambda x: baseline.predict(x, k), axis=1)
    results[metrics_id] = {
        "top"  : k,
        "model": "baseline",
        **_score_predictions(df_grouped, metrics_id, df.shape[0]),  # tp, fp, fn, reach
    }

# --- trained model(s) -------------------------------------------------------
metadata_path = os.path.join(GCS_ROOT, MODEL_NAME, "metadata.json")
# Close the GCS handle as soon as the file has been parsed.
with tf.io.gfile.GFile(metadata_path, "r") as metadata_file:
    metadata = json.load(metadata_file)

for j in tqdm(metadata, leave=False):
    trial_id = str(j['trialId'])
    model = PredictionModel(os.path.join(GCS_ROOT, MODEL_NAME, trial_id, 'Scann'), True)
    model_id = f"trial_{trial_id}"
    for k in tqdm(top_k, leave=False):
        metrics_id = f"{model_id}_top{k}"
        # Features are passed as strings and the returned ids are cast to
        # strings before being compared with the test-set Product_ID values.
        df_grouped[metrics_id] = df_grouped[USER_FEATURES].progress_apply(
            lambda x: model.predict_single(x.astype(str), k).astype(str), axis=1)
        results[metrics_id] = {
            "top"  : k,
            "model": model_id,
            **_score_predictions(df_grouped, metrics_id, df.shape[0]),
        }

# orient="index" gives one row per metrics id and keeps numeric dtypes, unlike
# transposing a column-per-result frame (which yields object columns).
df_results = pd.DataFrame.from_dict(results, orient="index")
df_results["precision"] = df_results["tp"]/(df_results["tp"]+df_results["fp"])
df_results["recall"] = df_results["tp"]/(df_results["tp"]+df_results["fn"])
df_results["tp_over_reach"] = df_results["tp"]/df_results["reach"]
df_results

In [None]:
# Precision per model (rows) and top-k cut-off (columns). Keyword arguments
# are used because pandas 2.x no longer accepts positional ones for `pivot`.
df_results.pivot(index="model", columns="top", values="precision")

In [None]:
# Recall per model (rows) and top-k cut-off (columns). Keyword arguments
# are used because pandas 2.x no longer accepts positional ones for `pivot`.
df_results.pivot(index="model", columns="top", values="recall")

In [None]:
# True positives over reach per model (rows) and top-k cut-off (columns).
# Keyword arguments are used because pandas 2.x no longer accepts positional
# ones for `pivot`.
df_results.pivot(index="model", columns="top", values="tp_over_reach")