# 🧬 Training pipeline: Training the ranking model

In this notebook, you will train a ranking model using gradient boosted trees. 

In [1]:
%load_ext autoreload
%autoreload 2

import warnings

warnings.filterwarnings("ignore")

from recsys.config import settings
from recsys.data.preprocessing.splitting import train_test_split
from recsys.gcp.vertex_ai.serving.ranking import GCPRankingModel
from recsys.core.models.two_tower.ranking import (
    RankingModelFactory,
    RankingModelTrainer,
)
from recsys.gcp.feature_store import client as fs_client
from recsys.gcp.bigquery import client as bq_client

In [2]:
# Display the active configuration as a plain dict.
# NOTE(review): the rendered output exposes absolute local paths and the GCP
# credentials file path; consider clearing this output before sharing.
dict(settings)

{'env_path': '/Users/galcala/Desktop/Github/GenAI_Custom_Real_Time_Personalized_Recommender/.env',
 'GCP_PROJECT': 'recsys-dev-gonzo-2',
 'GCP_LOCATION': 'us-central1',
 'GCP_CREDENTIALS': '/Users/galcala/Desktop/Github/GenAI_Custom_Real_Time_Personalized_Recommender/recsys-dev-gonzo-2-5d2ef03ac656.json',
 'VERTEX_FEATURE_STORE_ID': 'recsys_feature_store_dev',
 'VERTEX_FEATURE_STORE_INSTANCE_ID': 'recsys_feature_store_instance_dev',
 'GCP_ARTIFACT_REGISTRY': 'recsys-model-registry',
 'GCP_MODEL_REGISTRY': 'gonzo-recsys-models',
 'GCP_ENDPOINT_ID': 'recsys-endpoint-dev',
 'GCS_DATA_BUCKET': 'gonzo-recsys-data',
 'GCS_MODEL_BUCKET': 'gonzo-recsys-models',
 'GCS_ARTIFACT_BUCKET': 'gonzo-recsys-artifacts',
 'GEMINI_AGENT_ID': 'your-gemini-agent-id',
 'GEMINI_AGENT_API_KEY': SecretStr('**********'),
 'BIGQUERY_DATASET_ID': 'recsys_dataset',
 'CUSTOMER_DATA_SIZE': <CustomerDatasetSize.SMALL: 'SMALL'>,
 'FEATURES_EMBEDDING_MODEL_ID': 'all-MiniLM-L6-v2',
 'TWO_TOWER_MODEL_EMBEDDING_SIZE': 16,


## ☁️ Connect to Vertex AI Feature Online Store

In [3]:
# Initialise the feature store client module before requesting a handle.
fs_client.initialize()
# `fos` is the handle passed to get_feature_views() in the next section; per the
# log output it refers to the feature store configured in settings.
fos = fs_client.get_client()

[32m2025-02-28 11:54:06.807[0m | [1mINFO    [0m | [36mrecsys.gcp.feature_store.client[0m:[36mget_client[0m:[36m31[0m - [1mRetrieving Feature Store from us-central1/recsys-dev-gonzo-2/recsys_feature_store_dev[0m


## 💿 Create training dataset

In [4]:
# Unpack the four feature views returned by the feature store client.
# Only the transactions, articles and rankings views are used in this notebook.
trans_fv, articles_fv, customers_fv, rankings_fv = fs_client.get_feature_views(fos)

In [5]:
# Load each feature view into a DataFrame through the BigQuery client.
# Three article columns are excluded from the articles fetch.
excluded_article_columns = ["embeddings", "image_url", "article_description"]

rankings_df = bq_client.fetch_feature_view_data(feature_view=rankings_fv)
trans_df = bq_client.fetch_feature_view_data(feature_view=trans_fv)
articles_df = bq_client.fetch_feature_view_data(
    feature_view=articles_fv, except_columns=excluded_article_columns
)

[32m2025-02-28 11:54:11.176[0m | [1mINFO    [0m | [36mrecsys.gcp.bigquery.client[0m:[36mfetch_feature_view_data[0m:[36m185[0m - [1mFetching data from feature view: rankings[0m
[32m2025-02-28 11:54:11.459[0m | [1mINFO    [0m | [36mrecsys.gcp.bigquery.client[0m:[36mfetch_feature_view_data[0m:[36m198[0m - [1mExecuting query: SELECT * FROM `recsys-dev-gonzo-2.recsys_dataset.recsys_rankings`[0m
[32m2025-02-28 11:54:24.758[0m | [1mINFO    [0m | [36mrecsys.gcp.bigquery.client[0m:[36mfetch_feature_view_data[0m:[36m201[0m - [1mDataFrame shape: (40752, 15)[0m
[32m2025-02-28 11:54:24.759[0m | [1mINFO    [0m | [36mrecsys.gcp.bigquery.client[0m:[36mfetch_feature_view_data[0m:[36m185[0m - [1mFetching data from feature view: transactions[0m
[32m2025-02-28 11:54:25.058[0m | [1mINFO    [0m | [36mrecsys.gcp.bigquery.client[0m:[36mfetch_feature_view_data[0m:[36m198[0m - [1mExecuting query: SELECT * FROM `recsys-dev-gonzo-2.recsys_dataset.recsys_t

In [6]:
# Attach the month_sin / month_cos columns from the transactions table to the
# ranking rows, matching on customer_id (left join keeps every ranking row).
# NOTE(review): the logs show 40,752 ranking rows after the fetch but ~3.36M rows
# at split time, which suggests this join fans out on repeated customer_id
# values in trans_df — confirm that this duplication is intended.
month_features = trans_df.select(["customer_id", "month_sin", "month_cos"])
rankings_df = rankings_df.join(month_features, on="customer_id", how="left")

In [7]:
# Add the article colour group to each ranking row, matching on article_id.
# NOTE(review): rankings_df already carries a colour_group_name column, so the
# joined copy shows up as colour_group_name_right in the X_train preview —
# confirm whether a different article column was meant here.
article_colours = articles_df.select(["article_id", "colour_group_name"])
rankings_df = rankings_df.join(article_colours, on="article_id", how="left")

In [8]:
# Remove the two join-key columns from the dataset before splitting.
rankings_df = rankings_df.drop("customer_id", "article_id")

In [9]:
# Split features and labels into training and validation sets; the validation
# fraction comes from project settings (the log reports a random 90/10 split).
validation_fraction = settings.RANKING_DATASET_VALIDATION_SPLIT_SIZE
X_train, X_val, y_train, y_val = train_test_split(
    df=rankings_df, test_size=validation_fraction
)

[32m2025-02-28 11:54:58.393[0m | [1mINFO    [0m | [36mrecsys.data.preprocessing.splitting[0m:[36mtrain_test_split[0m:[36m134[0m - [1mRandom split: train=3021740 rows (90.0%), test=335415 rows (10.0%)[0m


In [12]:
# Preview the first three feature rows of the training split.
X_train.head(3)

age,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name,month_sin,month_cos,colour_group_name_right
f64,str,str,str,str,str,str,str,str,str,str,str,f64,f64,str
71.0,"""Trousers""","""Garment Lower body""","""Denim""","""Light Blue""","""Dusty Light""","""Blue""","""AK Bottoms""","""Divided""","""Divided""","""Divided Asia keys""","""Trousers""",0.5,0.866025,"""Light Blue"""
71.0,"""Trousers""","""Garment Lower body""","""Denim""","""Light Blue""","""Dusty Light""","""Blue""","""AK Bottoms""","""Divided""","""Divided""","""Divided Asia keys""","""Trousers""",0.5,0.866025,"""Light Blue"""
71.0,"""Trousers""","""Garment Lower body""","""Denim""","""Light Blue""","""Dusty Light""","""Blue""","""AK Bottoms""","""Divided""","""Divided""","""Divided Asia keys""","""Trousers""",0.5,0.866025,"""Light Blue"""


In [11]:
# Preview the first three labels of the training split.
y_train.head(3)

label
i64
1
1
1


# Training the ranking model

Let's train the ranking model:

In [12]:
# Build the ranking model through the project factory (called with no arguments).
# The same object is later wrapped by GCPRankingModel for the upload step.
model = RankingModelFactory.build()

In [None]:
# Wrap the model and both data splits in a trainer; the following cells call
# fit(), find_optimal_threshold(), evaluate() and get_feature_importance() on it.
# NOTE(review): the keyword names assume RankingModelTrainer accepts
# (model, train_dataset, eval_dataset) with (features, labels) tuples —
# confirm against recsys.core.models.two_tower.ranking.
trainer = RankingModelTrainer(
    model=model,
    train_dataset=(X_train, y_train),
    eval_dataset=(X_val, y_val),
)

In [None]:
# Run training through the trainer.
trainer.fit()

In [None]:
# Ask the trainer for a decision threshold, passing step=0.05. The call returns
# two values, unpacked here as the selected threshold and its metrics
# (presumably the step is the spacing between candidate thresholds — confirm).
optimal_threshold, best_metrics = trainer.find_optimal_threshold(step=0.05)

## Evaluating the ranking model

Next, you'll evaluate how well the model performs on the validation data using classification metrics such as precision, recall and F1-score:

In [None]:
# Evaluate using the threshold selected in the previous step.
# log=True is forwarded to the trainer (presumably enables metric logging — confirm).
metrics = trainer.evaluate(log=True, threshold=optimal_threshold)

In [None]:
# Display the metrics returned by trainer.evaluate().
metrics

It can be seen that the model has a low F1-score on the positive class (higher is better). The performance could potentially be improved by adding more features to the dataset, e.g. image embeddings.

Let's see which features your model considers important.

In [None]:
# Show the feature importances reported by the trainer.
trainer.get_feature_importance()

# 🗄️ Upload the ranking model to the Vertex AI Model Registry

In [19]:
# Wrap the model object in the project's GCP helper; the upload call below is
# made through this wrapper.
ranking_model_gcp = GCPRankingModel(model=model)

In [None]:
# Upload the wrapped model to Vertex AI under a fixed name and description.
# NOTE(review): the serving image uses the `latest` tag, so the container
# resolved at upload time can change between runs — consider pinning a version.
SERVING_IMAGE_URI = "us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.2-1:latest"

ranking_model_vertex = ranking_model_gcp.upload_to_vertex_ai(
    model_name="ranking_model_v1",
    description="Ranking model for two-tower recommender system",
    serving_container_image_uri=SERVING_IMAGE_URI,
)

In [21]:
# Optional step, currently disabled: deploy the uploaded model to an endpoint.
# ranking_endpoint = ranking_model_gcp.deploy_endpoint(
#     model=ranking_model_vertex,
#     endpoint_id="recsys-rankings-model",
# )