# 🧬 Training pipeline: Training ranking model </span>

In this notebook, you will train a ranking model using gradient boosted trees. 

In [21]:
%load_ext autoreload
%autoreload 2

import warnings

warnings.filterwarnings("ignore")


from recsys.gcp_integrations.ranking_serving import GCPRankingModel
from recsys import gcp_integrations, training, utils
from recsys.config import settings

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## ☁️ Connect to Vertex AI Feature Online Store

In [2]:
fos = gcp_integrations.get_feature_store()

[32m2025-02-18 12:01:12.882[0m | [1mINFO    [0m | [36mrecsys.gcp_integrations.feature_store[0m:[36mget_feature_store[0m:[36m23[0m - [1mRetrieving Feature Store from us-central1/recsys-dev-gonzo/recsys_feature_store_dev[0m


## 💿 Create training dataset

In [3]:
trans_fv, articles_fv, customers_f, rankings_fv = gcp_integrations.feature_store.create_retrieval_feature_view(fos)

In [4]:
rankings_df = gcp_integrations.bq_utils.fetch_feature_view_data(feature_view=rankings_fv)
trans_df = gcp_integrations.bq_utils.fetch_feature_view_data(feature_view=trans_fv)
articles_df = gcp_integrations.bq_utils.fetch_feature_view_data(feature_view=articles_fv)

[32m2025-02-18 12:01:17.586[0m | [1mINFO    [0m | [36mrecsys.gcp_integrations.bq_utils[0m:[36mfetch_feature_view_data[0m:[36m162[0m - [1mStarting to fetch data from feature view: rankings[0m
[32m2025-02-18 12:01:18.123[0m | [1mINFO    [0m | [36mrecsys.gcp_integrations.bq_utils[0m:[36mfetch_feature_view_data[0m:[36m176[0m - [1mExecuting query: SELECT * FROM `recsys-dev-gonzo.recsys_dataset.recsys_rankings`[0m
[32m2025-02-18 12:02:13.862[0m | [1mINFO    [0m | [36mrecsys.gcp_integrations.bq_utils[0m:[36mfetch_feature_view_data[0m:[36m180[0m - [1mDataFrame shape: (224136, 14)[0m
[32m2025-02-18 12:02:13.863[0m | [1mINFO    [0m | [36mrecsys.gcp_integrations.bq_utils[0m:[36mfetch_feature_view_data[0m:[36m162[0m - [1mStarting to fetch data from feature view: transactions[0m
[32m2025-02-18 12:02:14.461[0m | [1mINFO    [0m | [36mrecsys.gcp_integrations.bq_utils[0m:[36mfetch_feature_view_data[0m:[36m176[0m - [1mExecuting query: SELECT * F

In [5]:
rankings_df = rankings_df.join(
    trans_df.select(["customer_id", "month_sin", "month_cos"]),
    on='customer_id',
    how='left'
)

In [6]:
rankings_df = rankings_df.rename({"perceived_colour_master_name":"colour_group_name"})
rankings_df.head(3)

customer_id,article_id,age,label,product_type_name,product_group_name,graphical_appearance_name,perceived_colour_value_name,colour_group_name,department_name,index_name,index_group_name,section_name,garment_group_name,month_sin,month_cos
str,str,f64,i64,str,str,str,str,str,str,str,str,str,str,f64,f64
"""b4397f1ef360771e411afa9b43ad9a…","""399061008""",66.0,0,"""Jacket""","""Garment Upper body""","""Denim""","""Dusty Light""","""Blue""","""Outwear""","""Ladieswear""","""Ladieswear""","""Womens Jackets""","""Outdoor""",-0.866025,0.5
"""b4397f1ef360771e411afa9b43ad9a…","""399061008""",66.0,0,"""Jacket""","""Garment Upper body""","""Denim""","""Dusty Light""","""Blue""","""Outwear""","""Ladieswear""","""Ladieswear""","""Womens Jackets""","""Outdoor""",1.0,6.1232e-17
"""b4397f1ef360771e411afa9b43ad9a…","""399061008""",66.0,0,"""Jacket""","""Garment Upper body""","""Denim""","""Dusty Light""","""Blue""","""Outwear""","""Ladieswear""","""Ladieswear""","""Womens Jackets""","""Outdoor""",1.0,6.1232e-17


In [7]:
X_train, X_val, y_train, y_val = utils.split_utils.train_test_split(df=rankings_df,
                                                                    test_size=settings.RANKING_DATASET_VALIDATION_SPLIT_SIZE,
                                                                    description="Ranking training dataset")

In [8]:
X_train.head(3)

customer_id,article_id,age,product_type_name,product_group_name,graphical_appearance_name,perceived_colour_value_name,colour_group_name,department_name,index_name,index_group_name,section_name,garment_group_name,month_sin,month_cos
str,str,f64,str,str,str,str,str,str,str,str,str,str,f64,f64
"""95d36d7665d46a9406653e204ea2f1…","""337991001""",52.0,"""Belt""","""Accessories""","""Solid""","""Dark""","""Black""","""Belts""","""Menswear""","""Menswear""","""Men Accessories""","""Accessories""",0.866025,0.5
"""78e1fc883f6bcff9bd576587e46dfa…","""571619001""",44.0,"""Swimsuit""","""Swimwear""","""Solid""","""Dark""","""Black""","""Swimwear""","""Lingeries/Tights""","""Ladieswear""","""Womens Swimwear, beachwear""","""Swimwear""",0.5,-0.866025
"""19ed12b3909e6d4fdb7e858c894d10…","""708258002""",34.0,"""Top""","""Garment Upper body""","""Embroidery""","""Medium Dusty""","""Pink""","""Tops Fancy Jersey""","""Divided""","""Divided""","""Divided Collection""","""Jersey Fancy""",-1.0,-1.837e-16


In [9]:
y_train.head(3)

label
i64
0
0
0


# Training the ranking model

Let's train the ranking model:

In [10]:
model = training.ranking.RankingModelFactory.build()

In [11]:
trainer = training.ranking.RankingModelTrainer(
    model=model,
    train_dataset=(X_train, y_train),
    eval_dataset=(X_val, y_val)
)

In [12]:
trainer.fit()

[0]	validation_0-logloss:0.59661
[1]	validation_0-logloss:0.52974
[2]	validation_0-logloss:0.48144
[3]	validation_0-logloss:0.44632
[4]	validation_0-logloss:0.42036
[5]	validation_0-logloss:0.38680
[6]	validation_0-logloss:0.36753
[7]	validation_0-logloss:0.35567
[8]	validation_0-logloss:0.33483
[9]	validation_0-logloss:0.32158
[10]	validation_0-logloss:0.30754
[11]	validation_0-logloss:0.29944
[12]	validation_0-logloss:0.29390
[13]	validation_0-logloss:0.28837
[14]	validation_0-logloss:0.28351
[15]	validation_0-logloss:0.27890
[16]	validation_0-logloss:0.27286
[17]	validation_0-logloss:0.26489
[18]	validation_0-logloss:0.25976
[19]	validation_0-logloss:0.25459
[20]	validation_0-logloss:0.24786
[21]	validation_0-logloss:0.24331
[22]	validation_0-logloss:0.23974
[23]	validation_0-logloss:0.23294
[24]	validation_0-logloss:0.22871
[25]	validation_0-logloss:0.22667
[26]	validation_0-logloss:0.22535
[27]	validation_0-logloss:0.22088
[28]	validation_0-logloss:0.21614
[29]	validation_0-loglos

## Evaluating the ranking model

Next, you'll evaluate how well the model performs on the validation data using metrics for classification such as precision, recall and f1-score:

In [13]:
metrics = trainer.evaluate(log=True)

[32m2025-02-18 12:04:11.981[0m | [1mINFO    [0m | [36mrecsys.training.ranking[0m:[36mevaluate[0m:[36m111[0m - [1m              precision    recall  f1-score   support

           0       1.00      0.97      0.98   1669305
           1       0.76      0.99      0.86    166480

    accuracy                           0.97   1835785
   macro avg       0.88      0.98      0.92   1835785
weighted avg       0.98      0.97      0.97   1835785
[0m


It can be seen that the model has a low F1-score on the positive class (higher is better). The performance could potentially be improved by adding more features to the dataset, e.g. image embeddings.

Let's see which features your model considers important.

In [14]:
trainer.get_feature_importance()

{'customer_id': 0.31630173,
 'age': 0.20964463,
 'index_group_name': 0.06936192,
 'index_name': 0.05168445,
 'month_sin': 0.041389048,
 'section_name': 0.03679398,
 'product_group_name': 0.035410024,
 'month_cos': 0.033701986,
 'garment_group_name': 0.033644374,
 'department_name': 0.03175092,
 'graphical_appearance_name': 0.030610219,
 'perceived_colour_value_name': 0.028639892,
 'colour_group_name': 0.028021397,
 'product_type_name': 0.027263217,
 'article_id': 0.025782196}

# 🗄️ Upload models to Vertex AI model registry

In [17]:
ranking_model_gcp = GCPRankingModel(model=model)

In [18]:
ranking_local_path = ranking_model_gcp.save_to_local("ranking_model")

In [22]:
ranking_model_vertex = ranking_model_gcp.upload_to_vertex_ai(
    model_name="ranking_model_v1",
    description="Ranking model for two-tower recommender system",
    serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.2-1:latest"
)

[32m2025-02-18 12:10:59.464[0m | [1mINFO    [0m | [36mrecsys.gcp_integrations.model_registry[0m:[36mupload_model_to_registry[0m:[36m25[0m - [1mSaving model into: /tmp/ranking_model_v1[0m
[32m2025-02-18 12:10:59.472[0m | [1mINFO    [0m | [36mrecsys.gcp_integrations.model_registry[0m:[36mupload_model_to_registry[0m:[36m44[0m - [1mUploading model to ranking_model_v1 to Vertex AI[0m


Creating Model
Create Model backing LRO: projects/99924275580/locations/us-central1/models/7845320028902653952/operations/5193890516395098112
Model created. Resource name: projects/99924275580/locations/us-central1/models/7845320028902653952@1
To use this Model in another session:
model = aiplatform.Model('projects/99924275580/locations/us-central1/models/7845320028902653952@1')


[32m2025-02-18 12:11:04.898[0m | [1mINFO    [0m | [36mrecsys.gcp_integrations.model_registry[0m:[36mupload_model_to_registry[0m:[36m54[0m - [1mModel uploaded with resource name: projects/99924275580/locations/us-central1/models/7845320028902653952[0m


In [23]:
rankind_endpoint = ranking_model_gcp.deploy_endpoint(
    model=ranking_model_vertex,
    endpoint_id="recsys-rankings-model",
)

[32m2025-02-18 12:11:10.302[0m | [1mINFO    [0m | [36mrecsys.gcp_integrations.two_tower_serving[0m:[36mdeploy_endpoint[0m:[36m70[0m - [1mFound existing endpoint: recsys-rankings-model[0m


Deploying model to Endpoint : projects/99924275580/locations/us-central1/endpoints/recsys-rankings-model
Deploy Endpoint model backing LRO: projects/99924275580/locations/us-central1/endpoints/recsys-rankings-model/operations/3807907731071827968


KeyboardInterrupt: 