# TensorFlow Recommenders (TFRS) を用いた予測モデル作成

・tfrsはユーザーが商品を購入する履歴を利用してレコメンド結果を出力するシステムのモデル構築用ライブラリ

・TFRSにはtfrs.tasks.Retrievalやtfrs.metrics.FactorizedTopKのようなモジュールが一通り用意されており、データ準備、モデル定式化、トレーニング、評価、展開までワンストップで行える。

・ScaNN（近似最近傍探索ライブラリ。TFRSからは tfrs.layers.factorized_top_k.ScaNN として利用できる）は、与えられたクエリに対してスコア上位の候補を高速に取得するために使用できる。

・今回は、これらのライブラリを用いて、12個のレコメンデーションを取得する。

・使用データはcustomer_id, article_idのみとする。（顧客または商品のメタデータを入れた方が精度は良くなるかもしれないが、初回実装はシンプルに書く）

In [1]:
!pip install -q tensorflow-recommenders
!pip install -q scann

[K     |████████████████████████████████| 85 kB 717 kB/s 
[K     |████████████████████████████████| 11.2 MB 5.0 MB/s 
[K     |████████████████████████████████| 511.7 MB 5.7 kB/s 
[K     |████████████████████████████████| 438 kB 71.6 MB/s 
[K     |████████████████████████████████| 1.6 MB 53.1 MB/s 
[K     |████████████████████████████████| 5.8 MB 46.4 MB/s 
[?25h

In [2]:
# インポート
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs
from pathlib import Path
from typing import Dict, Text

In [3]:
# Run on Google Colab.
# The CSV files must be uploaded to Google Drive -- files placed on Colab itself disappear every time.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## データセットの準備

In [4]:
# Load the transaction log. article_id is read as a string so that it can be
# zero-padded below rather than parsed as a number.
transactions = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/transactions_rakus_train.csv", dtype = {'article_id':str})

# Keep only rows dated 2020-03-01 or later (t_dat is compared against a string literal).
# .copy() makes the filtered frame independent of the full table, so the
# column assignment below never targets a slice of another frame.
transactions = transactions[transactions['t_dat'] >= '2020-03-01'].copy()

# Left-pad article_id with '0' to a width of 10 characters.
# The vectorized .str accessor replaces the per-row lambda and leaves missing
# values as NaN instead of raising on them.
transactions['article_id'] = transactions['article_id'].str.zfill(10)
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
22886405,2020-03-01,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,751628002,0.022017,1
22886406,2020-03-01,0008968c0d451dbc5a9968da03196fe20051965edde741...,675662028,0.035576,2
22886407,2020-03-01,001127bffdda108579e6cb16080440e89bf1250a776c6e...,821152004,0.025407,2
22886408,2020-03-01,001127bffdda108579e6cb16080440e89bf1250a776c6e...,860738001,0.025407,2
22886409,2020-03-01,00117f79ce61af038e143ee26448e8401fdbff51f48d5a...,822957002,0.045746,2


In [5]:
# Load the customer table and display its first rows for inspection.
customers = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/customers_rakus.csv")
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [6]:
# Load the article table. article_id is read as a string so that it can be
# zero-padded below rather than parsed as a number.
articles = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/articles.csv", dtype = {'article_id':str})

# Left-pad article_id with '0' to a width of 10 characters.
# The vectorized .str accessor replaces the per-row lambda and leaves missing
# values as NaN instead of raising on them.
articles['article_id'] = articles['article_id'].str.zfill(10)
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [7]:
# Data generation: ID vocabularies and the candidate dataset.
# Distinct IDs taken from the two tables.
unique_article_ids = articles['article_id'].unique()
unique_customer_ids = customers['customer_id'].unique()

# One element per row of `articles`, each a dict holding only 'article_id' ...
article_ds = tf.data.Dataset.from_tensor_slices({'article_id': articles['article_id']})
# ... then unwrapped into a dataset of bare article_id values.
articles_var = article_ds.map(lambda row: row['article_id'])

## モデルの構築

In [8]:
# Width of every embedding vector produced by the model.
embedding_dimension = 64

# Query model: turns a customer_id into an embedding vector.
customer_model = tf.keras.Sequential()
# Map the categorical string to an integer index (encoding it so the
# Embedding layer can consume it).
customer_model.add(
    tf.keras.layers.StringLookup(vocabulary=unique_customer_ids, mask_token=None)
)
# Map the integer index to a vector. The table has one row more than the
# vocabulary (presumably for StringLookup's out-of-vocabulary index).
customer_model.add(
    tf.keras.layers.Embedding(
        input_dim=len(unique_customer_ids) + 1,
        output_dim=embedding_dimension,
    )
)

In [9]:
# Candidate model: turns an article_id into an embedding vector.
article_model = tf.keras.Sequential()
# Map the article_id string to an integer index.
article_model.add(
    tf.keras.layers.StringLookup(vocabulary=unique_article_ids, mask_token=None)
)
# Map the integer index to a vector. The table has one row more than the
# vocabulary (presumably for StringLookup's out-of-vocabulary index).
article_model.add(
    tf.keras.layers.Embedding(
        input_dim=len(unique_article_ids) + 1,
        output_dim=embedding_dimension,
    )
)

In [10]:
# Definition of the training / prediction model.
# Retrieval model: combines customer_id and article_id embeddings to find
# the pairs with high affinity.
class HandMModel(tfrs.Model):
    """Retrieval model built from a customer (query) and an article (candidate) model.

    Args:
        customer_model: Model applied to ``features["customer_id"]``.
        article_model: Model applied to ``features["article_id"]``.
        candidates: Optional dataset of article ids used to build the
            FactorizedTopK metric. When omitted, the notebook-level
            ``articles_var`` dataset is used, which is the previous behaviour.
    """

    def __init__(self, customer_model, article_model, candidates=None):
        super().__init__()
        self.article_model: tf.keras.Model = article_model
        self.customer_model: tf.keras.Model = customer_model
        # Allow the candidate ids to be passed explicitly so that the model
        # does not depend on whatever the global `articles_var` happens to be
        # bound to when the constructor runs.
        if candidates is None:
            candidates = articles_var
        # FactorizedTopK measures how "close" query and candidate embeddings
        # are; its candidates are the article ids, batched and passed through
        # the article model.
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=candidates.batch(128).map(self.article_model),
            ),
        )

    def compute_loss(self, features: Dict[str, tf.Tensor], training=False) -> tf.Tensor:
        """Compute the loss (and, outside training, the metrics) for one batch.

        Args:
            features: Mapping with the keys ``"customer_id"`` and ``"article_id"``.
            training: When True, metric computation is skipped.

        Returns:
            The value returned by the retrieval task.
        """
        customer_embeddings = self.customer_model(features["customer_id"])
        article_embeddings = self.article_model(features["article_id"])

        return self.task(customer_embeddings, article_embeddings, compute_metrics=not training)

## 訓練・検証

In [11]:
# Create the model instance and attach an Adagrad optimizer.
model = HandMModel(
    customer_model=customer_model,
    article_model=article_model,
)
model.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1),
)

In [12]:
# Split into training and validation sets by transaction date.
train = transactions[transactions['t_dat'] < '2020-08-31'] # March-August data
test = transactions[transactions['t_dat'] >= '2020-08-31'] # intended as the final week (depends on where the data ends)

# Shuffle and batch the training data.
# No .cache() after .shuffle(): caching there stores the first epoch's
# shuffled batches, so every later epoch would replay the same order.
# The source is already an in-memory tensor, so prefetch is used instead.
# NOTE(review): a 100_000-element buffer only shuffles locally if the rows
# are ordered by date -- confirm whether a larger buffer is needed.
train_ds = (
    tf.data.Dataset.from_tensor_slices(dict(train[['customer_id','article_id']]))
    .shuffle(100_000)
    .batch(256)
    .prefetch(tf.data.AUTOTUNE)
)
# The validation data is not shuffled, so caching its batches is safe.
test_ds = tf.data.Dataset.from_tensor_slices(dict(test[['customer_id','article_id']])).batch(256).cache()

# Number of epochs
num_epochs = 5

# Deliberately disabled: the validation run is kept as a bare string literal,
# so the cell only echoes the text instead of calling fit().
'''

history = model.fit(
    train_ds, 
    validation_data = test_ds,
    validation_freq=5,
    epochs=num_epochs,
    verbose=1)

'''


'\n\nhistory = model.fit(\n    train_ds, \n    validation_data = test_ds,\n    validation_freq=5,\n    epochs=num_epochs,\n    verbose=1)\n\n'

**A word on metrics:**

Calculating the factorized top-K metric is very time-consuming. Even with the option `compute_metrics=not training` and
validation metrics computed only every 5 epochs, it still takes a long time. You can confirm this by running the model above. Another option
may be to reduce the number of retrieved candidates from the default of 100 (possibly at the cost of accuracy):

self.task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(
        candidates=articles_var.batch(128).map(self.article_model),
        k=<any value less than 100>,  # the keyword may differ (e.g. `ks`) depending on the installed TFRS version
    )
)

## 予測の実行と結果出力

In [13]:
# Train on all transactions, without a validation set.
# No .cache() after .shuffle(): caching there stores the first epoch's
# shuffled batches, so every later epoch would replay the same order.
# The source is already an in-memory tensor, so prefetch is used instead.
train_ds = (
    tf.data.Dataset.from_tensor_slices(dict(transactions[['customer_id','article_id']]))
    .shuffle(100_000)
    .batch(256)
    .prefetch(tf.data.AUTOTUNE)
)
num_epochs = 5
history = model.fit(
    train_ds,
    epochs = num_epochs,
    verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
# ScaNN retrieval layer: queries go through the customer model and the
# top 12 candidates are returned.
scann_index = tfrs.layers.factorized_top_k.ScaNN(model.customer_model, k=12)
# Index (article_id, article embedding) pairs, 100 articles per batch.
scann_index.index_from_dataset(
    articles_var.batch(100).map(lambda ids: (ids, model.article_model(ids)))
)

<tensorflow_recommenders.layers.factorized_top_k.ScaNN at 0x7f234132b190>

In [15]:
# Load the submission template; its customer_id column lists the customers to score.
sub = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sample_submission_rakus_latest.csv')
# Query the index for every customer. The result gets its own name: binding it
# to `articles_var` would overwrite the candidate dataset that the model and
# index-building cells read, breaking any re-run of those cells.
_, top_article_ids = scann_index(sub.customer_id.values)
preds = top_article_ids.numpy().astype(str)
# One space-separated string of article ids per customer. A plain list is
# assigned by position, so the result does not depend on the index of `sub`.
sub['prediction'] = [' '.join(row) for row in preds]
sub.to_csv('submission.csv', index = False)

### 参考

tfrsの解説
https://www.tensorflow.org/recommenders/examples/basic_retrieval

tfrs.layers.factorized_top_kの解説
https://www.tensorflow.org/recommenders/api_docs/python/tfrs/layers/factorized_top_k

kaggle記事
https://www.kaggle.com/code/viji1609/h-m-basic-retrieval-model-tf-recommender