In [1]:
import pandas as pd
from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from preprocess import gen_data_set, gen_model_input
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model

from deepmatch.models import *
from deepmatch.utils import sampledsoftmaxloss

## 以movielens数据为例，取200条样例数据进行流程演示

In [2]:
# Load the MovieLens sample ratings (path is relative to the notebook's working directory).
# The original line read `data = pd.read_csvdata = pd.read_csv(...)`, a paste typo that
# also attached a stray `read_csvdata` attribute to the pandas module.
data = pd.read_csv("./data/movielens_sample.txt")
# Categorical columns available in the sample.
sparse_features = ["movie_id", "user_id",
                   "gender", "age", "occupation", "zip"]
SEQ_LEN = 50   # passed to gen_model_input and to the history VarLenSparseFeat below
negsample = 0  # passed to gen_data_set below

In [3]:
# Preview the first rows of the loaded sample to check the available columns.
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067


In [4]:
# Print the row count, per-column non-null counts and dtypes of the sample.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233 entries, 0 to 232
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     233 non-null    int64 
 1   movie_id    233 non-null    int64 
 2   rating      233 non-null    int64 
 3   timestamp   233 non-null    int64 
 4   title       233 non-null    object
 5   genres      233 non-null    object
 6   gender      233 non-null    object
 7   age         233 non-null    int64 
 8   occupation  233 non-null    int64 
 9   zip         233 non-null    int64 
dtypes: int64(7), object(3)
memory usage: 18.3+ KB


## 1. 首先对数据中的特征进行ID化编码，然后使用 `gen_data_set` 和 `gen_model_input` 来生成带有用户历史行为序列的特征数据

In [5]:
features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip']

# Re-encode each listed column as integer ids shifted up by one, and remember
# (max id + 1) per column; that number is used later as the embedding vocabulary size.
feature_max_idx = {}
for col in features:
    encoder = LabelEncoder()
    encoded_ids = encoder.fit_transform(data[col]) + 1
    data[col] = encoded_ids
    feature_max_idx[col] = data[col].max() + 1

# User profile: one row per user, indexed by user_id.
user_profile = (
    data[["user_id", "gender", "age", "occupation", "zip"]]
    .drop_duplicates('user_id')
    .set_index("user_id")
)

# Item profile: one row per distinct movie_id.
item_profile = data[["movie_id"]].drop_duplicates('movie_id')

# Per-user list of movie ids.
user_item_list = data.groupby("user_id")['movie_id'].apply(list)

# Build the train/test samples, then turn each into (model inputs, labels).
train_set, test_set = gen_data_set(data, negsample)

train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)

100%|██████████| 3/3 [00:00<00:00, 1771.49it/s]

6 6





In [6]:
# Display the generated training inputs (a dict mapping feature name to an array).
train_model_input

{'user_id': array([2, 2, 2, 2, 2, 2, 3, 2, 1, 3, 2, 1, 2, 1, 1, 3, 2, 2, 2, 2, 2, 2,
        1, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 1, 1, 2, 3, 1, 2, 3, 3, 3, 3,
        1, 3, 2, 2, 3, 3, 2, 3, 2, 3, 2, 2, 2, 3, 3, 3, 2, 3, 3, 2, 2, 3,
        3, 2, 2, 2, 2, 2, 3, 1, 3, 2, 3, 1, 2, 2, 1, 3, 2, 2, 2, 2, 2, 3,
        2, 2, 3, 2, 1, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 2, 2, 3, 3, 3, 2, 2,
        2, 2, 1, 2, 1, 2, 2, 3, 2, 3, 3, 3, 2, 2, 2, 2, 3, 2, 2, 2, 1, 2,
        2, 3, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 3, 3, 1, 2, 1, 2, 2, 1, 2,
        2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 3, 1, 3, 1,
        2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 2, 2, 3, 3, 2, 1, 2, 2, 2, 2,
        3, 3, 1, 2, 2, 2, 2, 1, 1, 3, 1, 2, 1, 2, 1, 2, 3, 2, 2, 1, 1, 2,
        2, 2, 1, 3, 3, 1, 1]),
 'movie_id': array([141,   2,  10, 208, 205,  91,  37,  37, 147, 137,  46, 164, 157,
         55,  66, 168,  42, 106,  24, 127, 171, 103, 150,  79, 111,  17,
        189, 190,  90,  75,  11, 192, 130,  72,  47,   7,  1

## 2. 配置一下模型定义需要的特征列，主要是特征名和embedding词表的大小

In [7]:
embedding_dim = 16

# One SparseFeat per single-valued user profile feature, each sized by feature_max_idx.
user_feature_columns = [
    SparseFeat(name, feature_max_idx[name], embedding_dim)
    for name in ('user_id', "gender", "age", "occupation", "zip")
]
# Movie-history sequence feature: it reuses the "movie_id" embedding via embedding_name.
# The trailing positional arguments are SEQ_LEN, 'mean' and 'hist_len'
# (max length, combiner and length-input name in deepctr's VarLenSparseFeat).
user_feature_columns.append(
    VarLenSparseFeat(
        SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
                   embedding_name="movie_id"),
        SEQ_LEN, 'mean', 'hist_len',
    )
)

# The item side only has the movie id itself.
item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]

In [8]:
# Inspect the user-side feature columns (name, vocabulary size, embedding dim, ...).
user_feature_columns

[SparseFeat(name='user_id', vocabulary_size=4, embedding_dim=16, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7f354b49fc40>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='gender', vocabulary_size=3, embedding_dim=16, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7f354b49ff70>, embedding_name='gender', group_name='default_group', trainable=True),
 SparseFeat(name='age', vocabulary_size=4, embedding_dim=16, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7f354b49f7f0>, embedding_name='age', group_name='default_group', trainable=True),
 SparseFeat(name='occupation', vocabulary_size=4, embedding_dim=16, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializer

## 3. 定义一个YoutubeDNN模型，分别传入用户侧特征列表`user_feature_columns`和物品侧特征列表`item_feature_columns`。然后配置优化器和损失函数，开始进行训练。

In [9]:
# Put the Keras backend in training mode before the model is built.
K.set_learning_phase(True)
import tensorflow as tf

# Disable eager execution on TensorFlow 2.x and later.
# The major version is compared numerically: the previous string comparison
# (`tf.__version__ >= '2.0.0'`) is lexicographic and would be False for e.g. '10.0.0'.
if int(tf.__version__.split('.')[0]) >= 2:
    tf.compat.v1.disable_eager_execution()

# Two-tower YoutubeDNN built from the user-side and item-side feature columns.
model = YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=5, user_dnn_hidden_units=(64, 16))
# Alternative model (MIND), kept for reference:
# model = MIND(user_feature_columns,item_feature_columns,dynamic_k=True,p=1,k_max=2,num_sampled=5,user_dnn_hidden_units=(64,16),init_std=0.001)

# Train with the sampled-softmax loss imported from deepmatch.utils.
model.compile(optimizer="adagrad", loss=sampledsoftmaxloss, experimental_run_tf_function=False)

# A single epoch over the training inputs, with no validation split.
history = model.fit(train_model_input, train_label,
                    batch_size=256, epochs=1, verbose=1, validation_split=0.0)

Instructions for updating:
Simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Train on 227 samples


## 4. Generate user features for testing and full item features for retrieval

In [10]:
# The test-set inputs are fed to the user side; the item side receives every
# distinct movie id from item_profile.
test_user_model_input = test_model_input
all_item_model_input = dict(movie_id=item_profile['movie_id'].values)

# Sub-models mapping the trained model's user/item inputs to its user/item embeddings.
user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

# Compute the embeddings in batches of 4096 (= 2 ** 12).
user_embs = user_embedding_model.predict(test_user_model_input, batch_size=4096)
# user_embs = user_embs[:, i, :]  # i in [0,k_max) if MIND
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=4096)

# Show the shape, then the values, of the user embeddings.
print(user_embs.shape)
user_embs

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
(3, 16)


array([[-1.42878300e-04, -2.90917342e-05,  5.11118524e-05,
         2.78099560e-05,  1.21410012e-04,  4.24649261e-05,
         7.85671582e-05,  7.21368269e-05,  3.23963613e-05,
        -1.87562582e-05, -1.69102932e-04, -1.20822137e-04,
         1.70499668e-04, -1.02522172e-04,  8.23799783e-05,
        -1.01446785e-05],
       [ 3.77329307e-05,  2.10743765e-05,  6.51213704e-05,
         2.44242674e-05,  4.09079912e-05,  1.90657447e-05,
         8.69532596e-05, -5.24726929e-05,  2.47771709e-06,
        -6.45783584e-05,  6.87255815e-05,  3.87508808e-05,
         1.55657690e-04, -1.09456290e-04,  1.91140789e-05,
         4.37624039e-05],
       [ 6.55593103e-05,  6.00543535e-05,  9.22585173e-07,
        -7.92923529e-05,  5.48820135e-05,  2.55651648e-05,
         1.08859722e-05,  7.00186501e-05, -1.22338985e-04,
        -4.98426416e-05, -7.24236670e-05, -5.57290014e-06,
         1.19872311e-04, -5.07092191e-05,  8.66251357e-05,
         1.51961256e-04]], dtype=float32)

In [13]:
import numpy as np
# L2 norm of each row of user_embs; keepdims=True keeps a trailing axis of size 1
# so the result can divide user_embs row-wise.
np.linalg.norm(user_embs, axis=1, keepdims=True)

array([[0.00037899],
       [0.00026067],
       [0.00030519]], dtype=float32)

In [14]:
# Scale every row of user_embs to unit L2 norm (row-wise division by its own norm).
# NOTE(review): a row with zero norm would produce NaN/inf here; confirm this cannot occur.
user_embs = user_embs / np.linalg.norm(user_embs, axis=1, keepdims=True)
user_embs

array([[-0.37699947, -0.07676161,  0.13486402,  0.0733795 ,  0.32035312,
         0.11204819,  0.20730773,  0.19034062,  0.08548122, -0.04949037,
        -0.44619593, -0.31880194,  0.44988135, -0.27051556,  0.21736826,
        -0.0267678 ],
       [ 0.14475423,  0.08084729,  0.24982406,  0.09369843,  0.15693466,
         0.07314161,  0.3335774 , -0.20130014,  0.00950523, -0.24774092,
         0.26365083,  0.14865938,  0.5971471 , -0.4199054 ,  0.07332703,
         0.167885  ],
       [ 0.21481155,  0.19677402,  0.00302294, -0.25980923,  0.17982633,
         0.08376679,  0.03566896,  0.22942302, -0.40085578, -0.16331434,
        -0.2373033 , -0.01826016,  0.39277348, -0.16615376,  0.2838358 ,
         0.49791607]], dtype=float32)

In [15]:
# Sanity check: after normalisation the first user vector should have
# (approximately) unit L2 norm. Squares are accumulated element by element.
np.sqrt(sum(component ** 2 for component in user_embs[0]))

0.999999975563176

In [16]:
# Show the shape, then the values, of the item embedding matrix.
print(item_embs.shape)
item_embs

(208, 16)


array([[ 3.61168059e-05, -1.15563525e-04,  9.50523681e-05, ...,
        -6.13575903e-05,  1.92484094e-05, -7.59257673e-05],
       [ 9.92620407e-05,  9.99673139e-05,  7.47564263e-05, ...,
         2.79854867e-04, -1.13261231e-04,  4.59334333e-05],
       [ 1.21779689e-04,  3.84851082e-05,  7.90571285e-05, ...,
         1.18564902e-04,  1.54509089e-05, -5.47535274e-06],
       ...,
       [ 2.83916102e-04, -2.50865094e-04, -2.67599418e-04, ...,
        -8.16918982e-05,  1.55319474e-04,  1.18487726e-04],
       [ 1.37671668e-04,  1.70444968e-04, -3.06352231e-05, ...,
        -9.51940820e-05, -2.41493690e-05,  4.39387040e-05],
       [ 1.12890142e-04,  4.27755658e-05,  1.79081821e-04, ...,
         7.08255102e-05,  1.83037235e-04,  3.75794953e-05]], dtype=float32)

## 5. [Optional] ANN search by faiss and evaluate the result

In [17]:
import numpy as np
import faiss
from tqdm import tqdm
from deepmatch.utils import recall_N

# Ground truth: user id -> single-element list with that user's held-out movie id.
# NOTE(review): assumes each test_set row is laid out as
# (user_id, history, target_movie_id, ...); confirm against preprocess.gen_data_set.
test_true_label = {line[0]: [line[2]] for line in test_set}

# Row i of item_embs belongs to item_ids[i] (both come from item_profile['movie_id']).
item_ids = item_profile['movie_id'].values
user_ids = test_user_model_input['user_id']

# Exact inner-product index over the item embeddings.
# Only user_embs were L2-normalised in an earlier cell; item_embs are indexed as-is,
# so the scores are inner products rather than cosine similarities.
index = faiss.IndexFlatIP(embedding_dim)
index.add(item_embs)

# Never ask for more neighbours than there are items: faiss fills missing results
# with -1, which would silently select the last entry of item_ids.
top_n = 50
top_k = min(top_n, len(item_ids))
D, I = index.search(np.ascontiguousarray(user_embs), top_k)

s = []   # per-user recall@top_n
hit = 0  # number of users with at least one true item among the retrieved ones
for i, uid in enumerate(tqdm(user_ids)):
    true_items = test_true_label.get(uid)
    if true_items is None:
        # Keep the best-effort behaviour for users without a label, but say why the
        # row is skipped instead of hiding every possible error behind a bare except.
        print(f"no ground-truth label for user {uid} (row {i}); skipped")
        continue
    pred = item_ids[I[i]].tolist()
    s.append(recall_N(true_items, pred, N=top_n))
    # Element-wise membership test; the previous `list in list` check only worked
    # by accident through numpy broadcasting and raised for multi-item labels.
    if any(item in pred for item in true_items):
        hit += 1

# Guard the degenerate cases so an empty evaluation reports NaN instead of
# emitting a numpy warning or raising ZeroDivisionError.
print("recall", np.mean(s) if s else float("nan"))
print("hr", hit / len(user_ids) if len(user_ids) else float("nan"))

3it [00:00, 450.39it/s]

recall 0.0
hr 0.0



