# LFM

In [None]:
import os
import pickle
from pathlib import Path
import pandas as pd
from lightfm import LightFM
from scipy import sparse

# Dataset root directory; the trained LightFM models are written to <data_dir>/lfm.
data_dir = '/home/xuming/workspace/h-and-m-personalized-fashion-recommendations'
os.makedirs(f"{data_dir}/lfm", exist_ok=True)

# Keyword arguments passed to the LightFM constructor (no_components is added per call).
LIGHTFM_PARAMS = {
    'learning_schedule': 'adadelta', # optimizer, see https://zh.d2l.ai/chapter_optimization/adadelta.html
    'loss': 'bpr', # BPR loss, see https://blog.csdn.net/qq_35541614/article/details/103816504
    # NOTE(review): the LightFM docs describe learning_rate as the initial rate of the
    # adagrad schedule; with 'adadelta' it may have no effect — confirm.
    'learning_rate': 0.005, # learning rate
    'random_state': 42, # random seed
}

EPOCHS = 100 # number of training epochs passed to LightFM.fit

def user_item(week: int, dim: int):
    """
    Train a LightFM model on a binary user-item matrix and pickle it.

    The interactions are the distinct (user, item) pairs of the transaction
    rows whose `week` column is greater than or equal to the `week` argument.
    The fitted model is written to
    `<data_dir>/lfm/lfm_i_i_week<week>_dim<dim>_model.pkl`.

    Args:
        week: Inclusive lower bound applied to the `week` column of the
            transactions table.
        dim: Number of latent components (`no_components`) of the model.
    """
    # NOTE(review): the file name says "i_i" although the matrix is user x item;
    # left unchanged here — check what loads these files before renaming.
    path_prefix = f"{data_dir}/lfm/lfm_i_i_week{week}_dim{dim}"
    print(path_prefix)

    transactions = pd.read_pickle(f"{data_dir}/transactions_train.pkl")
    users = pd.read_pickle(f"{data_dir}/users.pkl")
    items = pd.read_pickle(f"{data_dir}/items.pkl")

    # In the query, `@week` is this function's argument and the bare `week`
    # is the DataFrame column.
    # NOTE(review): presumably week 0 is the most recent week, so this filter
    # drops the newest `week` weeks — confirm against the preprocessing step.
    selected = transactions.query("@week <= week")
    pairs = selected[['user', 'item']].drop_duplicates(ignore_index=True)

    # One row per entry of `users`, one column per entry of `items`; every
    # observed pair is marked with 1.
    # Assumes `user` / `item` hold integer positions valid for this shape — TODO confirm.
    interactions = sparse.lil_matrix((len(users), len(items)))
    interactions[pairs['user'], pairs['item']] = 1

    # Shared hyper-parameters plus the requested embedding size.
    model = LightFM(**{**LIGHTFM_PARAMS, 'no_components': dim})
    model.fit(interactions, epochs=EPOCHS, num_threads=4, verbose=True)

    # Persist the fitted model.
    with open(f"{path_prefix}_model.pkl", 'wb') as f:
        pickle.dump(model, f)

# Train and save one 16-component model for each week value 1..13.
for week in range(1, 14):
    user_item(week, 16)

# user_features

In [None]:
import os
from pathlib import Path
import pandas as pd
import vaex # 类似pandas的高性能库

# Dataset root directory; the user feature files are written to <data_dir>/user_features.
data_dir = '/home/xuming/workspace/h-and-m-personalized-fashion-recommendations'
os.makedirs(f"{data_dir}/user_features", exist_ok=True)

def create_user_ohe_agg(week):
    """
    Save, per item attribute, each user's mean one-hot encoded attribute values.

    For every column of the items table whose name ends in `_idx`, the column
    is one-hot encoded and joined onto the (user, item) pairs of the
    transaction rows whose `week` column is >= the `week` argument. The `item`
    column is dropped and the rest is aggregated with `mean` per user. That
    result is left-joined onto the user ids, every column except `user` gets
    the prefix `user_ohe_agg_`, the rows are sorted by `user`, and the frame is
    pickled to `<data_dir>/user_features/user_ohe_agg_week<week>_<column>.pkl`.
    The path of each written file is printed.

    Args:
        week: Inclusive lower bound applied to the `week` column of the
            transactions table.
    """
    transactions = pd.read_pickle(f'{data_dir}/transactions_train.pkl')[['user', 'item', 'week']]
    user_frame = pd.read_pickle(f'{data_dir}/users.pkl')
    items = pd.read_pickle(f'{data_dir}/items.pkl')

    # In the query, `@week` is this function's argument and the bare `week`
    # is the DataFrame column.
    filtered = transactions.query("week >= @week")
    pairs = vaex.from_pandas(filtered[['user', 'item']])

    # Item attribute columns to encode (those named `*_idx`).
    encoded_columns = [name for name in items.columns if name.endswith('_idx')]
    for c in encoded_columns:
        # One-hot encode this attribute and attach it to every transaction pair.
        one_hot = pd.get_dummies(items[['item', c]], columns=[c])
        per_transaction = pairs.join(vaex.from_pandas(one_hot), on='item').drop(columns='item')

        # Average the indicator columns over each user's transactions.
        per_user = per_transaction.groupby('user').agg(['mean'])

        # Only the `user` column of the previous frame is reused, so nothing
        # else is carried over from one attribute to the next.
        user_ids = vaex.from_pandas(user_frame[['user']])
        user_frame = user_ids.join(per_user, on='user', how='left').to_pandas_df()

        # Prefix every feature column; the key column keeps its name.
        renamed = {col: f'user_ohe_agg_{col}' for col in user_frame.columns if col != 'user'}
        user_frame = user_frame.rename(columns=renamed)

        user_frame = user_frame.sort_values(by='user').reset_index(drop=True)

        save_path = f'{data_dir}/user_features/user_ohe_agg_week{week}_{c}.pkl'
        user_frame.to_pickle(save_path)
        print("saved", save_path)


# Build and save the user aggregate features for each week value 0..13.
for week in range(14):
    create_user_ohe_agg(week)
