In [1]:
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data as data
from tqdm import tqdm
from utils import Interactions
import os
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score

# Whether to use implicit feedback: when True, getDataLoader binarizes ratings
# into 0/1 labels and LFM.fit reports AUC instead of MSE.
# (The name is spelled IMPLICT everywhere in this notebook.)
IMPLICT=True
# Whether to run on a very small subsample (10%) of the dataset for quick tests
SMALL=False

# for reproducibility
def seed_everything(seed=1234):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), exports ``PYTHONHASHSEED`` and configures
    cuDNN for deterministic behaviour.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the already running
    # interpreter was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not only the current one; silently ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run.
    torch.backends.cudnn.benchmark = False
seed_everything()

# To compute probabilities
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    Args:
        x: real scalar or array-like of logits.

    Returns:
        Element-wise sigmoid of ``x`` as a NumPy scalar or array.
    """
    x = np.asarray(x)
    # log(1 + exp(-x)) computed via logaddexp cannot overflow, whereas a
    # direct np.exp(-x) overflows for very negative logits.
    return np.exp(-np.logaddexp(0, -x))

def getDataLoader(data_path, batch_size=2048):
    """Read a ratings table and wrap an 80/20 random split in DataLoaders.

    The file is read with ``pd.read_table`` using the columns ``user_id``,
    ``item_id``, ``rating`` and ``timestamp``. When ``SMALL`` is set only a
    10% sample is kept; when ``IMPLICT`` is set ratings become 1.0 for
    ``rating >= 5`` and 0.0 otherwise. User and item ids are re-encoded as
    consecutive integers starting at 0.

    Args:
        data_path: path of the ratings file.
        batch_size: batch size used for both loaders.

    Returns:
        ``((n_users, n_items), loaders)`` where ``loaders`` maps ``'train'``
        (shuffled) and ``'valid'`` (not shuffled) to DataLoaders.
    """
    columns = ['user_id', 'item_id', 'rating', 'timestamp']
    data_df = pd.read_table(data_path, names=columns)

    # Optional sub-sampling and label binarization, driven by notebook flags
    if SMALL:
        data_df = data_df.sample(n=int(len(data_df) * 0.1), replace=False)
    if IMPLICT:
        data_df['rating'] = (data_df['rating'] >= 5).astype(np.float32)

    # Map raw ids onto 0..n-1 so they can index embedding tables directly
    encoder = preprocessing.LabelEncoder()
    for id_column in ('user_id', 'item_id'):
        data_df[id_column] = encoder.fit_transform(data_df[id_column])

    # Random 80/20 split; the test part is whatever was not drawn for training
    train_size = int(len(data_df) * 0.8)
    df_train = data_df.sample(n=train_size, replace=False)
    df_test = data_df.drop(df_train.index, axis=0)

    # Ids are zero-based, hence the +1
    n_users = data_df['user_id'].max() + 1
    n_items = data_df['item_id'].max() + 1
    print("Initialize end.The user number is:%d,item number is:%d" % (n_users, n_items))

    loaders = {
        'train': data.DataLoader(
            Interactions(df_train), batch_size=batch_size, shuffle=True),
        'valid': data.DataLoader(
            Interactions(df_test), batch_size=batch_size, shuffle=False),
    }
    return (n_users, n_items), loaders

#
class LFM(torch.nn.Module):
    """Latent factor model: biased matrix factorization trained with SGD.

    The predicted score of a (user, item) pair is
    ``user_bias + item_bias + dot(user_embedding, item_embedding)``.

    Args:
        n_users: number of rows of the user embedding tables.
        n_items: number of rows of the item embedding tables.
        n_factors: dimensionality of the latent factors.
        lr: learning rate of the SGD optimizer.
        weight_decay: weight decay of the SGD optimizer.
        sparse: forwarded to every ``nn.Embedding``.
        topn: number of items recommended per user during evaluation.
        device: device holding the parameters; inputs are moved to it.
    """

    def __init__(self, n_users, n_items, n_factors=10, lr=0.01, weight_decay=0.01, sparse=False,topn=10, device=torch.device("cpu")):
        super(LFM, self).__init__()

        # Callers may pass NumPy integers; store plain ints so the counts can
        # be used safely as tensor sizes later on.
        self.n_users = int(n_users)
        self.n_items = int(n_items)
        self.device = device
        self.topn = topn

        # get factor number
        self.n_factors = n_factors
        # NOTE(review): nn.Embedding weights are drawn from N(0, 1) by default;
        # a smaller initial scale may suit 0/1 targets better - confirm before
        # changing, since it alters training results.
        self.user_biases = nn.Embedding(self.n_users, 1, sparse=sparse)
        self.item_biases = nn.Embedding(self.n_items, 1, sparse=sparse)
        self.user_embeddings = nn.Embedding(self.n_users, self.n_factors, sparse=sparse)
        self.item_embeddings = nn.Embedding(self.n_items, self.n_factors, sparse=sparse)

        # Move the parameters first, then build the optimizer on top of them,
        # as recommended by the torch.optim documentation.
        self.to(self.device)
        self.optimizer = torch.optim.SGD(self.parameters(),
                                         lr=lr, weight_decay=weight_decay)

    def forward(self, users, items):
        """Score aligned (user, item) index pairs.

        Args:
            users: 1-D long tensor of user indices.
            items: 1-D long tensor of item indices, same length as ``users``.

        Returns:
            1-D tensor with one raw (un-squashed) score per pair.
        """
        users = users.to(self.device)
        items = items.to(self.device)

        interaction = (self.user_embeddings(users) * self.item_embeddings(items)).sum(dim=1, keepdim=True)
        preds = self.user_biases(users) + self.item_biases(items) + interaction  # b 1

        return preds.squeeze(1)

    def fit(self, loaders, epochs=5):
        """Train with a summed MSE loss and report metrics after every epoch.

        Args:
            loaders: dict with ``'train'`` and ``'valid'`` DataLoaders yielding
                ``((row, col), val)`` batches. Each loader's ``dataset`` must
                expose ``user_item``, a mapping user -> {item: ...}.
            epochs: number of passes over the training data.

        Prints, per epoch, the mean train/valid loss, AUC (when ``IMPLICT``)
        or MSE on the validation set, and top-N precision/recall/coverage.
        """
        criterion = nn.MSELoss(reduction='sum')

        # training cycle
        for epoch in range(epochs):
            losses = {'train': 0., 'valid': 0.}

            for phase in ['train', 'valid']:
                is_train = phase == 'train'
                self.train(is_train)

                pbar = tqdm(enumerate(loaders[phase]),
                            total=len(loaders[phase]),
                            desc='({0}:{1:^3})'.format(phase, epoch+1))
                for batch_idx, ((row, col), val) in pbar:
                    row = row.long()
                    col = col.long()
                    val = val.float().to(self.device)

                    # Build the autograd graph only when we are going to
                    # back-propagate; validation runs gradient-free.
                    with torch.set_grad_enabled(is_train):
                        preds = self.forward(row, col)
                        loss = criterion(preds, val)
                        if is_train:
                            self.optimizer.zero_grad()
                            loss.backward()
                            self.optimizer.step()

                    loss_value = loss.item()
                    losses[phase] += loss_value
                    batch_loss = loss_value / max(row.size()[0], 1)
                    # Label the running loss with the phase it belongs to.
                    pbar.set_postfix(**{phase + '_loss': batch_loss})

                # Summed loss -> mean per interaction (guard empty datasets)
                losses[phase] /= max(len(loaders[phase].dataset), 1)

            # after each epoch compute ranking quality on the validation set
            self.eval()
            score, epoch_score = self._validation_score(loaders['valid'])

            user_item = loaders['valid'].dataset.user_item
            train_ui = loaders['train'].dataset.user_item
            precision, recall, coverage, hit, n_rec_items = self._topn_metrics(train_ui, user_item)

            print(
                f'epoch {epoch + 1} train loss: {losses["train"]:.3f} valid loss {losses["valid"]:.3f} {score} {epoch_score:.3f}')
            print('precisioin=%.4f\trecall=%.4f\tcoverage=%.4f' % (precision, recall, coverage))
            print(hit, n_rec_items, len(user_item))

        return

    @torch.no_grad()
    def _validation_score(self, loader):
        """Return ``(metric_name, value)``: AUC when IMPLICT is set, else MSE."""
        y_pred, y_true = [], []
        for ((row, col), val) in loader:
            preds = self.forward(row.long(), col.long()).cpu().numpy()
            if IMPLICT:
                preds = sigmoid(preds)
            y_pred.extend(preds.tolist())
            y_true.extend(val.float().tolist())
        y_true, y_pred = np.array(y_true), np.array(y_pred)

        if IMPLICT:
            return 'auc', roc_auc_score(y_true, y_pred)
        return 'mse', float(np.mean((y_pred - y_true) ** 2))

    @torch.no_grad()
    def _topn_metrics(self, train_ui, valid_ui):
        """Top-N precision, recall and coverage over the validation users.

        Users without training interactions are skipped. Items a user already
        interacted with in training are excluded from their recommendations.

        Returns:
            ``(precision, recall, coverage, hit, n_recommended_items)``.
        """
        all_items = torch.arange(self.n_items, dtype=torch.long)
        top_k = min(self.topn, self.n_items)
        hit, rec_count, test_count = 0, 0, 0
        all_rec_items = set()

        for u in valid_ui:
            if u not in train_ui:
                continue
            target_items = valid_ui[u]

            # Score every item for this user in one forward pass
            users = torch.full((self.n_items,), int(u), dtype=torch.long)
            scores = self.forward(users, all_items)

            # Push already-seen items to the bottom of the ranking
            seen_items = torch.as_tensor(list(train_ui[u].keys()),
                                         dtype=torch.long, device=scores.device)
            scores[seen_items] = -1e9
            # topk works on any device, unlike np.argsort on a tensor
            recs = torch.topk(scores, top_k).indices.tolist()

            # A hit is a recommended item that appears in the user's test set
            hit += sum(1 for item in recs if item in target_items)
            all_rec_items.update(recs)
            rec_count += len(recs)
            test_count += len(target_items)

        # Computed once after the loop; zero guards cover "no user evaluated"
        precision = hit / (1.0 * rec_count) if rec_count else 0.0
        recall = hit / (1.0 * test_count) if test_count else 0.0
        coverage = len(all_rec_items) / (1.0 * self.n_items) if self.n_items else 0.0
        return precision, recall, coverage, hit, len(all_rec_items)




In [2]:
if __name__ == '__main__':
    # Build the loaders, size the model from the (n_users, n_items) pair
    # returned alongside them, then train for two epochs.
    input_size, loader = getDataLoader("../data/ml-100k/u.data")
    model = LFM(*input_size)
    model.fit(loader, epochs=2)

(train: 1 ):   0%|                                                                              | 0/40 [00:00<?, ?it/s]

Initialize end.The user number is:943,item number is:1682


(train: 1 ): 100%|███████████████████████████████████████████████████| 40/40 [00:36<00:00,  1.11it/s, train_loss=0.724]
(valid: 1 ): 100%|███████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.14it/s, train_loss=0.821]
(train: 2 ):   0%|                                                                              | 0/40 [00:00<?, ?it/s]

epoch 1 train loss: 2.697 valid loss 0.770 auc 0.593
precisioin=0.0016	recall=0.0008	coverage=0.3216
15 541 941


(train: 2 ): 100%|███████████████████████████████████████████████████| 40/40 [00:36<00:00,  1.10it/s, train_loss=0.433]
(valid: 2 ): 100%|███████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.16it/s, train_loss=0.518]


epoch 2 train loss: 0.397 valid loss 0.487 auc 0.641
precisioin=0.0013	recall=0.0006	coverage=0.3115
12 524 941


In [20]:
# Load the raw ratings and binarize them: 1.0 for rating >= 5, otherwise 0.0
data_path = "../data/ml-100k/u.data"
data_fields = ['user_id', 'item_id', 'rating', 'timestamp']
data_df = pd.read_table(data_path, names=data_fields).assign(
    rating=lambda frame: (frame['rating'] >= 5).astype(np.float32)
)

In [22]:
# Summary statistics of the ratings table; `rating` was binarized when the
# data was loaded, so its mean is the fraction of positive labels.
data_df.describe()

Unnamed: 0,user_id,item_id,rating,timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,0.21201,883528900.0
std,266.61442,330.798356,0.408734,5343856.0
min,1.0,1.0,0.0,874724700.0
25%,254.0,175.0,0.0,879448700.0
50%,447.0,322.0,0.0,882826900.0
75%,682.0,631.0,0.0,888260000.0
max,943.0,1682.0,1.0,893286600.0


In [4]:
# Alias the loaders dict returned by getDataLoader under the name that
# LFM.fit uses for it, so the cells below can mirror fit's evaluation code.
loaders=loader

In [5]:
# Per-user interaction maps of the validation and training splits
user_item = loaders['valid'].dataset.user_item
train_ui = loaders['train'].dataset.user_item

# Index of every item, used to score the full catalogue for one user
items = torch.arange(model.n_items, dtype=torch.long)

# Accumulators for the top-N evaluation
hit = 0
rec_count = 0
test_count = 0
all_rec_items = set()

In [7]:
# Number of users present in the validation split vs. the training split
len(user_item),len(train_ui)

(941, 943)

In [8]:
# Take the first user of the validation split as a worked example
u = list(user_item)[0]
u

243

In [9]:
# Validation items of user `u` and the items the same user has in training
target_items = user_item[u]
seen_items = np.array(list(train_ui[u].keys()))

# Score the whole catalogue for this user. The id tensor is built directly as
# int64 (no float32 round-trip), and inference runs without autograd so the
# scores can later be masked in place or converted to NumPy.
users = torch.full((int(model.n_items),), int(u), dtype=torch.long)
with torch.no_grad():
    scores = model.forward(users, items)

In [10]:
# Item indices user `u` interacted with in the training split
seen_items

array([ 207,  216,  240,  549,  143,  356, 1108,  104,   41,  379,   21,
        117,  630,   65,  865,   88,  746,    6,  761,    0,  948,  382,
        178,  116,   91,  213,  167,  166,   67,  293,   27, 1224, 1131,
        467,  832,  745,   69,  368,  317, 1078,  275,  454,  753,  400,
        196, 1044,  171,  180,  179,   87,   79,  267,  722,  185,  885,
        684, 1053,  163, 1073,  923,  182,  520,  672,  762, 1118,   70,
        558,  245,  527,  234,  264,   57,  366,   52,  723,  231,  507,
        108,  650, 1135,  455,  583,  280,  450,  134, 1208,  580,  172,
        120,   68,  409,  763, 1117,  536,  661,  508,  457,  290,  952,
         94,  161,  155,  233,  364,   31,   89,  187,   51,   61,  595,
        432,  940,  199,  215,   30,  711,  157,   19,  208,    8, 1052,
        277,   55,   96,  170,   66,  817,   53,  738, 1097, 1106,  221,
       1187,  192,   71,  734,  286, 1466,  742,  203,  316, 1167,  954,
       1177,  113,   12,  392,    2,  474,   49,  5

In [18]:
# Rebuild the list from user_item[u] rather than from target_items itself, so
# re-running this cell does not fail on a list that has no .keys().
target_items = list(user_item[u].keys())

In [14]:
# Highest predicted score among the items the user already has in training
scores[seen_items].max()

tensor(0.6737, grad_fn=<MaxBackward1>)

In [19]:
# Predicted scores of this user's validation items
scores[target_items]

tensor([ 0.0770,  0.0970,  0.1671,  0.1751,  0.1343,  0.0993, -0.1862, -0.1635,
         0.6713,  0.1268,  0.4818,  0.4556,  0.1706,  0.0555,  0.0205,  0.3254,
         0.1514,  0.2253,  0.0911,  0.4168,  0.2744, -0.5052,  0.1759,  0.0712,
         0.4073, -0.0306, -0.0708,  0.1237, -0.0089,  0.3230,  0.2754,  0.2973,
         0.3637,  0.5082,  0.7002,  0.2129,  0.2120,  0.2499, -0.0483,  0.7151,
         0.1539,  0.3075,  0.2028, -0.0739,  0.0500,  0.2368,  0.0418, -0.0630,
        -0.2243,  0.2711], grad_fn=<IndexBackward>)

In [None]:

# Top-N evaluation over every validation user. The counters are reset here so
# that re-running the cell does not accumulate results from a previous run.
hit, rec_count, test_count, all_rec_items = 0, 0, 0, set()
top_k = min(model.topn, int(model.n_items))

# Inference only: with autograd enabled the scores track gradients and cannot
# be handed to NumPy for ranking.
with torch.no_grad():
    for u in user_item:
        target_items = user_item[u]
        if u not in train_ui:
            continue

        users = torch.full((int(model.n_items),), int(u), dtype=torch.long)
        scores = model.forward(users, items)

        # Exclude items the user already interacted with in training
        seen_items = torch.as_tensor(list(train_ui[u].keys()),
                                     dtype=torch.long, device=scores.device)
        scores[seen_items] = -1e9
        recs = torch.topk(scores, top_k).indices.tolist()

        for item in recs:  # iterate over the items recommended to this user
            if item in target_items:  # the item appears in the test set
                hit += 1  # count a hit
            all_rec_items.add(item)
        rec_count += len(recs)
        test_count += len(target_items)

# Metrics are computed once, after all users have been processed
precision = hit / (1.0 * rec_count) if rec_count else 0.0
recall = hit / (1.0 * test_count) if test_count else 0.0
coverage = len(all_rec_items) / (1.0 * model.n_items)
precision, recall, coverage