In [None]:
# 针对两条元路径，分别进行矩阵分解，然后生成UI.pkl和UIUI.pkl

In [1]:
import numpy as np
import pandas as pd
data_path="hin/data/u.data"
data_fields = ['user_id', 'item_id', 'rating', 'timestamp']
# all data file
data_df = pd.read_table(data_path, names=data_fields)

# get user number
n_users = max(data_df['user_id'].values)
# get item number
n_items = max(data_df['item_id'].values)

In [2]:
from scipy.sparse import coo_matrix
# S = dok_matrix((5, 5), dtype=np.float32)
data = np.ones((data_df.shape[0]))
data=data_df.rating.values
row = data_df.user_id-1
col = data_df.item_id-1
UI = coo_matrix((data, (row, col)), shape=(n_users, n_items))
UIUI = UI.dot(UI.transpose()).dot(UI)

In [3]:
UIUI=UIUI.tocoo()
UI=UI.tocoo()

In [12]:
def save_M(M, M_str):
    df=pd.DataFrame()
    df['row']=M.row
    df['col']=M.col
    df['data']=M.data.astype(np.int)
    df.to_csv(M_str,header=None,index=False,sep='\t')
    
save_M(UIUI,'hin/UIUI.txt')
save_M(UI,'hin/UI.txt')

In [4]:
import random
import math
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
import torch.utils.data as data
from tqdm import tqdm
from utils import Interactions
import os
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import roc_auc_score
from IPython import embed

IMPLICT=False
SMALL=False

# for reproducibility
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

# To compute probalities
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def drop_df(df):
    pos_cnt = df.groupby('user_id', as_index=False)['rating'].agg({"pos_cnt": 'sum'})
    tot_cnt = df.groupby('user_id', as_index=False)['rating'].agg({"tot_cnt": 'count'})
    df = pd.merge(df, pos_cnt, on=['user_id'], how='left')
    df = pd.merge(df, tot_cnt, on=['user_id'], how='left')
    df = df[(df.pos_cnt > 0) & (df.tot_cnt > df.pos_cnt)]
    df = df.drop(['pos_cnt', 'tot_cnt'], axis=1)
    return df

def getDataLoader(data_path, batch_size=2048):
    # load train data
    data_fields = ['user_id', 'item_id', 'rating', 'timestamp']
    # all data file
    data_df = pd.read_table(data_path, names=data_fields)
    if SMALL:
        data_df = data_df.sample(n=int(len(data_df) * 0.1), replace=False)
    if IMPLICT:
        data_df.rating = (data_df.rating >= 4).astype(np.float32)
    # ua_base = allData.sample(n=90570, replace=False)
    df_train = data_df.sample(n=int(len(data_df) * 0.8), replace=False)
    df_test = data_df.drop(df_train.index, axis=0)
    if IMPLICT:
        df_train=drop_df(df_train)
        df_test = drop_df(df_test)
    # get user number
    n_users = max(set(data_df['user_id'].values))+1
    # get item number
    n_items = max(set(data_df['item_id'].values))+1

    print("Initialize end.The user number is:%d,item number is:%d" % (n_users, n_items))
    train_loader = data.DataLoader(
        Interactions(df_train,index_from_one=False), batch_size=batch_size, shuffle=True)

    test_loader = data.DataLoader(
        Interactions(df_test,index_from_one=False), batch_size=batch_size, shuffle=False)

    loaders = {'train': train_loader,
               'valid': test_loader}

    return (n_users,n_items ), loaders

class LFM(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20, lr=0.1, weight_decay=0.001, sparse=False,topn=10, device=torch.device("cpu")):
        super(LFM, self).__init__()

        self.n_users = n_users
        self.n_items = n_items
        self.device = device
        self.topn=topn

        # get factor number
        self.n_factors = n_factors
#         self.user_biases = nn.Embedding(self.n_users, 1, sparse=sparse)
#         self.item_biases = nn.Embedding(self.n_items, 1, sparse=sparse)
        self.user_embeddings = nn.Embedding(self.n_users, self.n_factors, sparse=sparse)
        self.item_embeddings = nn.Embedding(self.n_items, self.n_factors, sparse=sparse)

        self.sparse = sparse

        self.optimizer = torch.optim.Adam(self.parameters(),
                                   lr=lr, weight_decay=0.5)
        self=self.to(self.device)


    def forward(self, users, items):
        users=users.to(self.device)
        items = items.to(self.device)
        try:
            ues = self.user_embeddings(users)
            uis = self.item_embeddings(items)

#             preds = self.user_biases(users) # b 1
#             preds += self.item_biases(items)# b 1
            # preds += (self.dropout(ues) * self.dropout(uis)).sum(dim=1, keepdim=True)
            preds= ((ues) * (uis)).sum(dim=1, keepdim=True)
        except Exception as ex:
            print(ex)
            embed()
            
        return preds.squeeze()

    def fit(self, loaders, epochs=5):
        # training cycle
        best_score = 0.
        for epoch in range(epochs):
            losses = {'train': 0., 'valid': 0}

            for phase in ['train', 'valid']:

                if phase == 'train':
                    self.train()
                else:
                    self.eval()
                pbar = tqdm(enumerate(loaders[phase]),
                            total=len(loaders[phase]),
                            desc='({0}:{1:^3})'.format(phase, epoch+1))
                for batch_idx, ((row, col), val) in pbar:
                # for batch_x, batch_y in loaders[phase]:
                    self.optimizer.zero_grad()

                    row = row.long()
                    col = col.long()
                    val = val.float().to(self.device)
                    preds = self.forward(row, col)
                    loss = nn.MSELoss(reduction='sum')(preds, val)

                    losses[phase] += loss.item()
                    batch_loss = loss.item() / row.size()[0]
                    pbar.set_postfix(train_loss=batch_loss)

                    with torch.set_grad_enabled(phase == 'train'):
                        if phase == 'train':
                            loss.backward()
                            #                             scheduler.step()
                            self.optimizer.step()

                losses[phase] /= len(loaders[phase].dataset)
            # print('epoch done')
            # after each epoch check if we improved roc auc and if yes - save model
            with torch.no_grad():
                model.eval()

                y_pred,y_true = [],[]

                for ((row, col), val) in loaders['valid']:
                    row = row.long()
                    col = col.long()
                    val = val.float()
                    preds = self.forward(row, col)
                    if IMPLICT:
                        preds = sigmoid(preds.cpu().numpy())
                    y_pred += preds.tolist()
                    y_true += val.tolist()
                y_true,y_pred=np.array(y_true), np.array(y_pred)
                if IMPLICT:
                    epoch_score = roc_auc_score(y_true,y_pred)
                    score='auc'
                else:
                    epoch_score=sum([(y - x) ** 2 for x, y in zip(y_true, y_pred)]) / len(y_pred)
                    score='mse'

                # 计算top10的recall、precision、推荐物品覆盖率
                user_item=loaders['valid'].dataset.user_item
                items = torch.arange(self.n_items).long()
                hit, rec_count, test_count,all_rec_items = 0,0,0,set()
                train_ui=loaders['train'].dataset.user_item
                for u in user_item:
                    target_items=user_item[u]
                    # seen_items = np.array(list(train_ui[u].keys()))

                    users=[int(u)]*self.n_items
                    users = torch.Tensor(users).long()
                    scores=self.forward(users,items)
                    if u in train_ui:
                        seen_items = np.array(list(train_ui[u].keys()))
                        scores[seen_items]=-1e9
                    recs=np.argsort(scores)[-10:].tolist()

                    for item in recs:  # 遍历给user推荐的物品
                        if item in target_items:  # 测试集中有该物品
                            hit += 1  # 推荐命中+1
                        all_rec_items.add(item)
                    rec_count += self.topn
                    test_count += len(target_items)
                    precision = hit / (1.0 * rec_count)
                    recall = hit / (1.0 * test_count)
                    coverage = len(all_rec_items) / (1.0 * self.n_items)


            if ((epoch + 1) % 1) == 0:
                print(
                    f'epoch {epoch + 1} train loss: {losses["train"]:.3f} valid loss {losses["valid"]:.3f} {score} {epoch_score:.3f}')
                print('precisioin=%.4f\trecall=%.4f\tcoverage=%.4f' % (precision, recall, coverage))

            # if ((epoch + 1) % 1) == 0:
            #     print(
            #         f'epoch {epoch + 1} train loss: {losses["train"]:.3f} valid loss {losses["valid"]:.3f}')
        return

In [2]:
# from lfm_new_data import getDataLoader,LFM
import numpy as np
import pandas as pd

from utils import Interactions
import torch
input_size, loader=getDataLoader("hin/UI.txt", batch_size=2048)


Initialize end.The user number is:943,item number is:1682


In [3]:
model = LFM(input_size[0],input_size[1])
model.fit(loader,5)

(train: 1 ): 100%|████████████████████████████████████████████████████| 40/40 [00:35<00:00,  1.13it/s, train_loss=3.56]
(valid: 1 ): 100%|█████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.14it/s, train_loss=3.1]
(train: 2 ):   0%|                                                                              | 0/40 [00:00<?, ?it/s]

epoch 1 train loss: 15.878 valid loss 3.240 mse 3.240
precisioin=0.0704	recall=0.0331	coverage=0.4328


(train: 2 ): 100%|████████████████████████████████████████████████████| 40/40 [00:35<00:00,  1.13it/s, train_loss=1.41]
(valid: 2 ): 100%|████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.13it/s, train_loss=1.31]
(train: 3 ):   0%|                                                                              | 0/40 [00:00<?, ?it/s]

epoch 2 train loss: 1.491 valid loss 1.296 mse 1.296
precisioin=0.0989	recall=0.0466	coverage=0.2771


(train: 3 ): 100%|████████████████████████████████████████████████████| 40/40 [00:35<00:00,  1.12it/s, train_loss=1.07]
(valid: 3 ): 100%|████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.13it/s, train_loss=1.13]
(train: 4 ):   0%|                                                                              | 0/40 [00:00<?, ?it/s]

epoch 3 train loss: 0.939 valid loss 1.103 mse 1.103
precisioin=0.1115	recall=0.0524	coverage=0.1683


(train: 4 ): 100%|███████████████████████████████████████████████████| 40/40 [00:35<00:00,  1.13it/s, train_loss=0.972]
(valid: 4 ): 100%|████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.14it/s, train_loss=1.07]
(train: 5 ):   0%|                                                                              | 0/40 [00:00<?, ?it/s]

epoch 4 train loss: 0.891 valid loss 1.065 mse 1.065
precisioin=0.1290	recall=0.0607	coverage=0.1350


(train: 5 ): 100%|████████████████████████████████████████████████████| 40/40 [00:35<00:00,  1.14it/s, train_loss=1.15]
(valid: 5 ): 100%|████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.13it/s, train_loss=1.15]


epoch 5 train loss: 0.885 valid loss 1.053 mse 1.053
precisioin=0.1491	recall=0.0702	coverage=0.1249


In [5]:
pd.to_pickle(model,'UI.pkl')

In [6]:
# from lfm_new_data import getDataLoader,LFM
import numpy as np
import pandas as pd

from utils import Interactions
import torch
input_size, loader=getDataLoader("hin/UIUI.txt", batch_size=2048)


Initialize end.The user number is:943,item number is:1682


In [7]:
model = LFM(input_size[0],input_size[1],n_factors=5)
model.fit(loader,5)

(train: 1 ): 100%|██████████████████████████████████████████████| 620/620 [09:32<00:00,  1.08it/s, train_loss=6.07e+10]
(valid: 1 ): 100%|██████████████████████████████████████████████| 155/155 [02:22<00:00,  1.08it/s, train_loss=2.08e+10]
(train: 2 ):   0%|                                                                             | 0/620 [00:00<?, ?it/s]

epoch 1 train loss: 40297200151.465 valid loss 34246581271.081 mse 34246581319.054
precisioin=1.0000	recall=0.0297	coverage=0.0583


(train: 2 ): 100%|██████████████████████████████████████████████| 620/620 [09:35<00:00,  1.08it/s, train_loss=1.46e+10]
(valid: 2 ): 100%|███████████████████████████████████████████████| 155/155 [02:22<00:00,  1.09it/s, train_loss=9.99e+9]
(train: 3 ):   0%|                                                                             | 0/620 [00:00<?, ?it/s]

epoch 2 train loss: 26704211269.207 valid loss 20691389666.063 mse 20691389648.508
precisioin=1.0000	recall=0.0297	coverage=0.0606


(train: 3 ): 100%|████████████████████████████████████████████████| 620/620 [09:30<00:00,  1.09it/s, train_loss=5.5e+9]
(valid: 3 ): 100%|███████████████████████████████████████████████| 155/155 [02:21<00:00,  1.10it/s, train_loss=4.77e+9]
(train: 4 ):   0%|                                                                             | 0/620 [00:00<?, ?it/s]

epoch 3 train loss: 15684147468.663 valid loss 12014367207.357 mse 12014367187.675
precisioin=1.0000	recall=0.0297	coverage=0.0606


(train: 4 ): 100%|███████████████████████████████████████████████| 620/620 [09:39<00:00,  1.07it/s, train_loss=7.65e+9]
(valid: 4 ): 100%|███████████████████████████████████████████████| 155/155 [02:22<00:00,  1.09it/s, train_loss=2.53e+9]
(train: 5 ):   0%|                                                                             | 0/620 [00:00<?, ?it/s]

epoch 4 train loss: 8986983976.212 valid loss 6915117062.965 mse 6915117083.236
precisioin=1.0000	recall=0.0297	coverage=0.0618


(train: 5 ): 100%|███████████████████████████████████████████████| 620/620 [09:56<00:00,  1.04it/s, train_loss=3.99e+9]
(valid: 5 ): 100%|███████████████████████████████████████████████| 155/155 [02:27<00:00,  1.05it/s, train_loss=1.51e+9]


epoch 5 train loss: 5120800658.736 valid loss 3981462963.339 mse 3981462974.818
precisioin=1.0000	recall=0.0297	coverage=0.0612


In [None]:
pd.to_pickle(model,'UIUI.pkl')