In [4]:
import torch
import torch.nn as nn 
from torch.utils.data import Dataset , dataloader

import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
class NSVD(nn.Module):
    def __init__(self, num_items, factor):
        super(NSVD, self).__init__()
        self.Q = nn.Embedding(num_items, factor)
        self.X = nn.Embedding(num_items, factor)
        
        nn.init.normal_(self.Q.weight, std=0.01)
        nn.init.normal_(self.X.weight, std=0.01)
        
    def forward(self, item, item_list):
        N_emb = self.X(item_list)
        N_emb = torch.sum(N_emb, dim=0)
        item_emb = self.Q(item)
        
        output = torch.sum(item_emb * N_emb, dim = 1)
        if item_list.size(0) > 1:
            output /= item_list.size(0)
        
        return output 
        

In [6]:
rating = pd.read_csv('../data/clean.csv')

In [7]:
view = pd.read_csv('../data/total_view.csv')

In [8]:
rating_outfit = set(rating.outfit_id.unique())
view_outfit = set(view.outfit_id.unique()) 

valid_outfit = list(rating_outfit.union(view_outfit))

In [9]:
rating_session = set(rating.session_id.unique())
view_session = set(view.session_id.unique()) 

valid_session = list(rating_session.union(view_session))

In [10]:
session2idx = {s: i for i, s in enumerate(valid_session)}
idx2session = {i: s for i, s in enumerate(valid_session)}
outfit2idx = {s: i for i, s in enumerate(valid_outfit)}
idx2outfit = {i: s for i, s in enumerate(valid_outfit)}


In [11]:
view

Unnamed: 0,session_id,user_id,outfit_id,timestamp,view_type
0,0fd3b801-76f1-4f4e-bafb-5884fbcfcf7f,0,79850,23-07-19 09:00:05,journey
1,0fd3b801-76f1-4f4e-bafb-5884fbcfcf7f,0,89945,23-07-19 09:00:05,journey
2,0fd3b801-76f1-4f4e-bafb-5884fbcfcf7f,0,85067,23-07-19 09:00:05,journey
3,0fd3b801-76f1-4f4e-bafb-5884fbcfcf7f,0,81371,23-07-19 09:00:05,journey
4,0fd3b801-76f1-4f4e-bafb-5884fbcfcf7f,0,71531,23-07-19 09:00:05,journey
...,...,...,...,...,...
29345,93233da9-9d08-49ac-a6a6-177a4c4fb81f,0,82692,23-07-21 09:59:32,journey
29346,93233da9-9d08-49ac-a6a6-177a4c4fb81f,0,89698,23-07-21 09:59:32,journey
29347,93233da9-9d08-49ac-a6a6-177a4c4fb81f,0,73424,23-07-21 09:59:32,journey
29348,93233da9-9d08-49ac-a6a6-177a4c4fb81f,0,79117,23-07-21 09:59:32,journey


In [12]:
rating

Unnamed: 0,session_id,user_id,outfit_id,timestamp
0,7708c8e7-4292-4ff9-99b1-27be20427e42,1.0,83783,23-07-19 09:26:07
1,7708c8e7-4292-4ff9-99b1-27be20427e42,1.0,83800,23-07-19 09:26:10
2,7708c8e7-4292-4ff9-99b1-27be20427e42,1.0,83791,23-07-19 09:26:12
3,7708c8e7-4292-4ff9-99b1-27be20427e42,1.0,84029,23-07-19 09:26:16
4,7708c8e7-4292-4ff9-99b1-27be20427e42,1.0,83706,23-07-19 09:26:18
...,...,...,...,...
1694,22af75f5-cabe-46e8-90b3-08613327f389,,85210,2023-07-21 09:18:46.024456
1695,22af75f5-cabe-46e8-90b3-08613327f389,,83291,2023-07-21 09:18:48.784861
1696,22af75f5-cabe-46e8-90b3-08613327f389,,91601,2023-07-21 09:20:03.409300
1697,22af75f5-cabe-46e8-90b3-08613327f389,,90043,2023-07-21 09:20:13.284585


In [13]:
rating['rating'] = 1 
view['rating'] = 0 
df = pd.concat([rating[['session_id','outfit_id','rating']], view[['session_id','outfit_id','rating']]])

In [14]:
df['session_id'] = df['session_id'].map(session2idx)
df['outfit_id'] = df['outfit_id'].map(outfit2idx)
df

Unnamed: 0,session_id,outfit_id,rating
0,243,4274,1
1,243,4290,1
2,243,4281,1
3,243,4412,1
4,243,4221,1
...,...,...,...
29345,110,3588,0
29346,110,7707,0
29347,110,641,0
29348,110,1679,0


In [17]:
def prepare_data_split(df):
    pos_df = df.loc[df['rating']==1]
    neg_df = df.loc[df['rating']==0]
    
    all_item = df.outfit_id.unique()
    
    train_set_item = []
    train_set_user = []

    test_set_item = []
    test_set_user = []
    
    pos_dict = {}
    neg_dict = {}
    
    for session in df.session_id.unique():
        
        pos_item_list = pos_df.loc[pos_df['session_id']==session].outfit_id.tolist()
        neg_item_list = neg_df.loc[neg_df['session_id']==session].outfit_id.tolist()
        
        pos_dict[session] = pos_item_list
        neg_dict[session] = neg_item_list
        
        train_item = pos_item_list[:-1]
        train_user = [session] * len(train_item)
        
        user_neg_candidate = list(set(all_item) - set(pos_item_list))
        
        test_item = [pos_item_list[-1]] + np.random.choice(user_neg_candidate, 99, replace = False).tolist()
        test_user = [session] * len(test_item)
        
        train_set_item += train_item
        train_set_user += train_user
        
        test_set_item += test_item
        test_set_user += test_user
        
    return train_set_item, train_set_user, test_set_item, test_set_user, pos_dict, neg_dict
        
        

In [19]:
pos_df = df.loc[df['rating']==1]
neg_df = df.loc[df['rating']==0]

all_item = df.outfit_id.unique()

train_set_item = []
train_set_user = []

test_set_item = []
test_set_user = []

pos_dict = {}
neg_dict = {}

for session in df.session_id.unique():
    
    pos_item_list = pos_df.loc[pos_df['session_id']==session].outfit_id.tolist()
    neg_item_list = neg_df.loc[neg_df['session_id']==session].outfit_id.tolist()
    
    pos_dict[session] = pos_item_list
    neg_dict[session] = neg_item_list
    
    train_item = pos_item_list[:-1]
    train_user = [session] * len(train_item)
    
    user_neg_candidate = list(set(all_item) - set(pos_item_list))
    
    test_item = [pos_item_list[-1]] + np.random.choice(user_neg_candidate, 99, replace = False).tolist()
    test_user = [session] * len(test_item)
    
    train_set_item += train_item
    train_set_user += train_user
    
    test_set_item += test_item
    test_set_user += test_user

IndexError: list index out of range

In [18]:
train_set_item, train_set_user, test_set_item, test_set_user, pos_dict, neg_dict = prepare_data_split(df)

IndexError: list index out of range

In [115]:
class Journey(Dataset):
    def __init__(self, df):
        self.df = df
        self.positive_df = df[df['rating'] == 1]
        self.negative_df = df[df['rating'] == 0]
        
        self.all_items = [i for i in range(self.df.outfit_id.max()+1)]
        
        self.get_test()
        
        self.session_id = self.positive_df.session_id.values
        self.outfit_id = self.positive_df.outfit_id.values
        
        self.get_dict()
        
        self.test_session_id = self.positive_test_df.session_id.values
        self.test_outfit_id = self.positive_test_df.outfit_id.values
        
        
    def __len__(self):
        return len(self.positive_df)
    
    def __getitem__(self, idx):
        session_id = self.session_id[idx]
        outfit_id = self.outfit_id[idx]
        
        return session_id, outfit_id
    
    def get_positive(self, session_id):
        return self.pos_dict[session_id]
    
    def get_negative(self, session_id):
        return self.neg_dict[session_id]
    
    def get_dict(self):
        self.pos_dict, self.neg_dict = {}, {}
        for session_id in self.session_id:
            self.pos_dict[session_id] = self.positive_df[self.positive_df['session_id'] == session_id]['outfit_id'].values
            self.neg_dict[session_id] = self.negative_df[self.negative_df['session_id'] == session_id]['outfit_id'].values
            
    def get_test(self):
        test_index = self.positive_df.duplicated(subset=['session_id'], keep='last')
        self.positive_test_df = self.positive_df[test_index]
        self.positive_df = self.positive_df[~test_index]
        
    def get_user_neg_candidate(self):
        user_candidate = {}
        for session_id in self.positive_df['session_id'].unique():
            movieId_li = self.positive_df[self.positive_df['session_id'] == session_id]['outfit_id'].tolist()
            movieId_li = [movieId for movieId in movieId_li]
            user_candidate[session_id] = list(set(self.all_items) - set(movieId_li))
        
        return user_candidate


In [116]:
class BPR_Loss(nn.Module):
    def __init__(self):
        super(BPR_Loss, self).__init__()
    
    def forward(self, pos, neg):
        bpr_loss = -torch.mean(torch.log(torch.sigmoid(pos - neg)))
        return bpr_loss

In [117]:
journeydataset = Journey(df)
train_dataloader = dataloader.DataLoader(journeydataset, batch_size=1, shuffle=True)

In [118]:
model = NSVD(len(valid_outfit), 16) 

optimizer = torch.optim.SGD(model.parameters(), lr=0.05, momentum=0.9)
device = torch.device('cpu')
model = model.to(device)
criterion = BPR_Loss()

In [119]:
def hit(target_item, pred_items):
    if target_item in pred_items:
        return 1
    return 0

# def ndcg(target_item, pred_items):
#     if target_item in pred_items:
#         idx = pred_items.index(target_item)
#         # 초기 인덱스가 0이기 때문에 +2 함
#         return np.reciprocal(np.log2(idx + 2))
#     return 0

def metrics(model, test_loader, top_k):
    model.eval()
    HR = []
    with torch.no_grad():
        for user, item, _ in test_loader:
            user = user.to(device)
            item = item.to(device)

            predictions = model(user, item)
            # 가장 높은 top_k개 선택
            _, indices = torch.topk(predictions, top_k)
            # 해당 상품 index 선택
            recommends = torch.take(item, indices).cpu().numpy().tolist()
            # 정답값 선택
            target_item = item[0].item()
            HR.append(hit(target_item, recommends))

    return np.mean(HR)

In [97]:
def train(model, train_loader, criterion, optimizer, device, dataset, epochs):
    
    for epoch in range(epochs):
        model.train()
    
        train_loss = 0 
    
        for session_id, outfit_id in train_loader:
            
            outfit_id = outfit_id.to(device)
            
            positive_item = torch.LongTensor(dataset.get_positive(int(session_id))).to(device)
            negative_item = torch.LongTensor(dataset.get_negative(int(session_id))).to(device)
            
            pos = model(outfit_id, positive_item)
            neg = model(outfit_id, negative_item)
            
            optimizer.zero_grad()
            
            loss = criterion(pos, neg)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            
        train_loss = train_loss/len(train_loader)
        
        for 
            

        
        print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))
        
            


In [98]:
torch.autograd.set_detect_anomaly(True)


<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7fe9aeb56dc0>

In [99]:
train(model, train_dataloader, criterion, optimizer, device, journeydataset, 500)

Epoch: 1 	Training Loss: 0.693146
Epoch: 2 	Training Loss: 0.693093
Epoch: 3 	Training Loss: 0.693028
Epoch: 4 	Training Loss: 0.692931
Epoch: 5 	Training Loss: 0.692776
Epoch: 6 	Training Loss: 0.692517
Epoch: 7 	Training Loss: 0.692071
Epoch: 8 	Training Loss: 0.691297
Epoch: 9 	Training Loss: 0.689941
Epoch: 10 	Training Loss: 0.687596
Epoch: 11 	Training Loss: 0.683700
Epoch: 12 	Training Loss: 0.677517
Epoch: 13 	Training Loss: 0.668110
Epoch: 14 	Training Loss: 0.654217
Epoch: 15 	Training Loss: 0.634444
Epoch: 16 	Training Loss: 0.607401
Epoch: 17 	Training Loss: 0.572353
Epoch: 18 	Training Loss: 0.529064
Epoch: 19 	Training Loss: 0.478618
Epoch: 20 	Training Loss: 0.423219
Epoch: 21 	Training Loss: 0.365824
Epoch: 22 	Training Loss: 0.309588
Epoch: 23 	Training Loss: 0.257317
Epoch: 24 	Training Loss: 0.211105
Epoch: 25 	Training Loss: 0.171870
Epoch: 26 	Training Loss: 0.139622
Epoch: 27 	Training Loss: 0.113713
Epoch: 28 	Training Loss: 0.093185
Epoch: 29 	Training Loss: 0.0

KeyboardInterrupt: 