### Train / Val Data Preprocessing
- Movie Lens data 전처리

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
import torch
torch.cuda.empty_cache()

In [5]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
ndf = pd.read_csv('/content/drive/MyDrive/BOAZ/분석/BASE/MINI2/DATA//ratings.csv')

In [6]:
# 유저별로 아이템 개수가 너무 많으면 데이터의 행 개수가 너무 많아져서, 아이템 개수를 제한했음
temp = pd.DataFrame(ndf['userId'].value_counts())
temp['index']=temp.index
temp.reset_index(drop=True,inplace=True)
temp.columns = ['item_buy','userId']

final_temp = temp[temp['item_buy']<=21]
res = random.sample(list(final_temp['userId']), 6277)
sample_df = ndf[ndf['userId'].isin(res)]
sample_df.reset_index(inplace=True,drop=True)

num_user = sample_df['userId'].nunique()
num_item = sample_df['movieId'].nunique()

In [7]:
# 먼저 데이터셋 encoding
user_encoder, user_decoder = {}, {}

# label encoding이라고 생각
for idx, user_id in enumerate(sample_df['userId'].unique()):
    user_encoder[user_id] = idx
    user_decoder[idx] = user_id

# label encoding이라고 생각
item_encoder, item_decoder = {}, {}
for idx, item_id in enumerate(sample_df['movieId'].unique()):
    item_encoder[item_id] = idx
    item_decoder[idx] = item_id
    
# 추후에 Embedding 과정을 거쳐서 보다 dense하게 만들어주어야함.
##torch.nn.Embedding(num_embeddings=len(ratings_df['mbrNo']),embedding_dim = ratings_df['mbrNo'].nunqiue())

sample_df['en_userId'] = sample_df['userId'].apply(lambda x : user_encoder[x])
sample_df['en_movieId'] = sample_df['movieId'].apply(lambda x : item_encoder[x])
sample_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df['en_userId'] = sample_df['userId'].apply(lambda x : user_encoder[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df['en_movieId'] = sample_df['movieId'].apply(lambda x : item_encoder[x])


Unnamed: 0,userId,movieId,rating,timestamp,en_userId,en_movieId
0,51,1,4.0,1510742879,0,0
1,51,47,4.5,1510742884,0,1
2,51,50,2.5,1530548826,0,2
3,51,110,4.5,1510742881,0,3
4,51,150,3.5,1510742877,0,4


In [8]:
# 논문에서 나와있던 Train / Test data를 Split 하는 방법
# Train : 상호작용 O -> 유저별로 가장 최신 아이템을 제외한 나머지 아이템 = rating 1 / 상호작용 X -> 유저가 상호작용한 아이템을 제외한 아이템들 중에서 상호작용한 아이템 개수 * num_neg 만큼 rating 0
# Test : 상호작용 O -> 유저별로 가장 최신 아이템 = rating 1 / 상호작용 X -> 유저가 상호작용한 아이템을 제외한 아이템들 중에서 99개 추출
def split_data(df, num_neg = 2):
    total_item_li = set(df['en_movieId'])
    train_df = []
    test_df = []
    en_user_id_li = df['en_userId'].unique()
    for en_user_id in tqdm(en_user_id_li):
        pos_recomencder_li = df[df['en_userId'] == en_user_id]['en_movieId'].tolist()
        neg_recomencder_li = np.random.choice(list(total_item_li - set(pos_recomencder_li)), num_neg * len(pos_recomencder_li), replace = False).tolist()
        train_df += [[en_user_id, en_movieId, 1] for en_movieId in pos_recomencder_li[:-1]] + [[en_user_id, en_movieId, 0] for en_movieId in neg_recomencder_li]

        neg_recomencder_li = np.random.choice(list(total_item_li - set(pos_recomencder_li)), 99, replace = False).tolist()
        test_df += [[en_user_id, pos_recomencder_li[-1], 1]] + [[en_user_id, en_movieId, 0] for en_movieId in neg_recomencder_li]
    
    return train_df, test_df

train_df, val_df = split_data(sample_df)

100%|██████████| 6277/6277 [00:13<00:00, 448.88it/s]


In [9]:
train_df = pd.DataFrame(train_df); train_df.columns = ['userId','movieId','rating']
val_df = pd.DataFrame(val_df); val_df.columns = ['userId','movieId','rating']

In [11]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

import warnings
from tqdm import tqdm

warnings.filterwarnings(action='ignore')

data_dir = './data'
model_dir = '/content/drive/MyDrive/BOAZ/분석/BASE/MINI2/model_path2/'

In [12]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class CustomDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        user = self.df.loc[idx][0].astype(int)
        item = self.df.loc[idx][1].astype(int)
        label = self.df.loc[idx][2]

        return user, item, label

### GMF

- GMF와 MLP의 embedding layer는 서로 다른 가중치로 학습. -> flexiblity up!

In [13]:
# GMF는 user
class GMF(nn.Module):
    def __init__(self, num_user, num_item, num_factor):
        super(GMF, self).__init__()
        self.user_emb = nn.Embedding(num_user, num_factor) # embedding layer 통과 -> 보다 dense하게 만들어줌.
        self.item_emb = nn.Embedding(num_item, num_factor)
        
        self.predict_layer = nn.Sequential(
            nn.Linear(num_factor, 1, bias = False),
            nn.Sigmoid()
        )

        self._init_weight_()
    
    def _init_weight_(self):
        nn.init.normal_(self.user_emb.weight, std=0.01) # embedding layer 각각 정규분포 초기화
        nn.init.normal_(self.item_emb.weight, std=0.01)
        for m in self.predict_layer:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, a=1, nonlinearity="sigmoid") # nn.Linear는 he init 사용.
    
    def forward(self, user, item):
        user_emb = self.user_emb(user)
        item_emb = self.item_emb(item)

        output = self.predict_layer(user_emb * item_emb) 

        return output.view(-1)

### MLP

In [14]:
class MLP(nn.Module):
    def __init__(self, num_user, num_item, num_factor, num_layers, dropout):
        super(MLP, self).__init__()
        self.dropout = dropout
        self.user_emb = nn.Embedding(num_user, num_factor)
        self.item_emb = nn.Embedding(num_item, num_factor)

        MLP_modules = []
        input_size = num_factor * 2
        for i in range(num_layers):
            MLP_modules.append(nn.Dropout(p = self.dropout))
            MLP_modules.append(nn.Linear(input_size, input_size // 2))
            MLP_modules.append(nn.ReLU())
            input_size = input_size // 2
        self.MLP_layers = nn.Sequential(*MLP_modules) # num_layers 개수만큼의 mlp layer들이 있음.

        self.predict_layer = nn.Sequential(
            nn.Linear(input_size, 1, bias = False),
            nn.Sigmoid()
        )

        self._init_weight_()
    
    def _init_weight_(self):
        nn.init.normal_(self.user_emb.weight, std=0.01)
        nn.init.normal_(self.item_emb.weight, std=0.01)
        for m in self.MLP_layers:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
        
        for m in self.predict_layer:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, a=1, nonlinearity="sigmoid")
    
    def forward(self, user, item):
        user_emb = self.user_emb(user)
        item_emb = self.item_emb(item)
        
        cat_emb = torch.cat((user_emb, item_emb), -1) # concat되어 들어감.

        output = self.MLP_layers(cat_emb) 

        output = self.predict_layer(output)

        return output.view(-1) # flatten
     

### NeuMF

In [15]:
# Neural MF는 GMF 와 MLP가 섞인 것.

class NeuMF(nn.Module):
    def __init__(self, GMF, MLP, num_factor,num_layers):
        super(NeuMF, self).__init__()
        self.gmf_user_emb = GMF.user_emb
        self.gmf_item_emb = GMF.item_emb

        self.mlp_user_emb = MLP.user_emb
        self.mlp_item_emb = MLP.item_emb

        self.mlp_layer = MLP.MLP_layers

        self.predict_layer = nn.Sequential(
            nn.Linear(num_factor + (num_factor // (2**(num_layers-1))), 1, bias = False),
            nn.Sigmoid(),
        ) # sigmoid function 통과 후 score 출력
        self._init_weight_()
    
    def _init_weight_(self):
        for m in self.predict_layer:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, a=1, nonlinearity="sigmoid")

    def forward(self, user, item):
        gmf_user_emb = self.gmf_user_emb(user)
        gmf_item_emb = self.gmf_item_emb(item)
        gmf_output = gmf_user_emb * gmf_item_emb # GMF의 output

        mlp_user_emb = self.mlp_user_emb(user)
        mlp_item_emb = self.mlp_item_emb(item)
        mlp_cat_emb = torch.cat((mlp_user_emb, mlp_item_emb), -1)
        mlp_output = self.mlp_layer(mlp_cat_emb) # mlp의 output
        
        cat_output = torch.cat((gmf_output, mlp_output), -1) # mlp output, gmf ouptut concat

        output = self.predict_layer(cat_output) 

        return output.view(-1)

### Train

In [16]:
def hit(target_item, pred_items):
    if target_item in pred_items:
        return 1
    return 0

def ndcg(target_item, pred_items):
    if target_item in pred_items:
        idx = pred_items.index(target_item)
        # 초기 인덱스가 0이기 때문에 +2 함
        return np.reciprocal(np.log2(idx + 2))
    return 0

# test data로 성능 측정하는 함수
# HR과 NDCG를 사용.
# HR : 상위 K개의 아이템 중 실제 상호작용한 아이템을 맞춘 비율을 나타내는 지표.
# NDCG : 추천된 아이템 순위와 사용자의 관심도를 고려하여 값이 높을수록 성능이 좋음.
def metrics(model, test_loader, top_k):
    model.eval()
    HR, NDCG, pred_ratio, recommends = [], [],[],[]
    with torch.no_grad():
        for user, item, _ in test_loader:
            user = user.to(device)
            item = item.to(device)

            predictions = model(user, item)
            # 가장 높은 top_k개 선택
            _, indices = torch.topk(predictions, top_k)
            # 해당 상품 index 선택
            recommends = torch.take(item, indices).cpu().numpy().tolist()
            # 정답값 선택
            target_item = item[0].item()
            HR.append(hit(target_item, recommends))
            NDCG.append(ndcg(target_item, recommends))
            pred_ratio += list(predictions)
            recommends += list(recommends)

    return np.mean(HR), np.mean(NDCG), pred_ratio, recommends

# train 함수 구성
def train(model, train_loader, criterion, optimizer):
    model.train()
    train_loss = 0

    for user, item, label in train_loader:
        user = user.to(device)
        item = item.to(device)
        label = label.to(device)
        label = label.float()

        optimizer.zero_grad() 
        output = model(user, item)
        loss = criterion(output, label)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
    
    train_loss = train_loss / len(train_loader)
    
    # valid test
    model.eval()
    with torch.no_grad():
        for user, item, label in valid_loader:
            user = user.to(device)
            item = item.to(device)
            label = label.to(device)
            label = label.float()
            
            output = model(user, item)
            val_loss = criterion(output, label)
            
            val_loss += val_loss.item()
        valid_loss = val_loss / len(valid_loader)
            
    return train_loss, valid_loss

In [17]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
batch_size = 128
epochs = 40
lr = 0.005
num_factor = 64 
num_layers = 4 
dropout = 0.2
top_k = 10

Oliveyoung Dataset : Implicit Feedback 으로 전처리

In [None]:
ratings_df = pd.read_csv('/content/drive/MyDrive/BERT4Rec/olive_young_data.csv')
# 먼저 데이터셋 encoding
user_encoder, user_decoder = {}, {}

# label encoding이라고 생각
for idx, user_id in enumerate(ratings_df['mbrNo'].unique()):
    user_encoder[user_id] = idx
    user_decoder[idx] = user_id

# label encoding이라고 생각
item_encoder, item_decoder = {}, {}
for idx, item_id in enumerate(ratings_df['goodsNm'].unique()):
    item_encoder[item_id] = idx
    item_decoder[idx] = item_id
    
# 추후에 Embedding 과정을 거쳐서 보다 dense하게 만들어주어야함.
##torch.nn.Embedding(num_embeddings=len(ratings_df['mbrNo']),embedding_dim = ratings_df['mbrNo'].nunqiue())

ratings_df['en_userId'] = ratings_df['mbrNo'].apply(lambda x : user_encoder[x])
ratings_df['en_movieId'] = ratings_df['goodsNm'].apply(lambda x : item_encoder[x])
ratings_df_final = ratings_df
df = ratings_df[['en_userId','en_movieId','gdasScrVal']]

total_item_li = set(df['en_movieId'])
ratings_df = []
en_user_id_li = df['en_userId'].unique()
for en_user_id in tqdm(en_user_id_li):
    pos_recomencder_li = df[df['en_userId'] == en_user_id]['en_movieId'].tolist()
    neg_recomencder_li = np.random.choice(list(total_item_li - set(pos_recomencder_li)), 2 * len(pos_recomencder_li), replace = False).tolist()
    ratings_df += [[en_user_id, en_movieId, 1] for en_movieId in pos_recomencder_li] + [[en_user_id, en_movieId, 0] for en_movieId in neg_recomencder_li]

ratings_df = pd.DataFrame(ratings_df)
ratings_df.columns = ['en_userId','en_movieId','gdasScrVal']

test_df = ratings_df
test_df.columns = ['en_userid','en_productid','rating']
test_df.reset_index(drop=True,inplace=True)

100%|██████████| 6277/6277 [00:03<00:00, 1814.04it/s]


In [None]:
print('train_df: ','\n',train_df['rating'].value_counts(),'\n','val_df: ','\n',val_df['rating'].value_counts(),'\n','test_df: ','\n',test_df['rating'].value_counts())

train_df:  
 0    256386
1    121916
Name: rating, dtype: int64 
 val_df:  
 0    621423
1      6277
Name: rating, dtype: int64 
 test_df:  
 0    17816
1     8908
Name: rating, dtype: int64


In [None]:
train_dataset = CustomDataset(df = train_df)
train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, drop_last = False)

valid_dataset = CustomDataset(df = val_df)
valid_loader = DataLoader(valid_dataset, batch_size = batch_size, shuffle = True, drop_last = False)

test_dataset = CustomDataset(df = test_df)
test_loader = DataLoader(test_dataset, batch_size = 100, shuffle = False, drop_last = False)
     

GMF & MLP Train / Validation data로 학습

In [None]:
gmf = GMF(num_user = num_user, num_item = num_item, num_factor = num_factor).to(device)
gmf_optimizer = torch.optim.Adam(gmf.parameters(), lr = lr)

mlp = MLP(num_user = num_user, num_item = num_item, num_factor = num_factor, num_layers = num_layers, dropout = dropout).to(device)
mlp_optimizer = torch.optim.Adam(mlp.parameters(), lr = lr)

loss_fc = nn.BCELoss() 

In [None]:
gmf_best_metric = 0
mlp_best_metric = 0

for epoch in range(1, epochs + 1):
    gmf_train_loss,gmf_valid_loss = train(model = gmf, train_loader = train_loader, criterion = loss_fc, optimizer = gmf_optimizer)
    gmf_hr, gmf_ndcg,a,b = metrics(model = gmf, test_loader = test_loader, top_k = top_k)

    mlp_train_loss,mlp_valid_loss = train(model = mlp, train_loader = train_loader, criterion = loss_fc, optimizer = mlp_optimizer)
    mlp_hr, mlp_ndcg,a,b  = metrics(model = mlp, test_loader = test_loader, top_k = top_k)

    print(f"[EPOCH: {epoch}], GMF Train Loss: {gmf_train_loss:.4f}, MLP Train Loss: {mlp_train_loss:.4f}, GMF Valid Loss : {gmf_valid_loss:.4f}, MLP Valid Loss : {mlp_valid_loss:.4f}")
    print(f"GMF HR: {gmf_hr:.4f}, MLP HR: {mlp_hr:.4f}, GMF NDCG: {gmf_ndcg:.4f}, MLP NDCG: {mlp_ndcg:.4f}")
    print('--------------------------------------------------------','\n')

    print(f"[EPOCH: {epoch}], GMF Train Loss: {gmf_train_loss:.4f}, GMF Valid Loss : {gmf_valid_loss:.4f}")
    print(f"GMF HR: {gmf_hr:.4f}, GMF NDCG: {gmf_ndcg:.4f}")
    print('--------------------------------------------------------','\n')


    if gmf_best_metric < gmf_ndcg:
        gmf_best_metric = gmf_ndcg
        torch.save(gmf.state_dict(), model_dir + f'GMF.pt')

    if mlp_best_metric < mlp_ndcg:
        mlp_best_metric = mlp_ndcg
        torch.save(mlp.state_dict(), model_dir + f'MLP.pt')

[EPOCH: 1], GMF Train Loss: 0.4394, GMF Valid Loss : 0.0001
GMF HR: 0.3060, GMF NDCG: 0.1792
-------------------------------------------------------- 

[EPOCH: 2], GMF Train Loss: 0.1511, GMF Valid Loss : 0.0001
GMF HR: 0.2985, GMF NDCG: 0.1706
-------------------------------------------------------- 

[EPOCH: 3], GMF Train Loss: 0.0171, GMF Valid Loss : 0.0002
GMF HR: 0.3172, GMF NDCG: 0.1804
-------------------------------------------------------- 

[EPOCH: 4], GMF Train Loss: 0.0017, GMF Valid Loss : 0.0008
GMF HR: 0.3209, GMF NDCG: 0.1810
-------------------------------------------------------- 

[EPOCH: 5], GMF Train Loss: 0.0002, GMF Valid Loss : 0.0005
GMF HR: 0.3358, GMF NDCG: 0.1781
-------------------------------------------------------- 

[EPOCH: 6], GMF Train Loss: 0.0000, GMF Valid Loss : 0.0006
GMF HR: 0.3470, GMF NDCG: 0.1789
-------------------------------------------------------- 

[EPOCH: 7], GMF Train Loss: 0.0000, GMF Valid Loss : 0.0002
GMF HR: 0.3433, GMF NDCG: 0.

### NeuMF 학습

In [28]:
# NeuMF 의 경우 pre-trained 모델을 사용한다고 함
# 그리고 optimizer로 SGD를 사용한다고 함
# 실제로 Adam 보다 더 좋은 성능을 보임

gmf = GMF(num_user = num_user, num_item = num_item, num_factor = num_factor).to(device)
gmf.load_state_dict(torch.load(model_dir + f'GMF.pt'))

mlp = MLP(num_user = num_user, num_item = num_item, num_factor = num_factor, num_layers = num_layers, dropout = dropout).to(device)
mlp.load_state_dict(torch.load(model_dir + f'MLP.pt'))

nmf = NeuMF(GMF = gmf, MLP = mlp, num_factor = num_factor,num_layers = num_layers).to(device)
nmf_optimizer = torch.optim.SGD(nmf.parameters(), lr = lr, momentum = 0.9)

In [None]:
nmf_best_metric = 0
nmf_train_loss_lst = []; nmf_valid_loss_lst = []; nmf_hr_lst = []; nmf_ndcg_lst = []
for epoch in range(1, epochs + 1):
    nmf_train_loss,nmf_valid_loss = train(model = nmf, train_loader = train_loader, criterion = loss_fc, optimizer = nmf_optimizer)
    nmf_hr, nmf_ndcg,a,b = metrics(model = nmf, test_loader = test_loader, top_k = top_k)

    nmf_train_loss_lst.append(nmf_train_loss);nmf_valid_loss_lst.append(nmf_valid_loss)
    nmf_hr_lst.append(nmf_hr); nmf_ndcg_lst.append(nmf_ndcg)
    print(f"[EPOCH: {epoch}], NeuMF Train Loss: {nmf_train_loss:.4f},NeuMF Valid Loss: {nmf_valid_loss:.4f} ,NeuMF HR: {nmf_hr:.4f}, NeuMF NDCG: {nmf_ndcg:.4f}")

    if nmf_best_metric < nmf_ndcg:
        nmf_best_metric = nmf_ndcg
        torch.save(nmf.state_dict(), model_dir + f'NeuMF.pt')

[EPOCH: 1], NeuMF Train Loss: 0.0795,NeuMF Valid Loss: 0.0001 ,NeuMF HR: 0.2948, NeuMF NDCG: 0.1554
[EPOCH: 2], NeuMF Train Loss: 0.0157,NeuMF Valid Loss: 0.0002 ,NeuMF HR: 0.2985, NeuMF NDCG: 0.1595
[EPOCH: 3], NeuMF Train Loss: 0.0093,NeuMF Valid Loss: 0.0001 ,NeuMF HR: 0.2985, NeuMF NDCG: 0.1616
[EPOCH: 4], NeuMF Train Loss: 0.0066,NeuMF Valid Loss: 0.0001 ,NeuMF HR: 0.2985, NeuMF NDCG: 0.1572
[EPOCH: 5], NeuMF Train Loss: 0.0052,NeuMF Valid Loss: 0.0002 ,NeuMF HR: 0.2985, NeuMF NDCG: 0.1583
[EPOCH: 6], NeuMF Train Loss: 0.0043,NeuMF Valid Loss: 0.0001 ,NeuMF HR: 0.2985, NeuMF NDCG: 0.1597
[EPOCH: 7], NeuMF Train Loss: 0.0036,NeuMF Valid Loss: 0.0001 ,NeuMF HR: 0.2948, NeuMF NDCG: 0.1595
[EPOCH: 8], NeuMF Train Loss: 0.0031,NeuMF Valid Loss: 0.0001 ,NeuMF HR: 0.2910, NeuMF NDCG: 0.1592
[EPOCH: 9], NeuMF Train Loss: 0.0028,NeuMF Valid Loss: 0.0001 ,NeuMF HR: 0.2910, NeuMF NDCG: 0.1589
[EPOCH: 10], NeuMF Train Loss: 0.0025,NeuMF Valid Loss: 0.0002 ,NeuMF HR: 0.2910, NeuMF NDCG: 0.1591

KeyboardInterrupt: ignored

### 평가

- 올리브영 데이터셋으로

In [None]:
oliveyoung_dataset = CustomDataset(df = test_df)
oliveyoung_loader = DataLoader(oliveyoung_dataset, batch_size = 1, shuffle = True, drop_last = False)

In [None]:
nmf = NeuMF(GMF = gmf, MLP = mlp, num_factor = num_factor,num_layers=num_layers).to(device)
nmf.load_state_dict(torch.load(model_dir + f'NeuMF.pt'))

gmf = GMF(num_user = num_user, num_item = num_item, num_factor = num_factor).to(device)
gmf.load_state_dict(torch.load(model_dir + f'GMF.pt'))

mlp = MLP(num_user = num_user, num_item = num_item, num_factor = num_factor, num_layers = num_layers, dropout = dropout).to(device)
mlp.load_state_dict(torch.load(model_dir + f'MLP.pt'))

gmf_hr, gmf_ndcg,predictions,recommend = metrics(model = gmf, test_loader = test_loader, top_k = top_k)
mlp_hr, mlp_ndcg,predictions,recommend  = metrics(model = mlp, test_loader = test_loader, top_k = top_k)
nmf_hr, nmf_ndcg,predictions,recommend = metrics(model = nmf, test_loader = test_loader, top_k = top_k)

print(f"NeuMF HR: {nmf_hr:.4f}, NeuMF NDCG: {nmf_ndcg:.4f}, \n MLP HR: {mlp_hr:.4f}, MLP NDCG: {mlp_ndcg:.4f} \n GMF HR: {gmf_hr:.4f}, GMF NDCG: {gmf_ndcg:.4f}")

NeuMF HR: 0.2985, NeuMF NDCG: 0.1616, 
 MLP HR: 0.3172, MLP NDCG: 0.1745 
 GMF HR: 0.3433, GMF NDCG: 0.1808


### Inference

Dataset 생성
- Inference 하기 위해서 한 user당 모든 아이템을 inference할 수 있도록 데이터프레임 구성

In [30]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/BOAZ/분석/BASE/MINI2/DATA/olive_young_data.csv')

In [31]:
user_list = []; item_list = []; rating_list = []
item_unique = list(df['goodsNm'].unique())
for _user in list(df['mbrNo'].unique()):
  rating_0 = list(df[df['mbrNo']==_user].goodsNm.unique())
  for _item in item_unique:
    if _item in rating_0:
      rating_list.append(1)
      user_list.append(_user)
      item_list.append(_item)
    else:
      rating_list.append(0)
      user_list.append(_user)
      item_list.append(_item)

In [32]:
final_df = pd.DataFrame({'mbrNo':user_list,'goodsNm':item_list,'gdasScrVal':rating_list})

In [33]:
# 먼저 데이터셋 encoding
user_encoder, user_decoder = {}, {}

# label encoding이라고 생각
for idx, user_id in enumerate(final_df['mbrNo'].unique()):
    user_encoder[user_id] = idx
    user_decoder[idx] = user_id

# label encoding이라고 생각
item_encoder, item_decoder = {}, {}
for idx, item_id in enumerate(final_df['goodsNm'].unique()):
    item_encoder[item_id] = idx
    item_decoder[idx] = item_id
    
# 추후에 Embedding 과정을 거쳐서 보다 dense하게 만들어주어야함.
##torch.nn.Embedding(num_embeddings=len(ratings_df['mbrNo']),embedding_dim = ratings_df['mbrNo'].nunqiue())

final_df['en_mbrNo'] = final_df['mbrNo'].apply(lambda x : user_encoder[x])
final_df['en_goodsNm'] = final_df['goodsNm'].apply(lambda x : item_encoder[x])
ratings_df_final = final_df
final_df = final_df[['en_mbrNo','en_goodsNm','gdasScrVal']]
# ratings_df['gdasScrVal'] = ratings_df['gdasScrVal'].apply(lambda x : x//2)

test_df = final_df
test_df.columns = ['en_userid','en_productid','rating']
test_df.reset_index(drop=True,inplace=True)

In [34]:
oliveyoung_dataset = CustomDataset(df = test_df)
oliveyoung_loader = DataLoader(oliveyoung_dataset, batch_size = 1, shuffle = True, drop_last = False)

In [39]:
def inference(model,dataloader):
  output_list = []
  for user,item,label in tqdm(dataloader):
        user = user.to(device)
        item = item.to(device)
        label = label.to(device)
        label = label.float()
        output = model(user, item)
        output_list.append(output.detach().cpu().numpy())
  return output_list

pred = inference(gmf,oliveyoung_loader)


100%|██████████| 338958/338958 [04:04<00:00, 1386.59it/s]


In [40]:
# 아래와 같이 user가 어떤 제품에 평점을 몇점 정도 매길지 확인이 가능.
pred_df = ratings_df_final[['mbrNo','goodsNm','gdasScrVal']]
pred_df['predict_gdasScrVal'] = [float(pr[0]) for pr in pred]
pred_df

Unnamed: 0,mbrNo,goodsNm,gdasScrVal,predict_gdasScrVal
0,M0000012102162,[선우PICK]메디힐 티트리 진정 패드 더블 기획 (100매+100매 리필),1,1.031415e-10
1,M0000012102162,[선우PICK]메디힐 티트리 진정 패드 100매,0,1.796726e-11
2,M0000012102162,크리넥스 99.9% 항균 안심물티슈 10매,0,1.000000e+00
3,M0000012102162,C08,0,1.000000e+00
4,M0000012102162,"C08""",0,3.530038e-11
...,...,...,...,...
338953,M0000004588891,아이디얼 포맨 프레시 올인원 기획 (본품150ml+50ml 증정),0,1.921792e-03
338954,M0000004588891,필리밀리 손톱깎이 (S),0,3.792054e-09
338955,M0000004588891,W.DRESSROOM 퍼퓸 핸드크림 No.30 화이트머스크 50ml,0,9.648497e-01
338956,M0000004588891,필리밀리 코털가위,0,6.523021e-06


In [41]:
from tqdm import tqdm
res = []; user_lst = []; pred_rate = []
for _user in tqdm(list(pred_df['mbrNo'].unique())):
  temp = pred_df[pred_df['mbrNo']==_user]
  final_temp = temp[temp['gdasScrVal']==0]
  res.append(list(final_temp[final_temp['mbrNo']==_user].sort_values(by='predict_gdasScrVal',ascending=False)[:5].goodsNm))
  pred_rate.append(list(final_temp[final_temp['mbrNo']==_user].sort_values(by='predict_gdasScrVal',ascending=False)[:5].predict_gdasScrVal))
  user_lst.append(_user)

100%|██████████| 6277/6277 [02:17<00:00, 45.65it/s]


In [42]:
res_df = pd.DataFrame({'userId':user_lst, 'top_5_name':res,'prediction_rate':pred_rate})
res_df.to_csv('NCF_Res_GMF.csv',index=False)