In [None]:
!pip install python-box

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import math
import numpy as np
import scipy.sparse as sp
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from box import Box

import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings — consider narrowing it.
warnings.filterwarnings(action='ignore')
# Print tensors in scientific notation.
torch.set_printoptions(sci_mode=True)

In [None]:
# Mount Google Drive into the Colab filesystem under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd /content/drive/Shareddrives/DSL_Modeling_B

/content/drive/Shareddrives/DSL_Modeling_B


0. 텍스트 합치기

In [None]:
# Load the two menu tables; both paths are relative to the current working directory.
menu1 = pd.read_csv('dacon_menu_final.csv')
menu2 = pd.read_csv('01가락고_final.csv')

In [None]:
# Preview the first table as loaded.
menu1

Unnamed: 0.1,Unnamed: 0,SessionID,Menu,MenuID
0,0,0,찐빵,0
1,1,0,오징어찌개,1
2,2,0,육개장,2
3,3,0,단호박샌드,3
4,4,0,김치찌개,4
...,...,...,...,...
135535,135535,4517,유채나물무침,1990
135536,135536,4517,채소계란찜,1517
135537,135537,4517,애호박나물볶음,2341
135538,135538,4517,청포묵*양념간장,1991


In [None]:
# Number the rows 1..ITEMS_PER_SESSION cyclically by position. The cycle count is taken
# from len(menu1) instead of a hard-coded number of sessions.
# NOTE(review): assumes every session occupies 30 consecutive rows — confirm against the data.
ITEMS_PER_SESSION = 30
assert len(menu1) % ITEMS_PER_SESSION == 0, "menu1 must contain whole blocks of 30 rows"
menu1['timestamp'] = np.arange(len(menu1)) % ITEMS_PER_SESSION + 1

In [None]:
# Preview the first table again, now with the positional `timestamp` column.
menu1

Unnamed: 0.1,Unnamed: 0,SessionID,Menu,MenuID,timestamp
0,0,0,찐빵,0,1
1,1,0,오징어찌개,1,2
2,2,0,육개장,2,3
3,3,0,단호박샌드,3,4
4,4,0,김치찌개,4,5
...,...,...,...,...,...
135535,135535,4517,유채나물무침,1990,26
135536,135536,4517,채소계란찜,1517,27
135537,135537,4517,애호박나물볶음,2341,28
135538,135538,4517,청포묵*양념간장,1991,29


In [None]:
# Show the first rows of the second table.
menu2.head()

Unnamed: 0.1,Unnamed: 0,timestamp,Menu,MenuID,SessionID
0,0,20200701,김치볶음밥,2342.0,0
1,1,20200702,찜닭,655.0,0
2,2,20200703,부대찌개,54.0,0
3,3,20200706,짬뽕,63.0,0
4,4,20200707,오리불고기,868.0,0


In [None]:
# Offset that moves menu2's SessionIDs past the largest SessionID in menu1, so the two
# tables do not share session ids. Derived from the data instead of a hard-coded count.
menu2['sum'] = menu1['SessionID'].max() + 1

In [None]:
# Shift menu2's session ids by the per-row offset stored in the `sum` column.
# NOTE(review): re-running this cell applies the shift again.
menu2['SessionID'] = menu2['SessionID'].add(menu2['sum'])

In [None]:
# Preview the second table after the SessionID shift.
menu2

Unnamed: 0.1,Unnamed: 0,timestamp,Menu,MenuID,SessionID,sum
0,0,20200701,김치볶음밥,2342.0,4518,4518
1,1,20200702,찜닭,655.0,4518,4518
2,2,20200703,부대찌개,54.0,4518,4518
3,3,20200706,짬뽕,63.0,4518,4518
4,4,20200707,오리불고기,868.0,4518,4518
...,...,...,...,...,...,...
404,404,20221025,두부조림,1068.0,4531,4518
405,405,20221026,김밥볶음밥,2528.0,4531,4518
406,406,20221027,콩비지찌개,37.0,4531,4518
407,407,20221028,곤드레밥,2529.0,4531,4518


In [None]:
# Stack both tables row-wise. join='inner' keeps only the columns present in both frames,
# so menu2's helper column `sum` is not carried over.
# NOTE(review): the original row labels are kept, so index values repeat across the two
# tables — consider ignore_index=True.
menu = pd.concat([menu1,menu2],axis=0, join='inner') 
menu

Unnamed: 0.1,Unnamed: 0,SessionID,Menu,MenuID,timestamp
0,0,0,찐빵,0.0,1
1,1,0,오징어찌개,1.0,2
2,2,0,육개장,2.0,3
3,3,0,단호박샌드,3.0,4
4,4,0,김치찌개,4.0,5
...,...,...,...,...,...
404,404,4531,두부조림,1068.0,20221025
405,405,4531,김밥볶음밥,2528.0,20221026
406,406,4531,콩비지찌개,37.0,20221027
407,407,4531,곤드레밥,2529.0,20221028


In [None]:
# Sanity check: merged table shape and number of distinct sessions.
print(menu.shape,menu["SessionID"].nunique())

(135949, 5) 4532


In [None]:
# Save the merged table. index=False keeps the (repeating) row labels out of the file,
# so reading it back does not add yet another "Unnamed: ..." column.
menu.to_csv("dacon_menu_sj2.csv", index=False)

1. 학습 설정

In [None]:
# Experiment settings, wrapped in a Box so entries can be read as attributes.
config = Box({
    'data_path' : "/content/drive/Shareddrives/DSL_Modeling_B", # directory holding the data files
    'valid_samples' : 10, # number of items held out per user for validation
    'seed' : 22,
})

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

2. 데이터 전처리

In [None]:
class MakeMatrixDataSet():
    """
    Build user-item interaction data from the merged menu CSV.

    Reads ``dacon_menu_sj2.csv`` from ``config.data_path``, maps the raw ``MenuID`` and
    ``SessionID`` values to contiguous indices, and splits every user's items into a
    train list and a validation list of ``config.valid_samples`` items.
    """
    def __init__(self, config):
        self.config = config
        self.df = pd.read_csv(os.path.join(self.config.data_path, 'dacon_menu_sj2.csv'))

        self.item_encoder, self.item_decoder = self.generate_encoder_decoder('MenuID')
        self.user_encoder, self.user_decoder = self.generate_encoder_decoder('SessionID')
        self.num_item, self.num_user = len(self.item_encoder), len(self.user_encoder)

        self.df['item_idx'] = self.df['MenuID'].apply(lambda x : self.item_encoder[x])
        self.df['user_idx'] = self.df['SessionID'].apply(lambda x : self.user_encoder[x])

        self.user_train, self.user_valid = self.generate_sequence_data()

    def generate_encoder_decoder(self, col : str) -> tuple:
        """
        Build the id <-> index mappings for one column.

        Args:
            col (str): name of the column in ``self.df`` to encode
        Returns:
            tuple: ``(encoder, decoder)`` where ``encoder`` maps each distinct raw value to
            its position of first appearance and ``decoder`` is the inverse mapping
        """
        ids = self.df[col].unique()
        encoder = {_id: idx for idx, _id in enumerate(ids)}
        decoder = {idx: _id for idx, _id in enumerate(ids)}
        return encoder, decoder

    def generate_sequence_data(self) -> tuple:
        """
        Split each user's items into train and validation lists.

        Returns:
            tuple: ``(user_train, user_valid)``, two dicts keyed by user index
        Raises:
            ValueError: if a user has fewer items than ``config.valid_samples``
        """
        users = defaultdict(list)
        for user, item in zip(self.df['user_idx'], self.df['item_idx']):
            users[user].append(item)

        n_valid = self.config.valid_samples
        user_train = {}
        user_valid = {}
        for user, user_total in users.items():
            if len(user_total) < n_valid:
                raise ValueError(
                    f'user {user} has {len(user_total)} items, fewer than valid_samples={n_valid}'
                )

            # A private generator seeded per user draws exactly what
            # np.random.seed(seed) + np.random.choice(...) would, without resetting the
            # process-wide NumPy random state as a side effect.
            rng = np.random.RandomState(self.config.seed)
            valid = rng.choice(user_total, size = n_valid, replace = False).tolist()
            # Every item that was drawn for validation is excluded from train.
            train = list(set(user_total) - set(valid))

            user_train[user] = train
            user_valid[user] = valid # hold out valid_samples items per user for validation

        return user_train, user_valid

    def get_train_valid_data(self):
        """Return the ``(user_train, user_valid)`` dicts built in ``__init__``."""
        return self.user_train, self.user_valid

    def make_matrix(self, user_list, train = True):
        """
        Build a dense 0/1 interaction matrix for a batch of users.

        Args:
            user_list: tensor of user indices; one matrix row is produced per entry
            train (bool): mark only train items when True, train + valid items otherwise
        Returns:
            torch.Tensor: matrix of shape ``(len(user_list), num_item)``
        """
        mat = torch.zeros(size = (user_list.size(0), self.num_item))
        for idx, user in enumerate(user_list):
            if train:
                mat[idx, self.user_train[user.item()]] = 1
            else:
                mat[idx, self.user_train[user.item()] + self.user_valid[user.item()]] = 1
        return mat

    def make_sparse_matrix(self):
        """
        Build the sparse 0/1 train interaction matrix.

        Returns:
            scipy.sparse.csr_matrix: float32 matrix of shape ``(num_user, num_item)``
        """
        rows, cols = [], []
        for user, item_list in self.user_train.items():
            rows.extend([user] * len(item_list))
            cols.extend(item_list)

        # Build the matrix in one shot from coordinate lists instead of assigning row by row.
        X = sp.csr_matrix(
            (
                np.ones(len(rows), dtype=np.float32),
                (np.asarray(rows, dtype=np.int64), np.asarray(cols, dtype=np.int64)),
            ),
            shape=(self.num_user, self.num_item),
            dtype=np.float32,
        )
        # Repeated (user, item) pairs are summed during construction; keep the matrix binary.
        X.data.fill(1.0)
        return X

In [None]:
class AEDataSet(Dataset):
    """Dataset whose samples are the user indices ``0 .. num_user - 1``."""

    def __init__(self, num_user):
        self.num_user = num_user
        self.users = list(range(num_user))

    def __len__(self):
        """Number of users."""
        return self.num_user

    def __getitem__(self, idx):
        """Return the user index stored at position ``idx`` as a one-element LongTensor."""
        return torch.LongTensor([self.users[idx]])

3. 모델

In [None]:
import numpy as np
from scipy import sparse
from copy import deepcopy

class AdmmSlim():
    """
    Item-item linear recommender fitted with ADMM.

    Args:
        lambda_1: L1 strength; ``lambda_1 / rho`` is the soft-threshold applied to the weights
        lambda_2: value added (together with ``rho``) to the diagonal of ``X^T X`` before inversion
        rho: ADMM penalty parameter
        positive: when True, weights are clipped to be non-negative
        n_iter: maximum number of ADMM iterations
        eps_rel: relative tolerance of the stopping test
        eps_abs: absolute tolerance of the stopping test
        verbose: print progress messages while fitting
    """
    def __init__(self, lambda_1=10, lambda_2=5, rho=1000, positive=True, n_iter=50, eps_rel=1e-4, eps_abs=1e-3, verbose=False):
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        self.rho = rho
        self.positive = positive
        self.n_iter = n_iter
        self.eps_rel = eps_rel
        self.eps_abs = eps_abs
        self.verbose = verbose

    def soft_thresholding(self, B, Gamma):
        """
        Update of the auxiliary matrix C from B and the multiplier Gamma.

        Returns ``B + Gamma / rho`` shrunk towards zero by ``lambda_1 / rho``; with
        ``positive=True`` everything at or below the threshold becomes 0.
        """
        if self.lambda_1 == 0:
            if self.positive:
                # No L1 term: only the non-negativity constraint remains, i.e. a
                # projection of B + Gamma / rho onto the non-negative values.
                return np.maximum(B + Gamma / self.rho, 0)
            else:
                return B
        else:
            x = B + Gamma / self.rho
            threshold = self.lambda_1 / self.rho
            if self.positive:
                return np.where(threshold < x, x - threshold, 0)
            else:
                return np.where(threshold < x, x - threshold,
                                np.where(x < - threshold, x + threshold, 0))

    def is_converged(self, B, C, C_old, Gamma):
        """
        ADMM stopping test: both the primal residual ``||B - C||`` and the dual residual
        ``rho * ||C - C_old||`` must fall below their tolerances.
        """
        B_norm = np.linalg.norm(B)
        C_norm = np.linalg.norm(C)
        Gamma_norm = np.linalg.norm(Gamma)

        # Tolerance = absolute part + relative part. The relative part is added, so the
        # tolerance grows with the scale of the iterates and can never become negative.
        eps_primal = self.eps_abs * B.shape[0] + self.eps_rel * np.max([B_norm, C_norm])
        eps_dual = self.eps_abs * B.shape[0] + self.eps_rel * Gamma_norm

        R_primal_norm = np.linalg.norm(B - C)
        R_dual_norm = np.linalg.norm(C  - C_old) * self.rho

        converged = R_primal_norm < eps_primal and R_dual_norm < eps_dual
        return converged

    def fit(self, X):
        """
        Learn the item-item weights from the interaction matrix ``X`` (users x items).

        Stores the learned weights on ``self.coef`` and the score matrix ``X @ coef`` on
        ``self.pred`` (a torch tensor).
        """
        XtX = X.T.dot(X)
        if sparse.issparse(XtX):
            XtX = XtX.toarray()

        if self.verbose:
            print(' --- init')
        # (X^T X + (lambda_2 + rho) I)^-1, inverted in float64 and stored as float32.
        regularized = XtX.astype(np.float64)
        regularized[np.diag_indices_from(regularized)] += self.lambda_2 + self.rho
        P = np.linalg.inv(regularized).astype(np.float32)
        B_aux = P.dot(XtX)

        Gamma = np.zeros_like(XtX, dtype=np.float32)
        C = np.zeros_like(XtX, dtype=np.float32)

        if self.verbose:
            print(' --- iteration start.')
        for step in range(self.n_iter):
            if self.verbose:
                print(f' --- iteration {step+1}/{self.n_iter}')
            C_old = C.copy()
            B_tilde = B_aux + P.dot(self.rho * C - Gamma)
            # Per-column correction that drives the diagonal of B to zero
            # (the small constant guards against division by zero).
            gamma = np.diag(B_tilde) / (np.diag(P) + 1e-8)
            B = B_tilde - P * gamma
            C = self.soft_thresholding(B, Gamma)
            Gamma = Gamma + self.rho * (B - C)
            if self.is_converged(B, C, C_old, Gamma):
                if self.verbose:
                    print(f' --- Converged. Stopped iteration.')
                break

        coef = C
        self.coef = coef

        self.pred = torch.from_numpy(X.dot(coef))

4. 학습 함수

In [None]:
def get_ndcg(pred_list, true_list):
    """
    NDCG of a ranked recommendation list against a set of relevant items.

    Args:
        pred_list: recommended items, best first
        true_list: relevant (held-out) items
    Returns:
        float: DCG of ``pred_list`` divided by the DCG of an ideal ranking, in [0, 1];
        0.0 when either list is empty
    """
    relevant = set(true_list)
    # An ideal ranking places as many relevant items as fit at the top positions,
    # starting at rank 0 (whose gain is 1 / log2(2) = 1).
    ideal_hits = min(len(pred_list), len(relevant))
    if ideal_hits == 0:
        return 0.0

    idcg = sum(1 / np.log2(rank + 2) for rank in range(ideal_hits))
    dcg = sum(1 / np.log2(rank + 2) for rank, pred in enumerate(pred_list) if pred in relevant)
    return float(dcg / idcg)

# With equally long prediction and ground-truth lists, hit, recall and precision coincide.
def get_hit(pred_list, true_list):
    """Fraction of ``true_list`` entries covered by the distinct items shared with ``pred_list``."""
    relevant = set(true_list)
    n_matched = len(relevant.intersection(pred_list))
    return n_matched / len(true_list)

def evaluate(model, X, user_train, user_valid, k = 10):
    """
    Compute mean NDCG@k and HIT@k of ``model.pred`` on the held-out items.

    Args:
        model: fitted model exposing a ``pred`` tensor of scores, one row per user
        X: 0/1 train interaction matrix (dense array, ``np.matrix`` or scipy sparse);
           items marked 1 are excluded from the recommendations
        user_train: dict of train items per user; its length is the averaging denominator
        user_valid: dict of held-out items per user, keyed by row index
        k (int): number of recommendations per user
    Returns:
        tuple: ``(NDCG, HIT)`` averaged over users
    Raises:
        ValueError: if ``k`` is smaller than 1
    """
    if k < 1:
        raise ValueError('k must be at least 1')

    if sp.issparse(X):
        X = X.toarray()
    seen = torch.from_numpy(np.asarray(X)) == 1

    # Work on a copy: on a CPU tensor .cpu() returns the very same object, so masking
    # in place would overwrite the scores stored on the model.
    scores = model.pred.cpu().clone()
    scores[seen] = -np.inf
    ranked = scores.argsort(dim = 1)

    NDCG = 0.0 # NDCG@k
    HIT = 0.0 # HIT@k

    for user, rec in tqdm(enumerate(ranked), total = len(ranked)):
        uv = user_valid[user]

        # argsort is ascending: take the last k entries and reverse them (best first)
        up = rec[-k:].numpy().tolist()[::-1]

        NDCG += get_ndcg(pred_list = up, true_list = uv)
        HIT += get_hit(pred_list = up, true_list = uv)

    NDCG /= len(user_train)
    HIT /= len(user_train)

    return NDCG, HIT

5. 학습

In [None]:
# Load the merged CSV, encode ids and split each user's items into train / valid.
make_matrix_data_set = MakeMatrixDataSet(config = config)
user_train, user_valid = make_matrix_data_set.get_train_valid_data()
# Sparse user x item matrix holding the train interactions only.
X = make_matrix_data_set.make_sparse_matrix()

In [None]:
# Fit ADMM-SLIM on the train interactions, then score the held-out items.
model = AdmmSlim(n_iter=10)
model.fit(X = X)
# toarray() yields a plain ndarray; todense() would return the discouraged np.matrix type.
ndcg, hit = evaluate(model = model, X = X.toarray(), user_train = user_train, user_valid = user_valid)
print(f'NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')

4532it [00:00, 29082.95it/s]

NDCG@10: 0.17258| HIT@10: 0.12376



