# EXPLORATION_16 다음에 볼 영화 예측하기

## 진행 과정
1. Data Preprocess
    - Data Load
    - Session Length
    - Session Time
    - Data split
   
   
2. Data Pipeline
    - Session Dataset
    - Session Data Loader
    
    
3. Modeling
    - Evaluation Matric
    - Model Architecture
    - Model Training


4. 회고 및 정리

### Data Preprocess - Data Load

In [1]:
# 사용할 라이브러리 불러오기

import matplotlib.pyplot as plt
import datetime as dt
from pathlib import Path
import os

import tensorflow as tf
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 데이터 불러오기

data_path = Path(os.getenv('HOME')+'/aiffel/yoochoose/data/') 
train_path = data_path / 'ratings.dat'

def load_data(data_path: Path, nrows=None):
    data = pd.read_csv(data_path, sep='::', header=None, usecols=[0, 1, 2, 3], dtype={0: np.int32, 1: np.int32, 2: np.int32}, nrows=nrows)
    data.columns = ['UserId', 'ItemId', 'Rating', 'Time']
    return data

data = load_data(train_path, None)
data.sort_values(['UserId', 'Time'], inplace=True)  # data를 id와 시간 순서로 정렬해줍니다.
data

Unnamed: 0,UserId,ItemId,Rating,Time
31,1,3186,4,978300019
22,1,1270,5,978300055
27,1,1721,4,978300055
37,1,1022,5,978300055
24,1,2340,3,978300103
...,...,...,...,...
1000019,6040,2917,4,997454429
999988,6040,1921,4,997454464
1000172,6040,1784,3,997454464
1000167,6040,161,3,997454486


In [3]:
# Data 정보 확인

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 31 to 1000042
Data columns (total 4 columns):
 #   Column  Non-Null Count    Dtype
---  ------  --------------    -----
 0   UserId  1000209 non-null  int32
 1   ItemId  1000209 non-null  int32
 2   Rating  1000209 non-null  int32
 3   Time    1000209 non-null  int64
dtypes: int32(3), int64(1)
memory usage: 26.7 MB


In [4]:
# Time을 표준시로 변경해주기 - 현재 시간이 UTC time으로 되어 있기 때문에 변경해주는 것이 작업하기가 쉽다

data['Time']=pd.to_datetime(data['Time'],unit='s')
data

Unnamed: 0,UserId,ItemId,Rating,Time
31,1,3186,4,2000-12-31 22:00:19
22,1,1270,5,2000-12-31 22:00:55
27,1,1721,4,2000-12-31 22:00:55
37,1,1022,5,2000-12-31 22:00:55
24,1,2340,3,2000-12-31 22:01:43
...,...,...,...,...
1000019,6040,2917,4,2001-08-10 14:40:29
999988,6040,1921,4,2001-08-10 14:41:04
1000172,6040,1784,3,2001-08-10 14:41:04
1000167,6040,161,3,2001-08-10 14:41:26


  - 사용하고 있는 데이터에는 Session에 대한 내용이 들어있지 않고 User 항목이 들어가 있다.
  - Session에 대한 기준이 필요하기 때문에 user와 time을 이용해서 Session을 정의해준다.

In [5]:
df = data.groupby(['UserId', 'Time'])['ItemId'].count().reset_index()
df.reset_index(inplace=True)
df

Unnamed: 0,index,UserId,Time,ItemId
0,0,1,2000-12-31 22:00:19,1
1,1,1,2000-12-31 22:00:55,3
2,2,1,2000-12-31 22:01:43,1
3,3,1,2000-12-31 22:02:52,1
4,4,1,2000-12-31 22:04:35,1
...,...,...,...,...
471158,471158,6040,2001-08-10 14:39:58,1
471159,471159,6040,2001-08-10 14:40:29,1
471160,471160,6040,2001-08-10 14:41:04,2
471161,471161,6040,2001-08-10 14:41:26,1


In [6]:
data = pd.merge(data, df, on=['UserId', 'Time'])
data.drop(columns='ItemId_y', inplace=True)
data.columns = ['UserId', 'ItemId', 'Rating', 'Time', 'SessionId']
data

Unnamed: 0,UserId,ItemId,Rating,Time,SessionId
0,1,3186,4,2000-12-31 22:00:19,0
1,1,1270,5,2000-12-31 22:00:55,1
2,1,1721,4,2000-12-31 22:00:55,1
3,1,1022,5,2000-12-31 22:00:55,1
4,1,2340,3,2000-12-31 22:01:43,2
...,...,...,...,...,...
1000204,6040,2917,4,2001-08-10 14:40:29,471159
1000205,6040,1921,4,2001-08-10 14:41:04,471160
1000206,6040,1784,3,2001-08-10 14:41:04,471160
1000207,6040,161,3,2001-08-10 14:41:26,471161


In [7]:
# id의 수와 item의 수 확인하기

data['SessionId'].nunique(), data['ItemId'].nunique()

(471163, 3706)

### Data Preprocess - Session Lenght

  - 각 세션이 대략 몇 개의 클릭 데이터를 가지는지 살펴본다.

In [8]:
session_length = data.groupby('SessionId').size()
session_length

SessionId
0         1
1         3
2         1
3         1
4         1
         ..
471158    1
471159    1
471160    2
471161    1
471162    1
Length: 471163, dtype: int64

In [9]:
session_length.median(), session_length.mean()

(2.0, 2.1228513274599234)

In [10]:
session_length.min(), session_length.max()

(1, 30)

In [11]:
session_length.quantile(0.999)

10.0

### Data Preprocess - Session Time

  - 추천 시스템을 구축할 때에는 최근 소비 트렌드를 학습하는 것이 중요하다.
  - 그렇기 때문에 데이터가 발생한 시간에 대해서 살펴봐야 한다.

In [12]:
oldest, latest = data['Time'].min(), data['Time'].max()
print(oldest) 
print(latest)

2000-04-25 23:05:32
2003-02-28 17:49:50


### Data Cleansing

  - Rating을 통해서 영화 평점이 3점 이상인 영화값만을 제외하고 나머지는 제거해준다.
  - 길이가 30인 세션의 경우 1초만에 30개의 영화를 평가했으므로 이상치로 판단하여 제거한다.

In [13]:
long_session = session_length[session_length==30].index[0]
display(data[data['SessionId']==long_session])
data[data['SessionId']==long_session].shape

Unnamed: 0,UserId,ItemId,Rating,Time,SessionId
112347,731,3044,4,2000-11-29 20:06:42,55117
112348,731,1455,3,2000-11-29 20:06:42,55117
112349,731,1639,5,2000-11-29 20:06:42,55117
112350,731,3244,4,2000-11-29 20:06:42,55117
112351,731,1656,2,2000-11-29 20:06:42,55117
112352,731,3426,4,2000-11-29 20:06:42,55117
112353,731,1829,2,2000-11-29 20:06:42,55117
112354,731,2675,4,2000-11-29 20:06:42,55117
112355,731,802,3,2000-11-29 20:06:42,55117
112356,731,803,5,2000-11-29 20:06:42,55117


(30, 5)

In [14]:
data = data.loc[data['SessionId'] != long_session]
data

Unnamed: 0,UserId,ItemId,Rating,Time,SessionId
0,1,3186,4,2000-12-31 22:00:19,0
1,1,1270,5,2000-12-31 22:00:55,1
2,1,1721,4,2000-12-31 22:00:55,1
3,1,1022,5,2000-12-31 22:00:55,1
4,1,2340,3,2000-12-31 22:01:43,2
...,...,...,...,...,...
1000204,6040,2917,4,2001-08-10 14:40:29,471159
1000205,6040,1921,4,2001-08-10 14:41:04,471160
1000206,6040,1784,3,2001-08-10 14:41:04,471160
1000207,6040,161,3,2001-08-10 14:41:26,471161


In [15]:
data = data[data['Rating'] >= 3]
data

Unnamed: 0,UserId,ItemId,Rating,Time,SessionId
0,1,3186,4,2000-12-31 22:00:19,0
1,1,1270,5,2000-12-31 22:00:55,1
2,1,1721,4,2000-12-31 22:00:55,1
3,1,1022,5,2000-12-31 22:00:55,1
4,1,2340,3,2000-12-31 22:01:43,2
...,...,...,...,...,...
1000204,6040,2917,4,2001-08-10 14:40:29,471159
1000205,6040,1921,4,2001-08-10 14:41:04,471160
1000206,6040,1784,3,2001-08-10 14:41:04,471160
1000207,6040,161,3,2001-08-10 14:41:26,471161


### Data Split

  - 모델 평가를 위해 valid set과 test set을 만들어줘야 한다.

In [16]:
def split_by_date(data: pd.DataFrame, n_days: int):
    final_time = data['Time'].max()
    session_last_time = data.groupby('SessionId')['Time'].max()
    session_in_train = session_last_time[session_last_time < final_time - dt.timedelta(n_days)].index
    session_in_test = session_last_time[session_last_time >= final_time - dt.timedelta(n_days)].index

    before_date = data[data['SessionId'].isin(session_in_train)]
    after_date = data[data['SessionId'].isin(session_in_test)]
    after_date = after_date[after_date['ItemId'].isin(before_date['ItemId'])]
    return before_date, after_date

In [17]:
# test set은 최근 3개월까지의 데이터로 구성하고 val set은 최근 1년까지의 데이터로 구성한다.

tr, test = split_by_date(data, n_days=100)
tr, val = split_by_date(tr, n_days=365)

In [18]:
# data에 대한 정보를 살펴봅니다.
def stats_info(data: pd.DataFrame, status: str):
    print(f'* {status} Set Stats Info\n'
          f'\t Events: {len(data)}\n'
          f'\t Sessions: {data["SessionId"].nunique()}\n'
          f'\t Items: {data["ItemId"].nunique()}\n'
          f'\t First Time : {data["Time"].min()}\n'
          f'\t Last Time : {data["Time"].max()}\n')

In [19]:
stats_info(tr, 'train')
stats_info(val, 'valid')
stats_info(test, 'test')

* train Set Stats Info
	 Events: 810327
	 Sessions: 404871
	 Items: 3612
	 First Time : 2000-04-25 23:05:32
	 Last Time : 2001-11-20 05:13:09

* valid Set Stats Info
	 Events: 21991
	 Sessions: 15450
	 Items: 2820
	 First Time : 2001-11-20 19:04:49
	 Last Time : 2002-11-20 16:38:40

* test Set Stats Info
	 Events: 4118
	 Sessions: 3071
	 Items: 1625
	 First Time : 2002-11-20 20:30:02
	 Last Time : 2003-02-28 17:49:50



In [20]:
# train set에 없는 아이템이 val, test기간에 생길 수 있으므로 train data를 기준으로 인덱싱합니다.
id2idx = {item_id : index for index, item_id in enumerate(tr['ItemId'].unique())}

def indexing(df, id2idx):
    df['item_idx'] = df['ItemId'].map(lambda x: id2idx.get(x, -1))  # id2idx에 없는 아이템은 모르는 값(-1) 처리 해줍니다.
    return df

tr = indexing(tr, id2idx)
val = indexing(val, id2idx)
test = indexing(test, id2idx)

In [21]:
# 데이터 저장

save_path = data_path / 'processed'
save_path.mkdir(parents=True, exist_ok=True)

tr.to_pickle(save_path / 'train.pkl')
val.to_pickle(save_path / 'valid.pkl')
test.to_pickle(save_path / 'test.pkl')

### Data Pipeline - Session Dataset

  - 데이터가 주어지면 세션이 시작되는 인덱스를 담는 값과 세션을 새로 인덱싱한 값을 갖는 클래스를 만들어준다.
 

In [22]:
class SessionDataset:
    """Credit to yhs-968/pyGRU4REC."""

    def __init__(self, data):
        self.df = data
        self.click_offsets = self.get_click_offsets()
        self.session_idx = np.arange(self.df['SessionId'].nunique())  # indexing to SessionId

    def get_click_offsets(self):
        """
        Return the indexes of the first click of each session IDs,
        """
        offsets = np.zeros(self.df['SessionId'].nunique() + 1, dtype=np.int32)
        offsets[1:] = self.df.groupby('SessionId').size().cumsum()
        return offsets

In [23]:
tr_dataset = SessionDataset(tr)
tr_dataset.df.head(10)

Unnamed: 0,UserId,ItemId,Rating,Time,SessionId,item_idx
0,1,3186,4,2000-12-31 22:00:19,0,0
1,1,1270,5,2000-12-31 22:00:55,1,1
2,1,1721,4,2000-12-31 22:00:55,1,2
3,1,1022,5,2000-12-31 22:00:55,1,3
4,1,2340,3,2000-12-31 22:01:43,2,4
5,1,1836,5,2000-12-31 22:02:52,3,5
6,1,3408,4,2000-12-31 22:04:35,4,6
7,1,2804,5,2000-12-31 22:11:59,5,7
8,1,1207,4,2000-12-31 22:11:59,5,8
9,1,1193,5,2000-12-31 22:12:40,6,9


In [24]:
tr_dataset.click_offsets

array([     0,      1,      4, ..., 810325, 810326, 810327], dtype=int32)

In [25]:
tr_dataset.session_idx

array([     0,      1,      2, ..., 404868, 404869, 404870])

### Data Pipeline - Session Data Loader

In [26]:
class SessionDataLoader:
    """Credit to yhs-968/pyGRU4REC."""

    def __init__(self, dataset: SessionDataset, batch_size=50):
        self.dataset = dataset
        self.batch_size = batch_size

    def __iter__(self):
        """ Returns the iterator for producing session-parallel training mini-batches.
        Yields:
            input (B,):  Item indices that will be encoded as one-hot vectors later.
            target (B,): a Variable that stores the target item indices
            masks: Numpy array indicating the positions of the sessions to be terminated
        """

        start, end, mask, last_session, finished = self.initialize()  # initialize 메소드에서 확인해주세요.
        """
        start : Index Where Session Start
        end : Index Where Session End
        mask : indicator for the sessions to be terminated
        """

        while not finished:
            min_len = (end - start).min() - 1  # Shortest Length Among Sessions
            for i in range(min_len):
                # Build inputs & targets
                inp = self.dataset.df['item_idx'].values[start + i]
                target = self.dataset.df['item_idx'].values[start + i + 1]
                yield inp, target, mask

            start, end, mask, last_session, finished = self.update_status(start, end, min_len, last_session, finished)

    def initialize(self):
        first_iters = np.arange(self.batch_size)    # 첫 배치에 사용할 세션 Index를 가져옵니다.
        last_session = self.batch_size - 1    # 마지막으로 다루고 있는 세션 Index를 저장해둡니다.
        start = self.dataset.click_offsets[self.dataset.session_idx[first_iters]]       # data 상에서 session이 시작된 위치를 가져옵니다.
        end = self.dataset.click_offsets[self.dataset.session_idx[first_iters] + 1]  # session이 끝난 위치 바로 다음 위치를 가져옵니다.
        mask = np.array([])   # session의 모든 아이템을 다 돌은 경우 mask에 추가해줄 것입니다.
        finished = False         # data를 전부 돌았는지 기록하기 위한 변수입니다.
        return start, end, mask, last_session, finished

    def update_status(self, start: np.ndarray, end: np.ndarray, min_len: int, last_session: int, finished: bool):  
        # 다음 배치 데이터를 생성하기 위해 상태를 update합니다.
        
        start += min_len   # __iter__에서 min_len 만큼 for문을 돌았으므로 start를 min_len 만큼 더해줍니다.
        mask = np.arange(self.batch_size)[(end - start) == 1]  
        # end는 다음 세션이 시작되는 위치인데 start와 한 칸 차이난다는 것은 session이 끝났다는 뜻입니다. mask에 기록해줍니다.

        for i, idx in enumerate(mask, start=1):  # mask에 추가된 세션 개수만큼 새로운 세션을 돌것입니다.
            new_session = last_session + i  
            if new_session > self.dataset.session_idx[-1]:  # 만약 새로운 세션이 마지막 세션 index보다 크다면 모든 학습데이터를 돈 것입니다.
                finished = True
                break
            # update the next starting/ending point
            start[idx] = self.dataset.click_offsets[self.dataset.session_idx[new_session]]     # 종료된 세션 대신 새로운 세션의 시작점을 기록합니다.
            end[idx] = self.dataset.click_offsets[self.dataset.session_idx[new_session] + 1]

        last_session += len(mask)  # 마지막 세션의 위치를 기록해둡니다.
        return start, end, mask, last_session, finished

In [27]:
tr_data_loader = SessionDataLoader(tr_dataset, batch_size=4)
tr_dataset.df.head(15)

Unnamed: 0,UserId,ItemId,Rating,Time,SessionId,item_idx
0,1,3186,4,2000-12-31 22:00:19,0,0
1,1,1270,5,2000-12-31 22:00:55,1,1
2,1,1721,4,2000-12-31 22:00:55,1,2
3,1,1022,5,2000-12-31 22:00:55,1,3
4,1,2340,3,2000-12-31 22:01:43,2,4
5,1,1836,5,2000-12-31 22:02:52,3,5
6,1,3408,4,2000-12-31 22:04:35,4,6
7,1,2804,5,2000-12-31 22:11:59,5,7
8,1,1207,4,2000-12-31 22:11:59,5,8
9,1,1193,5,2000-12-31 22:12:40,6,9


In [28]:
iter_ex = iter(tr_data_loader)

In [29]:
inputs, labels, mask =  next(iter_ex)
print(f'Model Input Item Idx are : {inputs}')
print(f'Label Item Idx are : {"":5} {labels}')
print(f'Previous Masked Input Idx are {mask}')

Model Input Item Idx are : [19  1  7  9]
Label Item Idx are :       [20  2  8 10]
Previous Masked Input Idx are [0]


### Modeling - Evaluation Metric

  - 모델 성능 평가를 위한 지표로는 precision 이나 recall이 있다.
  - precision은 한국말로는 정밀도를 의미한다. 영화 추천의 문제라고 한다면 실제로 추천한 영화중에 사용자가 선호하는 영화는 얼마나 되었나?를 의미한다.
  - recall은 한국말로 재현율을 의미한다. 영화 추천의 문제라고 한다면 실제 사용자가 선호하는 영화를 추천에서 얼마나 잘 맞췄나?를 의미한다.
  - Session-Based Recommendation Task에서는 모델이 K개의 아이템을 제시했을 때, 유저가 클릭/ 구매한 n개의 아이템이 많아야 좋다.
  - 그렇기 때문에 recall의 개념을 확장한 recall@k 지표와 precision의 개념을 확장한 Mean Average Precision@k 지표를 사용한다.

In [30]:
def mrr_k(pred, truth: int, k: int):
    indexing = np.where(pred[:k] == truth)[0]
    if len(indexing) > 0:
        return 1 / (indexing[0] + 1)
    else:
        return 0


def recall_k(pred, truth: int, k: int) -> int:
    answer = truth in pred[:k]
    return int(answer)

### Modeling - Model Architecture

In [40]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, GRU
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

In [42]:
def create_model(args):
    inputs = Input(batch_shape=(args.batch_size, 1, args.num_items))
    gru, _ = GRU(args.hsz, stateful=True, return_state=True, name='GRU')(inputs)
    dropout = Dropout(args.drop_rate)(gru)
    predictions = Dense(args.num_items, activation='softmax')(dropout)
    model = Model(inputs=inputs, outputs=[predictions])
    model.compile(loss=categorical_crossentropy, optimizer=Adam(args.lr), metrics=['accuracy'])
    model.summary()
    return model

In [43]:
# 모델에 사용되는 하이퍼 파라미터

class Args:
    def __init__(self, tr, val, test, batch_size, hsz, drop_rate, lr, epochs, k):
        self.tr = tr
        self.val = val
        self.test = test
        self.num_items = tr['ItemId'].nunique()
        self.num_sessions = tr['SessionId'].nunique()
        self.batch_size = batch_size
        self.hsz = hsz
        self.drop_rate = drop_rate
        self.lr = lr
        self.epochs = epochs
        self.k = k

In [44]:
args = Args(tr, val, test, batch_size=128, hsz=50, drop_rate=0.2, lr=0.001, epochs=5, k=20)

model = create_model(args)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(128, 1, 3612)]          0         
_________________________________________________________________
GRU (GRU)                    [(128, 50), (128, 50)]    549600    
_________________________________________________________________
dropout_1 (Dropout)          (128, 50)                 0         
_________________________________________________________________
dense_1 (Dense)              (128, 3612)               184212    
Total params: 733,812
Trainable params: 733,812
Non-trainable params: 0
_________________________________________________________________


### Modeling - Model Training

  - 모델에 사용되는 하이퍼 파라미터는 args에서 관리한다.

In [45]:
def train_model(model, args):
    train_dataset = SessionDataset(args.tr)
    train_loader = SessionDataLoader(train_dataset, batch_size=args.batch_size)

    for epoch in range(1, args.epochs + 1):
        total_step = len(args.tr) - args.tr['SessionId'].nunique()
        tr_loader = tqdm(train_loader, total=total_step // args.batch_size, desc='Train', mininterval=1)
        for feat, target, mask in tr_loader:
            reset_hidden_states(model, mask)  #종료된 session은 hidden_state를 초기화. 아래 메서드에서 확인할 수 있음.

            input_ohe = to_categorical(feat, num_classes=args.num_items)
            input_ohe = np.expand_dims(input_ohe, axis=1)
            target_ohe = to_categorical(target, num_classes=args.num_items)

            result = model.train_on_batch(input_ohe, target_ohe)
            tr_loader.set_postfix(train_loss=result[0], accuracy = result[1])

        val_recall, val_mrr = get_metrics(args.val, model, args, args.k)  #valid set에 대해 검증.

        print(f"\t - Recall@{args.k} epoch {epoch}: {val_recall:3f}")
        print(f"\t - MRR@{args.k}    epoch {epoch}: {val_mrr:3f}\n")


def reset_hidden_states(model, mask):
    gru_layer = model.get_layer(name='GRU')  #model에서 gru layer를 가져옴.
    hidden_states = gru_layer.states[0].numpy()  #gru_layer의 parameter를 가져옴.
    for elt in mask:  #mask된 인덱스 즉, 종료된 세션의 인덱스를 돌면서
        hidden_states[elt, :] = 0  #parameter를 초기화 함.
    gru_layer.reset_states(states=hidden_states)


#valid셋과 test셋을 평가하는 코드
def get_metrics(data, model, args, k: int): 
    
    #train과 거의 같지만 mrr, recall을 구하는 라인이 있음.
    dataset = SessionDataset(data)
    loader = SessionDataLoader(dataset, batch_size=args.batch_size)
    recall_list, mrr_list = [], []

    total_step = len(data) - data['SessionId'].nunique()
    for inputs, label, mask in tqdm(loader, total=total_step // args.batch_size, desc='Evaluation', mininterval=1):
        reset_hidden_states(model, mask)
        input_ohe = to_categorical(inputs, num_classes=args.num_items)
        input_ohe = np.expand_dims(input_ohe, axis=1)

        pred = model.predict(input_ohe, batch_size=args.batch_size)
        pred_arg = tf.argsort(pred, direction='DESCENDING')  #softmax 값이 큰 순서대로 sorting.

        length = len(inputs)
        recall_list.extend([recall_k(pred_arg[i], label[i], k) for i in range(length)])
        mrr_list.extend([mrr_k(pred_arg[i], label[i], k) for i in range(length)])

    recall, mrr = np.mean(recall_list), np.mean(mrr_list)
    return recall, mrr

In [46]:
# 모델 학습 / 하이퍼파라미터 : batch_size=128, hsz=50, drop_rate=0.2, lr=0.001, epochs=5, k=20
train_model(model, args)

Train: 100%|█████████▉| 3166/3167 [00:48<00:00, 64.78it/s, accuracy=0.0391, train_loss=6.46] 
Evaluation:  98%|█████████▊| 50/51 [00:29<00:00,  1.71it/s]


	 - Recall@20 epoch 1: 0.120469
	 - MRR@20    epoch 1: 0.036000



Train: 100%|█████████▉| 3166/3167 [00:47<00:00, 66.69it/s, accuracy=0.0391, train_loss=6.09] 
Evaluation:  98%|█████████▊| 50/51 [00:27<00:00,  1.81it/s]


	 - Recall@20 epoch 2: 0.210313
	 - MRR@20    epoch 2: 0.069102



Train: 100%|█████████▉| 3166/3167 [00:47<00:00, 66.70it/s, accuracy=0.0391, train_loss=5.94] 
Evaluation:  98%|█████████▊| 50/51 [00:27<00:00,  1.85it/s]


	 - Recall@20 epoch 3: 0.237813
	 - MRR@20    epoch 3: 0.077683



Train: 100%|█████████▉| 3166/3167 [00:46<00:00, 67.37it/s, accuracy=0.0391, train_loss=5.81] 
Evaluation:  98%|█████████▊| 50/51 [00:26<00:00,  1.86it/s]


	 - Recall@20 epoch 4: 0.250937
	 - MRR@20    epoch 4: 0.082494



Train: 100%|█████████▉| 3166/3167 [00:46<00:00, 67.65it/s, accuracy=0.0547, train_loss=5.71] 
Evaluation:  98%|█████████▊| 50/51 [00:26<00:00,  1.87it/s]

	 - Recall@20 epoch 5: 0.254219
	 - MRR@20    epoch 5: 0.086315






In [47]:
# 모델 평가

def test_model(model, args, test):
    test_recall, test_mrr = get_metrics(test, model, args, 20)
    print(f"\t - Recall@{args.k}: {test_recall:3f}")
    print(f"\t - MRR@{args.k}: {test_mrr:3f}\n")

In [48]:
test_model(model, args, test)

Evaluation:  88%|████████▊ | 7/8 [00:03<00:00,  1.89it/s]

	 - Recall@20: 0.254464
	 - MRR@20: 0.098726






### 하이퍼파라미터 변경 후 모델 훈련 및 평가

In [49]:
#하이퍼파라미터 변경 수치 : batch_size=256, hsz=50, drop_rate=0.2, lr=0.001, epochs=15, k=20

args = Args(tr, val, test, batch_size=256, hsz=50, drop_rate=0.2, lr=0.001, epochs=15, k=20)

model = create_model(args)

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(256, 1, 3612)]          0         
_________________________________________________________________
GRU (GRU)                    [(256, 50), (256, 50)]    549600    
_________________________________________________________________
dropout_2 (Dropout)          (256, 50)                 0         
_________________________________________________________________
dense_2 (Dense)              (256, 3612)               184212    
Total params: 733,812
Trainable params: 733,812
Non-trainable params: 0
_________________________________________________________________


In [50]:
# 모델 학습
train_model(model, args)

Train: 100%|█████████▉| 1582/1583 [00:35<00:00, 44.55it/s, accuracy=0.0195, train_loss=7.04] 
Evaluation:  96%|█████████▌| 24/25 [00:27<00:01,  1.15s/it]


	 - Recall@20 epoch 1: 0.073730
	 - MRR@20    epoch 1: 0.018908



Train: 100%|█████████▉| 1582/1583 [00:33<00:00, 47.05it/s, accuracy=0.0391, train_loss=6.51] 
Evaluation:  96%|█████████▌| 24/25 [00:25<00:01,  1.08s/it]


	 - Recall@20 epoch 2: 0.162272
	 - MRR@20    epoch 2: 0.053034



Train: 100%|█████████▉| 1582/1583 [00:34<00:00, 46.15it/s, accuracy=0.0547, train_loss=6.23]
Evaluation:  96%|█████████▌| 24/25 [00:25<00:01,  1.05s/it]


	 - Recall@20 epoch 3: 0.212565
	 - MRR@20    epoch 3: 0.072195



Train: 100%|█████████▉| 1582/1583 [00:33<00:00, 47.39it/s, accuracy=0.0586, train_loss=6.17] 
Evaluation:  96%|█████████▌| 24/25 [00:24<00:01,  1.03s/it]


	 - Recall@20 epoch 4: 0.236328
	 - MRR@20    epoch 4: 0.079205



Train: 100%|█████████▉| 1582/1583 [00:33<00:00, 47.87it/s, accuracy=0.0586, train_loss=6.1]  
Evaluation:  96%|█████████▌| 24/25 [00:24<00:01,  1.02s/it]


	 - Recall@20 epoch 5: 0.249349
	 - MRR@20    epoch 5: 0.083413



Train: 100%|█████████▉| 1582/1583 [00:33<00:00, 47.77it/s, accuracy=0.0469, train_loss=6.02] 
Evaluation:  96%|█████████▌| 24/25 [00:24<00:01,  1.02s/it]


	 - Recall@20 epoch 6: 0.254232
	 - MRR@20    epoch 6: 0.086834



Train: 100%|█████████▉| 1582/1583 [00:33<00:00, 47.66it/s, accuracy=0.0625, train_loss=5.99] 
Evaluation:  96%|█████████▌| 24/25 [00:24<00:01,  1.01s/it]


	 - Recall@20 epoch 7: 0.257161
	 - MRR@20    epoch 7: 0.088727



Train: 100%|█████████▉| 1582/1583 [00:33<00:00, 47.94it/s, accuracy=0.0586, train_loss=5.92]
Evaluation:  96%|█████████▌| 24/25 [00:24<00:01,  1.01s/it]


	 - Recall@20 epoch 8: 0.259115
	 - MRR@20    epoch 8: 0.088966



Train: 100%|█████████▉| 1582/1583 [00:33<00:00, 47.46it/s, accuracy=0.0469, train_loss=5.97]
Evaluation:  96%|█████████▌| 24/25 [00:24<00:01,  1.01s/it]


	 - Recall@20 epoch 9: 0.258952
	 - MRR@20    epoch 9: 0.090307



Train: 100%|█████████▉| 1582/1583 [00:33<00:00, 47.84it/s, accuracy=0.0547, train_loss=5.87] 
Evaluation:  96%|█████████▌| 24/25 [00:24<00:01,  1.01s/it]


	 - Recall@20 epoch 10: 0.262207
	 - MRR@20    epoch 10: 0.090900



Train: 100%|█████████▉| 1582/1583 [00:33<00:00, 47.68it/s, accuracy=0.0547, train_loss=5.86]
Evaluation:  96%|█████████▌| 24/25 [00:24<00:01,  1.01s/it]


	 - Recall@20 epoch 11: 0.264323
	 - MRR@20    epoch 11: 0.091127



Train: 100%|█████████▉| 1582/1583 [00:33<00:00, 47.78it/s, accuracy=0.0625, train_loss=5.87]
Evaluation:  96%|█████████▌| 24/25 [00:24<00:01,  1.02s/it]


	 - Recall@20 epoch 12: 0.265625
	 - MRR@20    epoch 12: 0.091593



Train: 100%|█████████▉| 1582/1583 [00:33<00:00, 47.81it/s, accuracy=0.0781, train_loss=5.86]
Evaluation:  96%|█████████▌| 24/25 [00:24<00:01,  1.01s/it]


	 - Recall@20 epoch 13: 0.265462
	 - MRR@20    epoch 13: 0.092109



Train: 100%|█████████▉| 1582/1583 [00:33<00:00, 47.78it/s, accuracy=0.0781, train_loss=5.81]
Evaluation:  96%|█████████▌| 24/25 [00:24<00:01,  1.01s/it]


	 - Recall@20 epoch 14: 0.265788
	 - MRR@20    epoch 14: 0.092421



Train: 100%|█████████▉| 1582/1583 [00:32<00:00, 48.24it/s, accuracy=0.0742, train_loss=5.78]
Evaluation:  96%|█████████▌| 24/25 [00:24<00:01,  1.01s/it]

	 - Recall@20 epoch 15: 0.265462
	 - MRR@20    epoch 15: 0.092443






In [51]:
# 모델 평가

test_model(model, args, test)

Evaluation:  75%|███████▌  | 3/4 [00:02<00:00,  1.02it/s]

	 - Recall@20: 0.272135
	 - MRR@20: 0.106520






In [52]:
#하이퍼파라미터 변경 수치 : batch_size=125, hsz=50, drop_rate=0.1, lr=0.001, epochs=15, k=20

args = Args(tr, val, test, batch_size=125, hsz=50, drop_rate=0.1, lr=0.001, epochs=15, k=20)

model = create_model(args)

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(125, 1, 3612)]          0         
_________________________________________________________________
GRU (GRU)                    [(125, 50), (125, 50)]    549600    
_________________________________________________________________
dropout_3 (Dropout)          (125, 50)                 0         
_________________________________________________________________
dense_3 (Dense)              (125, 3612)               184212    
Total params: 733,812
Trainable params: 733,812
Non-trainable params: 0
_________________________________________________________________


In [53]:
# 모델 학습
train_model(model, args)

Train: 100%|█████████▉| 3242/3243 [00:49<00:00, 65.36it/s, accuracy=0.016, train_loss=6.49]
Evaluation:  98%|█████████▊| 51/52 [00:28<00:00,  1.79it/s]


	 - Recall@20 epoch 1: 0.130667
	 - MRR@20    epoch 1: 0.038562



Train: 100%|█████████▉| 3242/3243 [00:47<00:00, 67.89it/s, accuracy=0.048, train_loss=6.1] 
Evaluation:  98%|█████████▊| 51/52 [00:27<00:00,  1.89it/s]


	 - Recall@20 epoch 2: 0.210510
	 - MRR@20    epoch 2: 0.068886



Train: 100%|█████████▉| 3242/3243 [00:47<00:00, 68.05it/s, accuracy=0.04, train_loss=5.98] 
Evaluation:  98%|█████████▊| 51/52 [00:26<00:00,  1.94it/s]


	 - Recall@20 epoch 3: 0.239686
	 - MRR@20    epoch 3: 0.079323



Train: 100%|█████████▉| 3242/3243 [00:47<00:00, 68.49it/s, accuracy=0.048, train_loss=5.92]
Evaluation:  98%|█████████▊| 51/52 [00:26<00:00,  1.94it/s]


	 - Recall@20 epoch 4: 0.252078
	 - MRR@20    epoch 4: 0.084173



Train: 100%|█████████▉| 3242/3243 [00:47<00:00, 68.02it/s, accuracy=0.048, train_loss=5.77]
Evaluation:  98%|█████████▊| 51/52 [00:26<00:00,  1.96it/s]


	 - Recall@20 epoch 5: 0.259137
	 - MRR@20    epoch 5: 0.087134



Train: 100%|█████████▉| 3242/3243 [00:47<00:00, 68.50it/s, accuracy=0.04, train_loss=5.73] 
Evaluation:  98%|█████████▊| 51/52 [00:25<00:00,  1.96it/s]


	 - Recall@20 epoch 6: 0.261647
	 - MRR@20    epoch 6: 0.087710



Train: 100%|█████████▉| 3242/3243 [00:47<00:00, 68.23it/s, accuracy=0.04, train_loss=5.71] 
Evaluation:  98%|█████████▊| 51/52 [00:26<00:00,  1.95it/s]


	 - Recall@20 epoch 7: 0.260235
	 - MRR@20    epoch 7: 0.088042



Train: 100%|█████████▉| 3242/3243 [00:47<00:00, 68.38it/s, accuracy=0.056, train_loss=5.63]
Evaluation:  98%|█████████▊| 51/52 [00:26<00:00,  1.95it/s]


	 - Recall@20 epoch 8: 0.261176
	 - MRR@20    epoch 8: 0.088925



Train: 100%|█████████▉| 3242/3243 [00:47<00:00, 68.15it/s, accuracy=0.032, train_loss=5.65]
Evaluation:  98%|█████████▊| 51/52 [00:26<00:00,  1.95it/s]


	 - Recall@20 epoch 9: 0.259922
	 - MRR@20    epoch 9: 0.089283



Train: 100%|█████████▉| 3242/3243 [00:47<00:00, 68.24it/s, accuracy=0.04, train_loss=5.58] 
Evaluation:  98%|█████████▊| 51/52 [00:25<00:00,  1.97it/s]


	 - Recall@20 epoch 10: 0.259922
	 - MRR@20    epoch 10: 0.089700



Train: 100%|█████████▉| 3242/3243 [00:47<00:00, 68.46it/s, accuracy=0.048, train_loss=5.62]
Evaluation:  98%|█████████▊| 51/52 [00:26<00:00,  1.96it/s]


	 - Recall@20 epoch 11: 0.260549
	 - MRR@20    epoch 11: 0.090264



Train: 100%|█████████▉| 3242/3243 [00:47<00:00, 68.72it/s, accuracy=0.04, train_loss=5.58] 
Evaluation:  98%|█████████▊| 51/52 [00:26<00:00,  1.95it/s]


	 - Recall@20 epoch 12: 0.259608
	 - MRR@20    epoch 12: 0.090234



Train: 100%|█████████▉| 3242/3243 [00:47<00:00, 68.68it/s, accuracy=0.064, train_loss=5.55]
Evaluation:  98%|█████████▊| 51/52 [00:26<00:00,  1.96it/s]


	 - Recall@20 epoch 13: 0.258196
	 - MRR@20    epoch 13: 0.089917



Train: 100%|█████████▉| 3242/3243 [00:47<00:00, 68.56it/s, accuracy=0.072, train_loss=5.52]
Evaluation:  98%|█████████▊| 51/52 [00:25<00:00,  1.98it/s]


	 - Recall@20 epoch 14: 0.256784
	 - MRR@20    epoch 14: 0.089255



Train: 100%|█████████▉| 3242/3243 [00:47<00:00, 68.27it/s, accuracy=0.072, train_loss=5.52]
Evaluation:  98%|█████████▊| 51/52 [00:26<00:00,  1.95it/s]

	 - Recall@20 epoch 15: 0.256784
	 - MRR@20    epoch 15: 0.088855






In [54]:
# 모델 평가

test_model(model, args, test)

Evaluation:  88%|████████▊ | 7/8 [00:03<00:00,  1.99it/s]

	 - Recall@20: 0.282286
	 - MRR@20: 0.097498






### 회고 및 정리

1. 결과 분석
    - 하이퍼 파라미터 수치 : batch_size=128, hsz=50, drop_rate=0.2, lr=0.001, epochs=5, k=20 / 결과값 : Recall@20: 0.254464 /  MRR@20: 0.098726
    - 하이퍼 파라미터 수치 : batch_size=256, hsz=50, drop_rate=0.2, lr=0.001, epochs=15, k=20 / 결과값 : Recall@20: 0.272135 / MRR@20: 0.106520
    - 하이퍼 파라미터 수치 : batch_size=125, hsz=50, drop_rate=0.1, lr=0.001, epochs=15, k=20 / 결과값 : Recall@20: 0.282286 / MRR@20: 0.097498

2. 회고
  - 수치상으로만 놓고 보면 가장 좋은 결과값은 batch_size=256, hsz=50, drop_rate=0.2, lr=0.001, epochs=15, k=20 인 것 같다.
  - 이번 노드를 진행하면서 NLP노드나 이런 추천 관련 노드들은 모델이 복잡하다는 것을 다시 한번 느낄 수 있었다.
  - 이번 노드는 너무 어려워서 지루하거나 못하겠다는 느낌은 아니어서 그래도 재미있게 진행할 수 있었다.
  
  
  - 진행하면서 특별히 어려웠던 점이 있다면 이번 데이터에서는 연습 노드의 데이터와 다르게 sessionId가 존재하지 않고 userId와 time을 가지고 sessionId를 만들어서 진행해야 했다.
  - 처음 그 과정을 잘 이해하지 못해서 깃허브에서 먼저 노드를 진행한 다른 수강생분의 노드를 한번 참고해서 진행해서 그 부분을 해결할 수 있었다.
  - 특히 마지막 모델 트레이닝 과정에서 ValueError: Input 0 is incompatible with layer model: expected shape=(None, 14999, 7), found shape=(None, 7) 이와 같은 오류가 떠서 한참 고생한 것 같다.
  - 문제를 추측해본 결과 모델을 만들 때 inputs = Input(batch_shape=(args.batch_size, 1, args.num_items)) 이부분에서 문제가 있는 것 같았는데 정확하게 확신할 수는 없었다.
  - 그래서 결국 처음부터 코드를 하나씩 다시 만들어서 진행해서 해결되었다.
  - 아직도 정확하게 어느 부분이 문제가 있는지 이해하지 못했지만 내가 고친 부분은 하이퍼 파라미터를 지정해주는 부분이었다.
  - 기존에는 하이퍼파라미터를 클래스로 지정해주고 바로 수치까지 한번에 적어주었는데 그렇게 하지 않고 클래스를 먼저 지정한 뒤 수치를 지정하는것은 그 다음 코드에 적었더니 오류 없이 진행이 되었다.
  
  
  - 중간에 오류로 인해서 솔직히 답답한 부분이 있었는데 그래도 오류가 해결되고 결과를 받으니까 문제를 해결했다는 성취감이 들었다.