# Exploration17
## 다음에 볼 영화 예측하기
### 필요한 모듈 불러오기

In [72]:
import pandas as pd
import tensorflow
import datetime as dt
from pathlib import Path
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')


### 데이터 불러오기

In [73]:
data_path = Path(os.getenv('HOME')+'/aiffel/yoochoose/data/') 
train_path = data_path / 'ratings.dat'

def load_data(data_path: Path, nrows=None):
    data = pd.read_csv(data_path, sep='::', header=None, usecols=[0, 1, 2, 3], dtype={0: np.int32, 1: np.int32, 2: np.int32}, nrows=nrows)
    data.columns = ['UserId', 'ItemId', 'Rating', 'Time']
    return data

data = load_data(train_path, None)
data.sort_values(['UserId', 'Time'], inplace=True)  # data를 id와 시간 순서로 정렬해줍니다.
data

Unnamed: 0,UserId,ItemId,Rating,Time
31,1,3186,4,978300019
22,1,1270,5,978300055
27,1,1721,4,978300055
37,1,1022,5,978300055
24,1,2340,3,978300103
...,...,...,...,...
1000019,6040,2917,4,997454429
999988,6040,1921,4,997454464
1000172,6040,1784,3,997454464
1000167,6040,161,3,997454486


### 데이터 전처리

In [74]:
data['UserId'].nunique(), data['ItemId'].nunique()

(6040, 3706)

In [75]:
session_length = data.groupby('UserId').size()
session_length

UserId
1        53
2       129
3        51
4        21
5       198
       ... 
6036    888
6037    202
6038     20
6039    123
6040    341
Length: 6040, dtype: int64

In [76]:
session_length.median(), session_length.mean()

(96.0, 165.5975165562914)

In [77]:
session_length.min(), session_length.max()

(20, 2314)

In [78]:
session_length.quantile(0.999)

1343.181000000005

In [79]:
session_length[session_length>1344]

UserId
889     1518
1181    1521
1680    1850
1941    1595
4169    2314
4277    1743
dtype: int64

### 나는 이 유저들을 광클유저로보고 삭제할것이다.

In [80]:
data = data.drop(index = data[data['UserId']==889].index)
data = data.drop(index = data[data['UserId']==1181].index)
data = data.drop(index = data[data['UserId']==1680].index)
data = data.drop(index = data[data['UserId']==1941].index)
data = data.drop(index = data[data['UserId']==4169].index)
data = data.drop(index = data[data['UserId']==4277].index)

In [81]:
data

Unnamed: 0,UserId,ItemId,Rating,Time
31,1,3186,4,978300019
22,1,1270,5,978300055
27,1,1721,4,978300055
37,1,1022,5,978300055
24,1,2340,3,978300103
...,...,...,...,...
1000019,6040,2917,4,997454429
999988,6040,1921,4,997454464
1000172,6040,1784,3,997454464
1000167,6040,161,3,997454486


In [82]:
session_length = data.groupby('UserId').size()
session_length

UserId
1        53
2       129
3        51
4        21
5       198
       ... 
6036    888
6037    202
6038     20
6039    123
6040    341
Length: 6034, dtype: int64

In [83]:
session_length.min(), session_length.max()

(20, 1344)

### 성공했다. 이제 다음 단계로 넘어가자.

### Session Time

In [84]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 989668 entries, 31 to 1000042
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   UserId  989668 non-null  int32
 1   ItemId  989668 non-null  int32
 2   Rating  989668 non-null  int32
 3   Time    989668 non-null  int64
dtypes: int32(3), int64(1)
memory usage: 26.4 MB


In [85]:
data['Time'] = pd.to_datetime(data['Time'],unit='s')

In [86]:
data

Unnamed: 0,UserId,ItemId,Rating,Time
31,1,3186,4,2000-12-31 22:00:19
22,1,1270,5,2000-12-31 22:00:55
27,1,1721,4,2000-12-31 22:00:55
37,1,1022,5,2000-12-31 22:00:55
24,1,2340,3,2000-12-31 22:01:43
...,...,...,...,...
1000019,6040,2917,4,2001-08-10 14:40:29
999988,6040,1921,4,2001-08-10 14:41:04
1000172,6040,1784,3,2001-08-10 14:41:04
1000167,6040,161,3,2001-08-10 14:41:26


In [87]:
oldest, latest = data['Time'].min(), data['Time'].max()
print(oldest) 
print(latest)

2000-04-25 23:05:32
2003-02-28 17:49:50


### 나는 그래도 영화는 개봉을 달마다 많이하지않으므로 1년으로 설장해주었다.

In [88]:
month_ago = latest - dt.timedelta(365)     # 최종 날짜로부터 60일 이전 날짜를 구한다.  
data = data[data['Time'] > month_ago]   # 방금 구한 날짜 이후의 데이터만 모은다. 
data

Unnamed: 0,UserId,ItemId,Rating,Time
5170,36,1387,5,2002-03-12 03:46:59
5267,36,1201,4,2002-03-12 03:46:59
5122,36,1291,5,2002-03-12 03:47:16
5123,36,2167,5,2002-03-12 03:48:25
5290,36,2951,4,2002-03-12 03:48:25
...,...,...,...,...
992358,5996,3835,3,2002-04-29 20:46:24
992279,5996,2422,3,2002-04-29 20:47:05
992702,5996,168,3,2002-09-03 13:12:26
992459,5996,339,4,2002-10-07 13:24:39


In [89]:
data.isnull()

Unnamed: 0,UserId,ItemId,Rating,Time
5170,False,False,False,False
5267,False,False,False,False
5122,False,False,False,False
5123,False,False,False,False
5290,False,False,False,False
...,...,...,...,...
992358,False,False,False,False
992279,False,False,False,False
992702,False,False,False,False
992459,False,False,False,False


### Data Cleansing
#### 1.2에서 살펴보니 길이가 1인 세션도 꽤 있습니다. 우리의 목적은 유저가 최소 1개 이상 클릭했을 때 다음 클릭을 예측하는 것이므로 길이가 1인 세션은 제거해준다. 너무 적게 클릭된 아이템은 이상한 아이템일 가능성이 있다. 이 역시 제거해 준다

In [90]:
# short_session을 제거한 다음 unpopular item을 제거하면 다시 길이가 1인 session이 생길 수 있습니다.
# 이를 위해 반복문을 통해 지속적으로 제거 합니다.
def cleanse_recursive(data: pd.DataFrame, shortest, least_click) -> pd.DataFrame:
    while True:
        before_len = len(data)
        data = cleanse_short_session(data, shortest)
        data = cleanse_unpopular_item(data, least_click)
        after_len = len(data)
        if before_len == after_len:
            break
    return data


def cleanse_short_session(data: pd.DataFrame, shortest):
    session_len = data.groupby('UserId').size()
    session_use = session_len[session_len >= shortest].index
    data = data[data['UserId'].isin(session_use)]
    return data


def cleanse_unpopular_item(data: pd.DataFrame, least_click):
    item_popular = data.groupby('ItemId').size()
    item_use = item_popular[item_popular >= least_click].index
    data = data[data['ItemId'].isin(item_use)]
    return data

In [91]:
data = cleanse_recursive(data, shortest=2, least_click=5)
data

Unnamed: 0,UserId,ItemId,Rating,Time
5170,36,1387,5,2002-03-12 03:46:59
5267,36,1201,4,2002-03-12 03:46:59
5122,36,1291,5,2002-03-12 03:47:16
5123,36,2167,5,2002-03-12 03:48:25
5290,36,2951,4,2002-03-12 03:48:25
...,...,...,...,...
992466,5996,3564,3,2002-04-29 20:33:12
992279,5996,2422,3,2002-04-29 20:47:05
992702,5996,168,3,2002-09-03 13:12:26
992459,5996,339,4,2002-10-07 13:24:39


### 이제 데이터를 나눠주자
### Train / Valid / Test split

In [93]:
def split_by_date(data: pd.DataFrame, n_days: int):
    final_time = data['Time'].max()
    session_last_time = data.groupby('UserId')['Time'].max()
    session_in_train = session_last_time[session_last_time < final_time - dt.timedelta(n_days)].index
    session_in_test = session_last_time[session_last_time >= final_time - dt.timedelta(n_days)].index

    before_date = data[data['UserId'].isin(session_in_train)]
    after_date = data[data['UserId'].isin(session_in_test)]
    after_date = after_date[after_date['ItemId'].isin(before_date['ItemId'])]
    return before_date, after_date

In [94]:
tr, test = split_by_date(data, n_days=30)
tr, val = split_by_date(tr, n_days=30)

In [95]:
def stats_info(data: pd.DataFrame, status: str):
    print(f'* {status} Set Stats Info\n'
          f'\t Events: {len(data)}\n'
          f'\t Sessions: {data["UserId"].nunique()}\n'
          f'\t Items: {data["ItemId"].nunique()}\n'
          f'\t First Time : {data["Time"].min()}\n'
          f'\t Last Time : {data["Time"].max()}\n')

In [96]:
stats_info(tr, 'train')
stats_info(val, 'valid')
stats_info(test, 'test')

* train Set Stats Info
	 Events: 9767
	 Sessions: 286
	 Items: 1503
	 First Time : 2002-02-28 19:06:39
	 Last Time : 2002-12-30 02:26:14

* valid Set Stats Info
	 Events: 2541
	 Sessions: 81
	 Items: 1124
	 First Time : 2002-02-28 23:32:05
	 Last Time : 2003-01-29 03:00:40

* test Set Stats Info
	 Events: 6135
	 Sessions: 95
	 Items: 1485
	 First Time : 2002-03-01 04:03:30
	 Last Time : 2003-02-28 17:49:50



In [97]:
# train set에 없는 아이템이 val, test기간에 생길 수 있으므로 train data를 기준으로 인덱싱합니다.
id2idx = {item_id : index for index, item_id in enumerate(tr['ItemId'].unique())}

def indexing(df, id2idx):
    df['item_idx'] = df['ItemId'].map(lambda x: id2idx.get(x, -1))  # id2idx에 없는 아이템은 모르는 값(-1) 처리 해줍니다.
    return df

tr = indexing(tr, id2idx)
val = indexing(val, id2idx)
test = indexing(test, id2idx)

In [98]:
save_path = data_path / 'jang'
save_path.mkdir(parents=True, exist_ok=True)

tr.to_pickle(save_path / 'train.pkl')
val.to_pickle(save_path / 'valid.pkl')
test.to_pickle(save_path / 'test.pkl')

### 이제 데이터도 잘 나누었고, 데이터 파이프라인을 구축해주자.
### data pipeline

In [99]:
class SessionDataset:
    """Credit to yhs-968/pyGRU4REC."""

    def __init__(self, data):
        self.df = data
        self.click_offsets = self.get_click_offsets()
        self.session_idx = np.arange(self.df['UserId'].nunique())  # indexing to SessionId

    def get_click_offsets(self):
        """
        Return the indexes of the first click of each session IDs,
        """
        offsets = np.zeros(self.df['UserId'].nunique() + 1, dtype=np.int32)
        offsets[1:] = self.df.groupby('UserId').size().cumsum()
        return offsets

In [100]:
tr_dataset = SessionDataset(tr)
tr_dataset.df.head(10)

Unnamed: 0,UserId,ItemId,Rating,Time,item_idx
5170,36,1387,5,2002-03-12 03:46:59,0
5267,36,1201,4,2002-03-12 03:46:59,1
5122,36,1291,5,2002-03-12 03:47:16,2
5123,36,2167,5,2002-03-12 03:48:25,3
5290,36,2951,4,2002-03-12 03:48:25,4
5359,36,2115,5,2002-03-12 03:48:25,5
5073,36,1912,5,2002-03-12 03:48:44,6
5113,36,2662,3,2002-03-12 03:48:44,7
5366,36,2985,4,2002-03-12 03:49:01,8
5145,36,198,5,2002-03-12 03:49:48,9


In [174]:
tr_dataset.click_offsets

array([   0,   30,   37,   49,   54,   56,   58,   79,   85,  102,  113,
        162,  231,  243,  323,  358,  378,  387,  397,  403,  406,  442,
        471,  477,  498,  608,  623,  638,  640,  689,  696,  700,  702,
        844,  850,  856,  900,  908,  918,  958,  962,  970,  972,  974,
        990,  993, 1033, 1043, 1052, 1105, 1218, 1224, 1348, 1351, 1355,
       1856, 1875, 1906, 1920, 1922, 1946, 1949, 1955, 1964, 1972, 1975,
       2019, 2025, 2062, 2087, 2121, 2220, 2255, 2266, 2313, 2421, 2424,
       2440, 2458, 2518, 2531, 2595, 2618, 2631, 2636, 2858, 2888, 2898,
       2962, 2976, 3143, 3244, 3262, 3264, 3293, 3330, 3412, 3428, 3434,
       3460, 3465, 3478, 3480, 3510, 3513, 3523, 3526, 3533, 3545, 3608,
       3669, 3686, 3708, 3714, 3727, 3729, 3743, 3766, 3866, 3870, 3908,
       3919, 3937, 3967, 3980, 4043, 4066, 4115, 4121, 4260, 4324, 4334,
       4415, 4418, 4423, 4431, 4437, 4444, 4462, 4491, 4494, 4522, 4760,
       4762, 4836, 4905, 4911, 5004, 5020, 5152, 51

In [175]:
tr_dataset.session_idx


array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [159]:
class SessionDataLoader:
    """Credit to yhs-968/pyGRU4REC."""

    def __init__(self, dataset: SessionDataset, batch_size=50):
        self.dataset = dataset
        self.batch_size = batch_size

    def __iter__(self):
        """ Returns the iterator for producing session-parallel training mini-batches.
        Yields:
            input (B,):  Item indices that will be encoded as one-hot vectors later.
            target (B,): a Variable that stores the target item indices
            masks: Numpy array indicating the positions of the sessions to be terminated
        """

        start, end, mask, last_session, finished = self.initialize()  # initialize 메소드에서 확인해주세요.
        """
        start : Index Where Session Start
        end : Index Where Session End
        mask : indicator for the sessions to be terminated
        """

        while not finished:
            min_len = (end - start).min() - 1  # Shortest Length Among Sessions
            for i in range(min_len):
                # Build inputs & targets
                inp = self.dataset.df['item_idx'].values[start + i]
                target = self.dataset.df['item_idx'].values[start + i + 1]
                yield inp, target, mask

            start, end, mask, last_session, finished = self.update_status(start, end, min_len, last_session, finished)

    def initialize(self):
        first_iters = np.arange(self.batch_size)    # 첫 배치에 사용할 세션 Index를 가져옵니다.
        last_session = self.batch_size - 1    # 마지막으로 다루고 있는 세션 Index를 저장해둡니다.
        start = self.dataset.click_offsets[self.dataset.session_idx[first_iters]]       # data 상에서 session이 시작된 위치를 가져옵니다.
        end = self.dataset.click_offsets[self.dataset.session_idx[first_iters] + 1]  # session이 끝난 위치 바로 다음 위치를 가져옵니다.
        mask = np.array([])   # session의 모든 아이템을 다 돌은 경우 mask에 추가해줄 것입니다.
        finished = False         # data를 전부 돌았는지 기록하기 위한 변수입니다.
        return start, end, mask, last_session, finished

    def update_status(self, start: np.ndarray, end: np.ndarray, min_len: int, last_session: int, finished: bool):  
        # 다음 배치 데이터를 생성하기 위해 상태를 update합니다.
        
        start += min_len   # __iter__에서 min_len 만큼 for문을 돌았으므로 start를 min_len 만큼 더해줍니다.
        mask = np.arange(self.batch_size)[(end - start) == 1]  
        # end는 다음 세션이 시작되는 위치인데 start와 한 칸 차이난다는 것은 session이 끝났다는 뜻입니다. mask에 기록해줍니다.

        for i, idx in enumerate(mask, start=1):  # mask에 추가된 세션 개수만큼 새로운 세션을 돌것입니다.
            new_session = last_session + i  
            if new_session > self.dataset.session_idx[-1]:  # 만약 새로운 세션이 마지막 세션 index보다 크다면 모든 학습데이터를 돈 것입니다.
                finished = True
                break
            # update the next starting/ending point
            start[idx] = self.dataset.click_offsets[self.dataset.session_idx[new_session]]     # 종료된 세션 대신 새로운 세션의 시작점을 기록합니다.
            end[idx] = self.dataset.click_offsets[self.dataset.session_idx[new_session] +1]

        last_session += len(mask)  # 마지막 세션의 위치를 기록해둡니다.
        return start, end, mask, last_session, finished

In [160]:
tr_data_loader.dataset.click_offsets

array([   0,   30,   37,   49,   54,   56,   58,   79,   85,  102,  113,
        162,  231,  243,  323,  358,  378,  387,  397,  403,  406,  442,
        471,  477,  498,  608,  623,  638,  640,  689,  696,  700,  702,
        844,  850,  856,  900,  908,  918,  958,  962,  970,  972,  974,
        990,  993, 1033, 1043, 1052, 1105, 1218, 1224, 1348, 1351, 1355,
       1856, 1875, 1906, 1920, 1922, 1946, 1949, 1955, 1964, 1972, 1975,
       2019, 2025, 2062, 2087, 2121, 2220, 2255, 2266, 2313, 2421, 2424,
       2440, 2458, 2518, 2531, 2595, 2618, 2631, 2636, 2858, 2888, 2898,
       2962, 2976, 3143, 3244, 3262, 3264, 3293, 3330, 3412, 3428, 3434,
       3460, 3465, 3478, 3480, 3510, 3513, 3523, 3526, 3533, 3545, 3608,
       3669, 3686, 3708, 3714, 3727, 3729, 3743, 3766, 3866, 3870, 3908,
       3919, 3937, 3967, 3980, 4043, 4066, 4115, 4121, 4260, 4324, 4334,
       4415, 4418, 4423, 4431, 4437, 4444, 4462, 4491, 4494, 4522, 4760,
       4762, 4836, 4905, 4911, 5004, 5020, 5152, 51

In [161]:
tr_data_loader = SessionDataLoader(tr_dataset, batch_size=4)
tr_dataset.df.head(10)

Unnamed: 0,UserId,ItemId,Rating,Time,item_idx
5170,36,1387,5,2002-03-12 03:46:59,0
5267,36,1201,4,2002-03-12 03:46:59,1
5122,36,1291,5,2002-03-12 03:47:16,2
5123,36,2167,5,2002-03-12 03:48:25,3
5290,36,2951,4,2002-03-12 03:48:25,4
5359,36,2115,5,2002-03-12 03:48:25,5
5073,36,1912,5,2002-03-12 03:48:44,6
5113,36,2662,3,2002-03-12 03:48:44,7
5366,36,2985,4,2002-03-12 03:49:01,8
5145,36,198,5,2002-03-12 03:49:48,9


In [162]:
iter_ex = iter(tr_data_loader)

In [163]:
inputs, labels, mask =  next(iter_ex)
print(f'Model Input Item Idx are : {inputs}')
print(f'Label Item Idx are : {"":5} {labels}')
print(f'Previous Masked Input Idx are {mask}')

Model Input Item Idx are : [ 0 30 37 48]
Label Item Idx are :       [ 1 31 38 49]
Previous Masked Input Idx are []


### Modeling
### Evaluation Metric

In [164]:
def mrr_k(pred, truth: int, k: int):
    indexing = np.where(pred[:k] == truth)[0]
    if len(indexing) > 0:
        return 1 / (indexing[0] + 1)
    else:
        return 0


def recall_k(pred, truth: int, k: int) -> int:
    answer = truth in pred[:k]
    return int(answer)

### Model Architecture

In [200]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, GRU
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

In [201]:
def create_model(args):
    inputs = Input(batch_shape=(args.batch_size, 1, args.num_items))
    gru, _ = GRU(args.hsz, stateful=True, return_state=True, name='GRU')(inputs)
    dropout = Dropout(args.drop_rate)(gru)
    predictions = Dense(args.num_items, activation='softmax')(dropout)
    model = Model(inputs=inputs, outputs=[predictions])
    model.compile(loss=categorical_crossentropy, optimizer=Adam(args.lr), metrics=['accuracy'])
    model.summary()
    return model

In [211]:
class Args:
    def __init__(self, tr, val, test, batch_size, hsz, drop_rate, lr, epochs, k):
        self.tr = tr
        self.val = val
        self.test = test
        self.num_items = tr['ItemId'].nunique()
        self.num_sessions = tr['UserId'].nunique()
        self.batch_size = batch_size
        self.hsz = hsz
        self.drop_rate = drop_rate
        self.lr = lr
        self.epochs = epochs
        self.k = k

args = Args(tr, val, test, batch_size=64, hsz=50, drop_rate=0.1, lr=0.001, epochs=29, k=20)

In [212]:
model = create_model(args)

Model: "model_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_15 (InputLayer)        [(64, 1, 1503)]           0         
_________________________________________________________________
GRU (GRU)                    [(64, 50), (64, 50)]      233250    
_________________________________________________________________
dropout_14 (Dropout)         (64, 50)                  0         
_________________________________________________________________
dense_14 (Dense)             (64, 1503)                76653     
Total params: 309,903
Trainable params: 309,903
Non-trainable params: 0
_________________________________________________________________


### Model Training

In [213]:
# train 셋으로 학습하면서 valid 셋으로 검증합니다.
def train_model(model, args):
    train_dataset = SessionDataset(args.tr)
    train_loader = SessionDataLoader(train_dataset, batch_size=args.batch_size)

    for epoch in range(1, args.epochs + 1):
        total_step = len(args.tr) - args.tr['UserId'].nunique()
        tr_loader = tqdm(train_loader, total=total_step // args.batch_size, desc='Train', mininterval=1)
        for feat, target, mask in tr_loader:
            reset_hidden_states(model, mask)  # 종료된 session은 hidden_state를 초기화합니다. 아래 메서드에서 확인해주세요.

            input_ohe = to_categorical(feat, num_classes=args.num_items)
            input_ohe = np.expand_dims(input_ohe, axis=1)
            target_ohe = to_categorical(target, num_classes=args.num_items)

            result = model.train_on_batch(input_ohe, target_ohe)
            tr_loader.set_postfix(train_loss=result[0], accuracy = result[1])

        val_recall, val_mrr = get_metrics(args.val, model, args, args.k)  # valid set에 대해 검증합니다.

        print(f"\t - Recall@{args.k} epoch {epoch}: {val_recall:3f}")
        print(f"\t - MRR@{args.k}    epoch {epoch}: {val_mrr:3f}\n")


def reset_hidden_states(model, mask):
    gru_layer = model.get_layer(name='GRU')  # model에서 gru layer를 가져옵니다.
    hidden_states = gru_layer.states[0].numpy()  # gru_layer의 parameter를 가져옵니다.
    for elt in mask:  # mask된 인덱스 즉, 종료된 세션의 인덱스를 돌면서
        hidden_states[elt, :] = 0  # parameter를 초기화 합니다.
    gru_layer.reset_states(states=hidden_states)


def get_metrics(data, model, args, k: int):  # valid셋과 test셋을 평가하는 코드입니다. 
                                             # train과 거의 같지만 mrr, recall을 구하는 라인이 있습니다.
    dataset = SessionDataset(data)
    loader = SessionDataLoader(dataset, batch_size=args.batch_size)
    recall_list, mrr_list = [], []

    total_step = len(data) - data['UserId'].nunique()
    for inputs, label, mask in tqdm(loader, total=total_step // args.batch_size, desc='Evaluation', mininterval=1):
        reset_hidden_states(model, mask)
        input_ohe = to_categorical(inputs, num_classes=args.num_items)
        input_ohe = np.expand_dims(input_ohe, axis=1)

        pred = model.predict(input_ohe, batch_size=args.batch_size)
        pred_arg = tf.argsort(pred, direction='DESCENDING')  # softmax 값이 큰 순서대로 sorting 합니다.

        length = len(inputs)
        recall_list.extend([recall_k(pred_arg[i], label[i], k) for i in range(length)])
        mrr_list.extend([mrr_k(pred_arg[i], label[i], k) for i in range(length)])

    recall, mrr = np.mean(recall_list), np.mean(mrr_list)
    return recall, mrr

In [214]:
# 학습 시간이 다소 오래 소요됩니다. 아래 주석을 풀지 마세요.
train_model(model, args)

# 학습된 모델을 불러옵니다.
#model = tf.keras.models.load_model(data_path / 'trained_model')

Train:  66%|██████▌   | 97/148 [00:02<00:01, 44.86it/s, accuracy=0, train_loss=7.29]    
Evaluation:  24%|██▎       | 9/38 [00:03<00:09,  2.93it/s]


	 - Recall@20 epoch 1: 0.062500
	 - MRR@20    epoch 1: 0.010442



Train:  66%|██████▌   | 97/148 [00:00<00:00, 103.23it/s, accuracy=0, train_loss=7.16]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.20it/s]


	 - Recall@20 epoch 2: 0.072917
	 - MRR@20    epoch 2: 0.010793



Train:  66%|██████▌   | 97/148 [00:00<00:00, 102.77it/s, accuracy=0, train_loss=7.1]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.19it/s]


	 - Recall@20 epoch 3: 0.083333
	 - MRR@20    epoch 3: 0.013568



Train:  66%|██████▌   | 97/148 [00:00<00:00, 102.19it/s, accuracy=0, train_loss=7.06]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.10it/s]


	 - Recall@20 epoch 4: 0.090278
	 - MRR@20    epoch 4: 0.013963



Train:  66%|██████▌   | 97/148 [00:00<00:00, 104.45it/s, accuracy=0, train_loss=7.04]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.16it/s]


	 - Recall@20 epoch 5: 0.088542
	 - MRR@20    epoch 5: 0.013706



Train:  66%|██████▌   | 97/148 [00:00<00:00, 105.05it/s, accuracy=0, train_loss=7.01]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.16it/s]


	 - Recall@20 epoch 6: 0.088542
	 - MRR@20    epoch 6: 0.013873



Train:  66%|██████▌   | 97/148 [00:00<00:00, 101.38it/s, accuracy=0, train_loss=6.98]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.18it/s]


	 - Recall@20 epoch 7: 0.086806
	 - MRR@20    epoch 7: 0.014990



Train:  66%|██████▌   | 97/148 [00:00<00:00, 104.44it/s, accuracy=0, train_loss=6.93]
Evaluation:  24%|██▎       | 9/38 [00:02<00:08,  3.23it/s]


	 - Recall@20 epoch 8: 0.081597
	 - MRR@20    epoch 8: 0.015097



Train:  66%|██████▌   | 97/148 [00:00<00:00, 100.95it/s, accuracy=0, train_loss=6.89]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.17it/s]


	 - Recall@20 epoch 9: 0.086806
	 - MRR@20    epoch 9: 0.016114



Train:  66%|██████▌   | 97/148 [00:00<00:00, 102.45it/s, accuracy=0.0156, train_loss=6.84]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.10it/s]


	 - Recall@20 epoch 10: 0.085069
	 - MRR@20    epoch 10: 0.017344



Train:  66%|██████▌   | 97/148 [00:00<00:00, 100.87it/s, accuracy=0.0156, train_loss=6.76]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.20it/s]


	 - Recall@20 epoch 11: 0.085069
	 - MRR@20    epoch 11: 0.019340



Train:  66%|██████▌   | 97/148 [00:00<00:00, 101.86it/s, accuracy=0.0312, train_loss=6.67]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.13it/s]


	 - Recall@20 epoch 12: 0.079861
	 - MRR@20    epoch 12: 0.020315



Train:  66%|██████▌   | 97/148 [00:00<00:00, 104.27it/s, accuracy=0.0312, train_loss=6.6]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.18it/s]


	 - Recall@20 epoch 13: 0.085069
	 - MRR@20    epoch 13: 0.018926



Train:  66%|██████▌   | 97/148 [00:00<00:00, 104.32it/s, accuracy=0.0781, train_loss=6.48]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.16it/s]


	 - Recall@20 epoch 14: 0.079861
	 - MRR@20    epoch 14: 0.018900



Train:  66%|██████▌   | 97/148 [00:00<00:00, 101.87it/s, accuracy=0.109, train_loss=6.35]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.15it/s]


	 - Recall@20 epoch 15: 0.076389
	 - MRR@20    epoch 15: 0.020671



Train:  66%|██████▌   | 97/148 [00:00<00:00, 103.23it/s, accuracy=0.125, train_loss=6.22]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.20it/s]


	 - Recall@20 epoch 16: 0.076389
	 - MRR@20    epoch 16: 0.024730



Train:  66%|██████▌   | 97/148 [00:00<00:00, 104.03it/s, accuracy=0.125, train_loss=6.07]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.16it/s]


	 - Recall@20 epoch 17: 0.074653
	 - MRR@20    epoch 17: 0.021550



Train:  66%|██████▌   | 97/148 [00:00<00:00, 101.15it/s, accuracy=0.125, train_loss=5.92]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.17it/s]


	 - Recall@20 epoch 18: 0.078125
	 - MRR@20    epoch 18: 0.023352



Train:  66%|██████▌   | 97/148 [00:00<00:00, 100.79it/s, accuracy=0.156, train_loss=5.73]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.22it/s]


	 - Recall@20 epoch 19: 0.085069
	 - MRR@20    epoch 19: 0.024430



Train:  66%|██████▌   | 97/148 [00:00<00:00, 99.64it/s, accuracy=0.188, train_loss=5.57]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.12it/s]


	 - Recall@20 epoch 20: 0.097222
	 - MRR@20    epoch 20: 0.024728



Train:  66%|██████▌   | 97/148 [00:00<00:00, 101.17it/s, accuracy=0.203, train_loss=5.39]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.19it/s]


	 - Recall@20 epoch 21: 0.095486
	 - MRR@20    epoch 21: 0.024216



Train:  66%|██████▌   | 97/148 [00:00<00:00, 102.73it/s, accuracy=0.188, train_loss=5.22]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.22it/s]


	 - Recall@20 epoch 22: 0.098958
	 - MRR@20    epoch 22: 0.025743



Train:  66%|██████▌   | 97/148 [00:00<00:00, 99.82it/s, accuracy=0.172, train_loss=5.09]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.18it/s]


	 - Recall@20 epoch 23: 0.092014
	 - MRR@20    epoch 23: 0.026294



Train:  66%|██████▌   | 97/148 [00:00<00:00, 101.96it/s, accuracy=0.281, train_loss=4.85]
Evaluation:  24%|██▎       | 9/38 [00:02<00:08,  3.27it/s]


	 - Recall@20 epoch 24: 0.090278
	 - MRR@20    epoch 24: 0.026052



Train:  66%|██████▌   | 97/148 [00:00<00:00, 103.83it/s, accuracy=0.281, train_loss=4.7]
Evaluation:  24%|██▎       | 9/38 [00:02<00:08,  3.23it/s]


	 - Recall@20 epoch 25: 0.102431
	 - MRR@20    epoch 25: 0.025269



Train:  66%|██████▌   | 97/148 [00:00<00:00, 103.71it/s, accuracy=0.328, train_loss=4.5]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.14it/s]


	 - Recall@20 epoch 26: 0.095486
	 - MRR@20    epoch 26: 0.024666



Train:  66%|██████▌   | 97/148 [00:00<00:00, 104.89it/s, accuracy=0.312, train_loss=4.33]
Evaluation:  24%|██▎       | 9/38 [00:02<00:08,  3.26it/s]


	 - Recall@20 epoch 27: 0.093750
	 - MRR@20    epoch 27: 0.025270



Train:  66%|██████▌   | 97/148 [00:00<00:00, 104.72it/s, accuracy=0.297, train_loss=4.13]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.18it/s]


	 - Recall@20 epoch 28: 0.088542
	 - MRR@20    epoch 28: 0.026044



Train:  66%|██████▌   | 97/148 [00:00<00:00, 101.51it/s, accuracy=0.328, train_loss=3.98]
Evaluation:  24%|██▎       | 9/38 [00:02<00:09,  3.21it/s]

	 - Recall@20 epoch 29: 0.088542
	 - MRR@20    epoch 29: 0.025910






In [215]:
def test_model(model, args, test):
    test_recall, test_mrr = get_metrics(test, model, args, 20)
    print(f"\t - Recall@{args.k}: {test_recall:3f}")
    print(f"\t - MRR@{args.k}: {test_mrr:3f}\n")

test_model(model, args, test)

Evaluation:  31%|███       | 29/94 [00:09<00:20,  3.13it/s]

	 - Recall@20: 0.044720
	 - MRR@20: 0.013243






### 데이터 전처리하는 과정에서 나는 최근 1년의 데이터만 썻는데 데이터량이 적어지는탓해 학습량이 충분하지 않은 문제점이있었다. 데이터를 더욱 보충하는것이 올바른 방향인것같다. 이것으로 Exploration17을 마친다.