In [1]:
import pandas as pd
import numpy as np

from copy import deepcopy
import torch
import random
import time
import json
import joblib
import pickle
import torch.utils.data as data_utils
import scipy.sparse as sp
from pathlib import Path
from sklearn.metrics import log_loss, roc_auc_score
from torch.utils.data.distributed import DistributedSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

tqdm.pandas()



In [2]:
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Set to the index of the GPU you want to use
# torch.cuda.set_per_process_memory_fraction(0.8)  # Limit memory usage to 80%

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
print(torch.cuda.is_available())

True


# Functions To Get Data

In [4]:
def select_sampler(train_data, val_data, test_data, user_count, item_count, args):
    if args.sample == 'random':
        return RandomNegativeSampler(train_data, val_data, test_data, user_count, item_count, args.negsample_size, args.seed, args.negsample_savefolder)
    elif args.sample == 'popular':
        return PopularNegativeSampler(train_data, val_data, test_data, user_count, item_count, args.negsample_size, args.seed, args.negsample_savefolder)

def mtl_data(path=None, args=None):
    if not path:
        return
    df = pd.read_csv(path, usecols=["user_id", "item_id", "click", "like", "video_category", "gender", "age", "hist_1", "hist_2",
                       "hist_3", "hist_4", "hist_5", "hist_6", "hist_7", "hist_8", "hist_9", "hist_10"])
   
    df['video_category'] = df['video_category'].astype(str)
    df = sample_data(df)
    if args.mtl_task_num == 2:
        label_columns = ['click', 'like']
        categorical_columns = ["user_id", "item_id", "video_category", "gender", "age", "hist_1", "hist_2",
                       "hist_3", "hist_4", "hist_5", "hist_6", "hist_7", "hist_8", "hist_9", "hist_10"]
    elif args.mtl_task_num == 1:
        label_columns = ['click']
        categorical_columns = ["user_id", "item_id", "video_category", "gender", "age", "hist_1", "hist_2",
                               "hist_3", "hist_4", "hist_5", "hist_6", "hist_7", "hist_8", "hist_9", "hist_10"]
    else:
        label_columns = ['like']
        categorical_columns = ["user_id", "item_id", "video_category", "gender", "age", "hist_1", "hist_2",
                               "hist_3", "hist_4", "hist_5", "hist_6", "hist_7", "hist_8", "hist_9", "hist_10"]
    user_columns = ["user_id", "gender", "age"]
    for col in tqdm(categorical_columns):
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

    new_columns = categorical_columns + label_columns
    df = df.reindex(columns=new_columns)

    user_feature_dict, item_feature_dict = {}, {}
    for idx, col in tqdm(enumerate(df.columns)):
        if col not in label_columns:
            if col in user_columns:
                user_feature_dict[col] = (len(df[col].unique()), idx)
            else:
                item_feature_dict[col] = (len(df[col].unique()), idx)

    df = df.sample(frac=1)
    train_len = int(len(df) * 0.8)
    train_df = df[:train_len]
    tmp_df = df[train_len:]
    val_df = tmp_df[:int(len(tmp_df)/2)]
    test_df = tmp_df[int(len(tmp_df)/2):]
    return train_df, val_df, test_df, user_feature_dict, item_feature_dict

def set_fenbu(row_data):
    tmp1 = row_data[row_data.click.isin([0])]
    tmp2 = row_data[row_data.click.isin([1])]
    data = []
    j = 0
    for i in tqdm(range(int(len(tmp2)/1000))):
        data.append(tmp2.iloc[i, :].values.tolist())
        data.extend(tmp1.iloc[j : j + 3, :].values.tolist())
        j = j + 3
    new_data = pd.DataFrame(data, columns=row_data.columns)
    return new_data

def sample_data(df):
    p_df = df[df.click.isin([1])]
    n_df = df[df.click.isin([0])]
    del df
    n_df = n_df.sample(n=len(p_df)*2)
    df = p_df.append(n_df)
    del p_df, n_df
    df = df.sample(frac=1)
    return df


def gen_list(hist):
    data = []
    for key, value in tqdm(hist.items()):
        for v in value:
            data.append([key, v])
    return data


def get_ur(df):
    """
    Method of getting user-rating pairs
    Parameters
    ----------
    df : pd.DataFrame, rating dataframe

    Returns
    -------
    ur : dict, dictionary stored user-items interactions
    """
    print("Method of getting user-rating pairs")
    ur = df.groupby('user_id').item_id.apply(list).to_dict()
    return ur

def category_encoding(df):
    df['user_id'] = pd.Categorical(df['user_id']).codes
    df['item_id'] = pd.Categorical(df['item_id']).codes
    return df

def gen_hist_matrix(data, user_num, item_num, train_dict):
    max_len = 0
    for _, v in train_dict.items():
        if max_len < len(v):
            max_len = len(v)
    if max_len > item_num * 0.2:
        print(f'Max value of user history interaction records has reached: {max_len / item_num * 100:.4f}% of the total.')
    history_matrix = np.zeros((user_num+1, max_len), dtype=np.int64)
    history_value = np.zeros((user_num+1, max_len))
    history_len = np.zeros(user_num+1, dtype=np.int64)

    for user, item in data:
        history_matrix[user, history_len[user]] = item
        history_value[user, history_len[user]] = 1
        history_len[user] += 1
    return torch.LongTensor(history_matrix), torch.FloatTensor(history_value), torch.LongTensor(history_len)


def get_history_matrix(df, args, row='user_id', use_config_value_name=False):
    '''
    get the history interactions by user/item
    '''
    # logger = config['logger']
    assert row in df.columns, f'invalid name {row}: not in columns of history dataframe'
    # uid_name, iid_name  = config['UID_NAME'], config['IID_NAME']
    user_ids, item_ids = df['user_id'].values, df['item_id'].values
    value_name = 'click' if use_config_value_name else None

    user_num, item_num = args.num_users, args.num_items
    values = np.ones(len(df)) if value_name is None else df[value_name].values

    if row == 'user':
        row_num, max_col_num = user_num, item_num
        row_ids, col_ids = user_ids, item_ids
    else: # 'item'
        row_num, max_col_num = item_num, user_num
        row_ids, col_ids = item_ids, user_ids

    history_len = np.zeros(row_num, dtype=np.int16)
    for row_id in row_ids:
        history_len[row_id] += 1

    col_num = np.max(history_len)
    col_num = col_num.astype(int)
    if col_num > max_col_num * 0.2:
        print(f'Max value of {row}\'s history interaction records has reached: {col_num / max_col_num * 100:.4f}% of the total.')

    history_matrix = np.zeros((row_num, col_num), dtype=np.int16)
    history_value = np.zeros((row_num, col_num), dtype=np.int16)
    history_len[:] = 0
    for row_id, value, col_id in zip(row_ids, values, col_ids):
        history_matrix[row_id, history_len[row_id]] = col_id
        history_value[row_id, history_len[row_id]] = value
        history_len[row_id] += 1

    return torch.LongTensor(history_matrix), torch.FloatTensor(history_value), torch.LongTensor(history_len)

def get_inter_matrix(df, args, form='coo'):
    '''
    get the whole sparse interaction matrix
    '''
    print("get the whole sparse interaction matrix")
    user_num, item_num = args.num_users, args.num_items

    src, tar = df['user_id'].values, df['item_id'].values
    data = df['click'].values

    mat = sp.coo_matrix((data, (src, tar)), shape=(user_num, item_num))

    if form == 'coo':
        return mat
    elif form == 'csr':
        return mat.tocsr()
    else:
        raise NotImplementedError(f'Sparse matrix format [{form}] has not been implemented...')


def build_candidates_set(test_ur, train_ur, args, drop_past_inter=True):
    """
    method of building candidate items for ranking
    Parameters
    ----------
    test_ur : dict, ground_truth that represents the relationship of user and item in the test set
    train_ur : dict, this represents the relationship of user and item in the train set
    item_num : No. of all items
    cand_num : int, the number of candidates
    drop_past_inter : drop items already appeared in train set

    Returns
    -------
    test_ucands : dict, dictionary storing candidates for each user in test set
    """
    item_num = args.num_items
    candidates_num = args.cand_num

    test_ucands, test_u = [], []
    for u, r in tqdm(test_ur.items()):
        sample_num = candidates_num - len(r) if len(r) <= candidates_num else 0
        if sample_num == 0:
            samples = np.random.choice(list(r), candidates_num)
        else:
            pos_items = list(r) + list(train_ur[u]) if drop_past_inter else list(r)
            # neg_items = np.setdiff1d(np.arange(item_num), pos_items)
            # samples = np.random.choice(neg_items, size=sample_num)
            samples = []
            for _ in range(sample_num):
                item = np.random.choice(item_num)
                while item in pos_items or item in samples:
                    item = np.random.choice(item_num)
                samples.append(item)
            samples = np.array(samples)
            samples = np.concatenate((samples, list(r)), axis=None)

        test_ucands.append([u, samples])
        test_u.append(u)

    return test_u, test_ucands

class BasicDataset(data_utils.Dataset):
    def __init__(self, samples):
        '''
        convert array-like <u, i, j> / <u, i, r> / <target_i, context_i, label>

        Parameters
        ----------
        samples : np.array
            samples generated by sampler
        '''
        super(BasicDataset, self).__init__()
        self.data = samples

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index][0], self.data[index][1], self.data[index][2]


def split_warm_hot(df, item_min):
    user_counts = df.groupby('user_id').size()
    w_user_subset = np.in1d(df.user_id, user_counts[user_counts >= item_min].index)
    c_user_subset = np.in1d(df.user_id, user_counts[(user_counts <= 5) & (user_counts > 1)].index)
    w_filter_df = df[w_user_subset].reset_index(drop=True)
    c_filter_df = df[c_user_subset].reset_index(drop=True)
    return w_filter_df, c_filter_df

def transferdataset(args):
    path = args.path
    item_min = args.item_min
    if not path:
        return
    df = pd.read_csv(path, usecols=['user_id', 'item_id', 'click'])

    df = df[df.click.isin([1])]
    user_counts = df.groupby('user_id').size()
    user_subset = np.in1d(df.user_id, user_counts[user_counts >= item_min].index)
    filter_df = df[user_subset].reset_index(drop=True)

    user_count = len(set(filter_df['user_id']))
    item_count = len(set(filter_df['item_id']))

    assert (filter_df.groupby('user_id').size() < item_min).sum() == 0
    del df

    reset_ob = reset_df()
    filter_df = reset_ob.fit_transform(filter_df)

    user_history = {}
    print("+++user_history+++")
    savefile_path = Path("data/history_{}.pkl".format(args.seed))
    if savefile_path.is_file():
        with open(savefile_path, "rb") as load_f:
            user_history = pickle.load(load_f)
    else:
        for uid in tqdm(filter_df.user_id.unique()):
            dataframe = filter_df[filter_df.user_id == uid].item_id
            sequence = dataframe.values.tolist()
            user_history[uid] = sequence
        with open(savefile_path, "wb") as dump_f:
            pickle.dump(user_history, dump_f)
    return filter_df, user_history, user_count, item_count

def sequencedataset(item_min, args, path=None):
    if '2' in path:
        df = pd.read_csv(path, usecols=['user_id', 'item_id', 'like'])
        df = df[df.like.isin([1])]
    else:
        df = pd.read_csv(path, usecols=['user_id', 'item_id', 'click'])
        df = df[df.click.isin([1])]
    user_counts = df.groupby('user_id').size()
    user_subset = np.in1d(df.user_id, user_counts[user_counts >= item_min].index)
    filter_df = df[user_subset].reset_index(drop=True)

    assert (filter_df.groupby('user_id').size() < item_min).sum() == 0
    user_count = len(set(filter_df['user_id']))
    item_count = len(set(filter_df['item_id']))
    del df

    reset_ob = reset_df()
    filter_df = reset_ob.fit_transform(filter_df)
    print("+++user_history+++")
    user_history = filter_df.groupby('user_id').item_id.apply(list).to_dict()
    return filter_df, user_history, user_count, item_count

def data_count(df, item_min, target=False):
    user_counts = df.groupby('user_id').size()
    if target:
        filter_df = df
    else:
        user_subset = np.in1d(df.user_id, user_counts[user_counts >= item_min].index)
        filter_df = df[user_subset].reset_index(drop=True)

        assert (filter_df.groupby('user_id').size() < item_min).sum() == 0
    user_count = len(set(filter_df['user_id']))
    item_count = len(set(filter_df['item_id']))
    return filter_df, user_count, item_count


def construct_data(args, item_min):
    path1 = args['target_path']
    path2 = args['source_path']
    df1 = pd.read_csv(path1, usecols=['user_id', 'item_id', 'click'])
    df1 = df1[df1.click.isin([1])]
    df2 = pd.read_csv(path2, usecols=['user_id', 'item_id', 'click'])
    df2 = df2[df2.click.isin([1])]
    user_counts = df2.groupby('user_id').size()
    user_subset = np.in1d(df2.user_id, user_counts[user_counts >= item_min].index)
    df2 = df2[user_subset].reset_index(drop=True)

    assert (df2.groupby('user_id').size() < item_min).sum() == 0
    s_item_count = len(set(df2['item_id']))
    reset_ob = cold_reset_df()
    df2, df1 = reset_ob.fit_transform(df2, df1)
    user1 = set(df1.user_id.values.tolist())
    user2 = set(df2.user_id.values.tolist())
    user = user1 & user2
    df1 = df1[df1.user_id.isin(list(user))]
    df2 = df2[df2.user_id.isin(list(user))]
    new_data1 = []
    new_data2 = []
    for u in user:
        tmp_data2 = df2[df2.user_id == u][:-3].values.tolist()
        tmp_data1 = df1[df1.user_id == u].values.tolist()
        new_data1.extend(tmp_data1)
        new_data2.extend(tmp_data2)
        
    new_data1 = pd.DataFrame(new_data1, columns=df1.columns)
    new_data2 = pd.DataFrame(new_data2, columns=df2.columns)
    user_count = len(set(new_data1.user_id.values.tolist()))
    reset_item = item_reset_df()
    new_data1 = reset_item.fit_transform(new_data1)
    t_item_count = len(set(new_data1['item_id']))
    return new_data1, new_data2, user_count, t_item_count, s_item_count


def construct_ch_data(args, item_min):
    path1 = args['target_path']
    path2 = args['source_path']

    df1 = pd.read_csv(path1, usecols=['user_id', 'item_id', 'click'])
    df1 = df1[df1.click.isin([1])]
    df2 = pd.read_csv(path2, usecols=['user_id', 'item_id', 'click'])
    df2 = df2[df2.click.isin([1])]

    user_counts = df2.groupby('user_id').size()
    user_subset = np.in1d(df2.user_id, user_counts[user_counts >= item_min].index)
    df2 = df2[user_subset].reset_index(drop=True)

    assert (df2.groupby('user_id').size() < item_min).sum() == 0
    s_item_count = len(set(df2['item_id']))
    reset_ob = cold_reset_df()
    df2, df1 = reset_ob.fit_transform(df2, df1)

    user1 = set(df1.user_id.values.tolist())
    user2 = set(df2.user_id.values.tolist())
    user = user1 & user2
    df1 = df1[df1.user_id.isin(list(user))]
    df2 = df2[df2.user_id.isin(list(user))]

    # cold and hot user
    user_counts1 = df1.groupby('user_id').size()
    cold_user_ind = np.in1d(df1.user_id, user_counts1[user_counts1 <= 5].index)
    hot_user_ind = np.in1d(df1.user_id, user_counts1[user_counts1 > 5].index)

    cold_user = set(df1[cold_user_ind].user_id.values.tolist())
    hot_user = set(df1[hot_user_ind].user_id.values.tolist())

    new_data1 = []
    new_data2 = []
    for u in user:
        tmp_data2 = df2[df2.user_id == u][:-3].values.tolist()
        tmp_data1 = df1[df1.user_id == u].values.tolist()
        new_data1.extend(tmp_data1)
        new_data2.extend(tmp_data2)
    new_data1 = pd.DataFrame(new_data1, columns=df1.columns)
    new_data2 = pd.DataFrame(new_data2, columns=df2.columns)
    user_count = len(set(new_data1.user_id.values.tolist()))
    reset_item = item_reset_df()
    new_data1 = reset_item.fit_transform(new_data1)
    t_item_count = len(set(new_data1['item_id']))
    return new_data1, new_data2, user_count, t_item_count, s_item_count, cold_user, hot_user

def colddataset(item_min, args, path=None):
    target_data, source_data, user_count, t_item_count, s_item_count = construct_data(args, item_min)
    print("+++user_history+++")
    user_history = source_data.groupby('user_id').item_id.apply(list).to_dict()
    target = target_data.groupby('user_id').item_id.apply(list).to_dict()

    examples = []
    for u, t_list in tqdm(target.items()):
        for t in t_list:
            e_list = [user_history[u] + [0], t]
            examples.append(e_list)
    examples = pd.DataFrame(examples, columns=['source', 'target'])
    return examples, user_count, s_item_count, t_item_count


def utils(df, args):
    if args.user_profile == 'gender':
        df = df[df.gender != 0]
        df = df[df.click.isin([1])]
        df['gender'] = df['gender'] - 1
        user_counts = df.groupby('user_id').size()
        user_subset = np.in1d(df.user_id, user_counts[user_counts >= args.item_min].index)
        filter_df = df[user_subset].reset_index(drop=True)
    elif args.user_profile == 'age':
        df = df[df.age != 0]
        df = df[df.click.isin([1])]
        df['age'] = df['age'] - 1
        user_counts = df.groupby('user_id').size()
        user_subset = np.in1d(df.user_id, user_counts[user_counts >= args.item_min].index)
        filter_df = df[user_subset].reset_index(drop=True)

    return filter_df

def gender_df(filter_df, args):
    if args.user_profile == 'gender':
        gender_list = []
        for uid in tqdm(filter_df.user_id.unique()):
            dataframe = filter_df[filter_df.user_id == uid].item_id
            sequence = dataframe.values.tolist()
            gender = filter_df[filter_df.user_id == uid].gender.values[0]
            list = [uid, sequence, gender]
            gender_list.append(list)
        profile_df = pd.DataFrame(gender_list, columns=['uid', 'history', 'profile'])
    else:
        age_list = []
        for uid in tqdm(filter_df.user_id.unique()):
            dataframe = filter_df[filter_df.user_id == uid].item_id
            sequence = dataframe.values.tolist()
            age = filter_df[filter_df.user_id == uid].age.values[0]
            list = [uid, sequence, age]
            age_list.append(list)
        profile_df = pd.DataFrame(age_list, columns=['uid', 'history', 'profile'])

    return profile_df

def train_val_test_split(user_history):
    if not user_history:
        return
    train_history = {}
    val_history = {}
    test_history = {}
    for key, history in tqdm(user_history.items()):
        train_history[key] = history[:-2]
        val_history[key] = history[-2:-1]
        test_history[key] = history[-1:]
    return train_history, val_history, test_history

def val_test_split(user_history):
    if not user_history:
        return
    val_history = {}
    test_history = {}
    val_len = int(len(user_history) / 5)
    test_len = int(len(user_history))
    i = 0
    for key, history in tqdm(user_history.items()):
        if i < val_len:
            val_history[key] = history
            i += 1
        elif i >= val_len and i < test_len:
            test_history[key] = history
            i += 1
    return val_history, test_history

class item_reset_df(object):

    def __init__(self):
        print("=" * 10, "Initialize Reset DataFrame Object", "=" * 10)
        self.item_enc = LabelEncoder()

    def fit_transform(self, df):
        print("=" * 10, "Resetting item ids in DataFrame", "=" * 10)
        df['item_id'] = self.item_enc.fit_transform(df['item_id']) + 1
        return df

    def inverse_transform(self, df):
        df['item_id'] = self.item_enc.inverse_transform(df['item_id']) - 1
        return df

class reset_df(object):

    def __init__(self):
        print("=" * 10, "Initialize Reset DataFrame Object", "=" * 10)
        self.item_enc = LabelEncoder()
        self.user_enc = LabelEncoder()

    def fit_transform(self, df):
        print("=" * 10, "Resetting user ids and item ids in DataFrame", "=" * 10)
        df['item_id'] = self.item_enc.fit_transform(df['item_id']) + 1
        df['user_id'] = self.user_enc.fit_transform(df['user_id']) + 1
        return df

    def inverse_transform(self, df):
        df['item_id'] = self.item_enc.inverse_transform(df['item_id']) - 1
        df['user_id'] = self.user_enc.inverse_transform(df['user_id']) - 1
        return df

class cold_reset_df(object):

    def __init__(self):
        print("=" * 10, "Initialize Reset DataFrame Object", "=" * 10)
        self.item_enc1 = LabelEncoder()
        self.item_enc2 = LabelEncoder()
        self.user_enc = LabelEncoder()

    def fit_transform(self, df1, df2):
        print("=" * 10, "Resetting user ids and item ids in DataFrame", "=" * 10)
        df = pd.concat([df1['user_id'], df2['user_id']])
        df = self.user_enc.fit_transform(df) + 1
        df1['item_id'] = self.item_enc1.fit_transform(df1['item_id']) + 1
        df1['user_id'] = df[:len(df1)]
        df2['item_id'] = self.item_enc2.fit_transform(df2['item_id']) + 1
        df2['user_id'] = df[len(df1):]
        return df1, df2

    def inverse_transform(self, df):
        df['item_id'] = self.item_enc.inverse_transform(df['item_id']) - 1
        df['user_id'] = self.user_enc.inverse_transform(df['user_id']) - 1
        return df

class mtlDataSet(data_utils.Dataset):
    def __init__(self, data, args):
        self.feature = data[0]
        self.args = args
        if args.mtl_task_num == 2:
            self.label1 = data[1]
            self.label2 = data[2]
        else:
            self.label = data[1]

    def __getitem__(self, index):
        feature = self.feature[index]
        if self.args.mtl_task_num == 2:
            label1 = self.label1[index]
            label2 = self.label2[index]
            return feature, label1, label2
        else:
            label = self.label[index]
            return feature, label

    def __len__(self):
        return len(self.feature)

class ProfileDataset(data_utils.Dataset):
    def __init__(self, x, y, max_len, mask_token):
        self.seqs = x
        self.targets = y
        self.max_len = max_len
        self.mask_token = mask_token

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, index):
        seq = self.seqs[index]
        target = self.targets[index]
        seq = seq + [self.mask_token]
        seq = seq[-self.max_len:]
        seq_len = len(seq)
        seq_mask_len = self.max_len - seq_len
        seq = [0] * seq_mask_len + seq
        return torch.LongTensor(seq), torch.LongTensor([target])

class BuildTrainDataset(data_utils.Dataset):
    def __init__(self, u2seq, max_len, mask_prob, mask_token, num_items, rng):#
        self.u2seq = u2seq
        self.users = sorted(self.u2seq.keys())
        self.max_len = max_len
        self.mask_prob = mask_prob
        self.mask_token = mask_token
        self.num_items = num_items
        self.rng = rng

    def __len__(self):
        return len(self.users)

    def __getitem__(self, index):
        user = self.users[index]
        seq = self._getseq(user)

        tokens = seq[:-1]
        labels = seq[1:]
        #
        tokens = tokens[-self.max_len:]
        labels = labels[-self.max_len:]

        x_len = len(tokens)
        y_len = len(labels)

        x_mask_len = self.max_len - x_len
        y_mask_len = self.max_len - y_len


        tokens = [0] * x_mask_len + tokens
        labels = [0] * y_mask_len + labels

        return torch.LongTensor(tokens), torch.LongTensor(labels)

    def _getseq(self, user):
        return self.u2seq[user]

class ColdDataset(data_utils.Dataset):
    def __init__(self, x, y, max_len, mask_token):
        self.seqs = x
        self.targets = y
        self.max_len = max_len
        self.mask_token = mask_token

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, index):
        seq = self.seqs[index]
        target = self.targets[index]
        seq = seq[-self.max_len:]
        seq_len = len(seq)
        seq_mask_len = self.max_len - seq_len
        seq = [0] * seq_mask_len + seq
        return torch.LongTensor(seq), torch.LongTensor([target])

class ColdEvalDataset(data_utils.Dataset):
    def __init__(self, x, y, max_len, mask_token, num_item):
        self.seqs = x
        self.targets = y
        self.max_len = max_len
        self.mask_token = mask_token
        self.num_item = num_item + 1

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, index):
        seq = self.seqs[index]
        target = self.targets[index]
        labels = [0] * self.num_item
        labels[target] = 1
        seq = seq[-self.max_len:]
        seq_len = len(seq)
        seq_mask_len = self.max_len - seq_len
        seq = [self.mask_token] * seq_mask_len + seq
        return torch.LongTensor(seq), torch.LongTensor(labels)

class pos_neg_TrainDataset(data_utils.Dataset):
    def __init__(self, u2seq, max_len, mask_token, num_items):
        self.u2seq = u2seq
        self.users = sorted(self.u2seq.keys())
        self.max_len = max_len
        self.mask_token = mask_token
        self.num_items = num_items

    def __len__(self):
        return len(self.users)

    def __getitem__(self, index):
        user = self.users[index]
        seq = self._getseq(user)
        tokens = seq[:-1]
        pos = seq[1:]

        tokens = tokens[-self.max_len:]
        pos = pos[-self.max_len:]
        seen = set(tokens)
        seen.update(pos)
        neg = []
        for _ in range(len(pos)):
            item = np.random.choice(self.num_items + 1)  #
            while item in seen or item in neg:
                item = np.random.choice(self.num_items + 1)  #
            neg.append(item)

        neg = neg[-self.max_len:]

        x_len = len(tokens)
        p_len = len(pos)
        n_len = len(neg)

        x_mask_len = self.max_len - x_len
        p_mask_len = self.max_len - p_len
        n_mask_len = self.max_len - n_len

        tokens = [self.mask_token] * x_mask_len + tokens
        pos = [self.mask_token] * p_mask_len + pos
        neg = [self.mask_token] * n_mask_len + neg

        return torch.LongTensor(tokens), torch.LongTensor(pos), torch.LongTensor(neg)#, torch.LongTensor([x_len]), torch.LongTensor([user])

    def _getseq(self, user):
        return self.u2seq[user]

class CFData(data_utils.Dataset):
    def __init__(self, features,
            num_item, train_dict, num_ng=0, is_training=None):
        """ Note that the labels are only useful when training, we thus
            add them in the ng_sample() function.
        """
        self.features_ps = features
        self.num_item = num_item
        self.train_dict = train_dict
        self.num_ng = num_ng
        self.is_training = is_training
        self.labels = [0 for _ in range(len(features))]

    def ng_sample(self):
        assert self.is_training, 'no need to sampling when testing'

        self.features_ng = []
        for x, value in self.train_dict.items():
            u = x
            for t in range(self.num_ng * len(value)):
                j = np.random.randint(1, self.num_item)
                while j in value:
                    j = np.random.randint(1, self.num_item)
                self.features_ng.append([u, j])

        labels_ps = [1 for _ in range(len(self.features_ps))]
        labels_ng = [0 for _ in range(len(self.features_ng))]

        self.features_fill = self.features_ps + self.features_ng
        self.labels_fill = labels_ps + labels_ng

    def __len__(self):
        return (self.num_ng + 1) * len(self.labels)

    def __getitem__(self, idx):
        features = self.features_fill if self.is_training \
                    else self.features_ps
        labels = self.labels_fill if self.is_training \
                    else self.labels

        user = features[idx][0]
        item = features[idx][1]
        label = labels[idx]
        return user, item, label

class BertTrainDataset(data_utils.Dataset):
    def __init__(self, u2seq, max_len, mask_prob, mask_token, num_items, rng):
        self.u2seq = u2seq
        self.users = sorted(self.u2seq.keys())
        self.max_len = max_len
        self.mask_prob = mask_prob
        self.mask_token = mask_token
        self.num_items = num_items
        self.rng = rng

    def __len__(self):
        return len(self.users)

    def __getitem__(self, index):
        user = self.users[index]
        seq = self._getseq(user)

        tokens = []
        labels = []
        for s in seq:
            prob = self.rng.random()
            if prob < self.mask_prob:
                prob /= self.mask_prob

                if prob < 0.8:
                    tokens.append(self.mask_token)
                elif prob < 0.9:
                    tokens.append(self.rng.randint(1, self.num_items))
                else:
                    tokens.append(s)

                labels.append(s)
            else:
                tokens.append(s)
                labels.append(0)

        tokens = tokens[-self.max_len:]
        labels = labels[-self.max_len:]

        mask_len = self.max_len - len(tokens)

        tokens = [0] * mask_len + tokens
        labels = [0] * mask_len + labels

        return torch.LongTensor(tokens), torch.LongTensor(labels)

    def _getseq(self, user):
        return self.u2seq[user]

class BuildEvalDataset(data_utils.Dataset):
    def __init__(self, u2seq, u2answer, max_len, mask_token, negative_samples):
        self.u2seq = u2seq
        self.users = sorted(self.u2seq.keys())
        self.u2answer = u2answer
        self.max_len = max_len
        self.mask_token = mask_token
        self.negative_samples = negative_samples

    def __len__(self):
        return len(self.users)

    def __getitem__(self, index):
        user = self.users[index]
        seq = self.u2seq[user]
        answer = self.u2answer[user]
        answer = answer[-1:]
        labels = answer
        seq = seq + [self.mask_token]
        seq = seq[-self.max_len:]
        padding_len = self.max_len - len(seq)
        seq = [self.mask_token] * padding_len + seq

        return torch.LongTensor(seq), torch.LongTensor(labels)

class Build_neg_EvalDataset(data_utils.Dataset):
    def __init__(self, u2seq, u2answer, max_len, mask_token, item_count, neg_samples):
        self.u2seq = u2seq
        self.users = sorted(self.u2seq.keys())
        self.u2answer = u2answer
        self.max_len = max_len
        self.mask_token = mask_token
        self.item_count = item_count
        self.neg_samples = neg_samples

    def __len__(self):
        return len(self.users)

    def __getitem__(self, index):
        user = self.users[index]
        seq = self.u2seq[user]
        answer = self.u2answer[user]
        answer = answer[-1:]
        negs = self.neg_samples[user]

        candidates = answer + negs
        labels = [1] * len(answer) + [0] * len(negs)
        seq = seq + [self.mask_token]
        seq = seq[-self.max_len:]
        padding_len = self.max_len - len(seq)
        seq = [self.mask_token] * padding_len + seq

        return torch.LongTensor(seq), torch.LongTensor(candidates), torch.LongTensor(labels)

class Build_full_EvalDataset(data_utils.Dataset):
    def __init__(self, u2seq, u2answer, max_len, mask_token, num_item):
        self.u2seq = u2seq
        self.users = sorted(self.u2seq.keys())
        self.u2answer = u2answer
        self.max_len = max_len
        self.mask_token = mask_token
        self.num_item = num_item + 1

    def __len__(self):
        return len(self.users)

    def __getitem__(self, index):
        user = self.users[index]
        seq = self.u2seq[user][:-1]
        answer = self.u2answer[user]
        answer = answer[-1:][0]


        labels = [0] * self.num_item
        labels[answer] = 1
        seq = seq + [self.mask_token]
        seq = seq[-self.max_len:]
        padding_len = self.max_len - len(seq)
        seq = [self.mask_token] * padding_len + seq

        return torch.LongTensor(seq), torch.LongTensor(labels)

class new_Build_full_EvalDataset(data_utils.Dataset):
    def __init__(self, u2seq, max_len, mask_token, num_item):
        self.u2seq = u2seq
        self.users = sorted(self.u2seq.keys())
        # self.u2answer = u2answer
        self.max_len = max_len
        self.mask_token = mask_token
        self.num_item = num_item

    def __len__(self):
        return len(self.users)

    def __getitem__(self, index):
        user = self.users[index]
        seq = self.u2seq[user][:-2]
        answer = self.u2seq[user][-1:][0]

        labels = [0] * self.num_item
        labels[answer] = 1
        seq = seq + [self.mask_token]
        seq = seq[-self.max_len:]
        padding_len = self.max_len - len(seq)
        seq = [self.mask_token] * padding_len + seq

        return torch.LongTensor(seq), torch.LongTensor(labels)

class AEDataset(data_utils.Dataset):
    def __init__(self, train_set, yield_col='user_id'):
        """
        covert user in train_set to array-like <u> / <i> for AutoEncoder-like algorithms
        Parameters
        ----------
        train_set : pd.DataFrame
            training set
        yield_col : string
            column name used to generate array
        """
        super(AEDataset, self).__init__()
        self.data = list(train_set[yield_col].unique())
        self.data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

class VAEDataset(data_utils.Dataset):
    def __init__(self, train_set):
        """
        covert user in train_set to array-like <u> / <i> for AutoEncoder-like algorithms
        Parameters
        ----------
        train_set : pd.DataFrame
            training set
        yield_col : string
            column name used to generate array
        """
        super(VAEDataset, self).__init__()
        self.data = train_set
        self.data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]


class Cf_valDataset(data_utils.Dataset):
    def __init__(self, data):
        super(Cf_valDataset, self).__init__()
        self.user = data
        # self.data = data

    def __len__(self):
        return len(self.user)

    def __getitem__(self, index):
        user = self.user[index]
        return torch.tensor(user)#, torch.tensor(self.data[user])


class CandidatesDataset(data_utils.Dataset):
    def __init__(self, ucands):
        super(CandidatesDataset, self).__init__()
        self.data = ucands

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return torch.tensor(self.data[index][0]), torch.tensor(self.data[index][1])

def get_train_loader(dataset, args):
    if args['is_parallel']:
        dataloader = data_utils.DataLoader(dataset, batch_size=args['train_batch_size'], sampler=DistributedSampler(dataset))
    else:
        dataloader = data_utils.DataLoader(dataset, batch_size=args['train_batch_size'], shuffle=True, pin_memory=True)
    return dataloader

def get_val_loader(dataset, args):
    if args['is_parallel']:
        dataloader = data_utils.DataLoader(dataset, batch_size=args['val_batch_size'], sampler=DistributedSampler(dataset))
    else:
        dataloader = data_utils.DataLoader(dataset, batch_size=args['val_batch_size'], shuffle=False, pin_memory=True)
    return dataloader

def get_test_loader(dataset, args):
    if args['is_parallel']:
        dataloader = data_utils.DataLoader(dataset, batch_size=args['test_batch_size'], sampler=DistributedSampler(dataset))
    else:
        dataloader = data_utils.DataLoader(dataset, batch_size=args['test_batch_size'], shuffle=False, pin_memory=True)
    return dataloader


In [5]:
def select_sampler(train_data, val_data, test_data, user_count, item_count, args):
    if args.sample == 'random':
        return RandomNegativeSampler(train_data, val_data, test_data, user_count, item_count, args.negsample_size, args.seed, args.negsample_savefolder)
    elif args.sample == 'popular':
        return PopularNegativeSampler(train_data, val_data, test_data, user_count, item_count, args.negsample_size, args.seed, args.negsample_savefolder)

In [6]:
def get_data(args):
    path = args['dataset_path']
    rng = random.Random(args['seed'])
    
    data, user_count, vocab_size, item_count = colddataset(args['item_min'], args)

    x_train, x_test, y_train, y_test = train_test_split(data.source.values.tolist(),
                                                            data.target.values.tolist(),
                                                            test_size=0.2, random_state=args['seed'])
    x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=args['seed'])

    args['num_users'] = user_count
    args['num_items'] = item_count
    args['num_embedding'] = vocab_size

    train_dataset, valid_dataset = ColdDataset(x_train, y_train, args['max_len'], args['pad_token']), ColdEvalDataset(
        x_val, y_val, args['max_len'], args['pad_token'], args['num_items'])
    test_dataset = ColdEvalDataset(x_test, y_test, args['max_len'], args['pad_token'], args['num_items'])
    train_dataloader = get_train_loader(train_dataset, args)
    valid_dataloader = get_val_loader(valid_dataset, args)
    test_dataloader = get_test_loader(test_dataset, args)
    return train_dataloader, valid_dataloader, test_dataloader
    

# Peter4ColdStart Model

In [7]:
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.init import uniform_, xavier_normal_, constant_, normal_

class Peter4Coldstart(nn.Module):

    def __init__(self, args):
        super(Peter4Coldstart, self).__init__()

        # load parameters info
        self.embedding_size = args['embedding_size']
        self.residual_channels = args['embedding_size'] 
        self.block_num = args['block_num']
        self.dilations = args['dilations'] * self.block_num
        self.kernel_size = args['kernel_size']
        self.output_dim = args['num_items']
        self.vocab_size = args['num_embedding']
        self.is_mp = args['is_mp']

        self.pad_token = args['pad_token']

        # define layers and loss
        self.item_embedding = nn.Embedding(self.vocab_size+1, self.embedding_size, padding_idx=self.pad_token)
    
        # residual blocks
        rb = [
            ResidualBlock_b_2mp_parallel(
                self.residual_channels, self.residual_channels, kernel_size=self.kernel_size, dilation=dilation, is_mp=self.is_mp
            ) for dilation in self.dilations
        ]
        self.residual_blocks = nn.Sequential(*rb)

        # fully-connected layer
        self.final_layer = nn.Linear(self.residual_channels, self.output_dim+1)
        # parameters initialization
        # self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Embedding):
            stdv = np.sqrt(1. / (self.output_dim+1))
            uniform_(module.weight.data, -stdv, stdv)
        elif isinstance(module, nn.Linear):
            # xavier_normal_(module.weight.data)
            normal_(module.weight.data, 0.0, 0.1)
            if module.bias is not None:
                constant_(module.bias.data, 0.1)

    def forward(self, item_seq):#, pos, neg
        item_seq_emb = self.item_embedding(item_seq)  # [batch_size, seq_len, embed_size]
        # Residual locks
        dilate_outputs = self.residual_blocks(item_seq_emb)
        seq_output = self.final_layer(dilate_outputs)  # [batch_size, embedding_size]hidden
        return seq_output

    def predict(self, item_seq, item):
        item_seq_emb = self.item_embedding(item_seq)  # [batch_size, seq_len, embed_size]
        dilate_outputs = self.residual_blocks(item_seq_emb)
        item_embs = self.item_embedding(item)
        logits = dilate_outputs.matmul(item_embs.transpose(1, 2))
        logits = logits.mean(1)
        return logits


class mp(nn.Module):
    def __init__(self, channel):
        super(mp, self).__init__()
        self.hidden_size = int(channel / 4)
        self.conv1 = nn.Conv1d(channel, self.hidden_size, 1)
        self.conv2 = nn.Conv1d(self.hidden_size, channel, 1)

    def forward(self, x):
        x = x.permute(0, 2, 1)
        x = self.conv1(x)
        x = F.leaky_relu(x)
        x = self.conv2(x)
        x = x.permute(0, 2, 1)
        return x
    

    
class ResidualBlock_a(nn.Module):
    r"""
    Residual block (a) in the paper
    """

    def __init__(self, in_channel, out_channel, kernel_size=3, dilation=None):
        super(ResidualBlock_a, self).__init__()

        half_channel = out_channel // 2
        self.ln1 = nn.LayerNorm(out_channel, eps=1e-8)
        self.conv1 = nn.Conv2d(in_channel, half_channel, kernel_size=(1, 1), padding=0)

        self.ln2 = nn.LayerNorm(half_channel, eps=1e-8)
        self.conv2 = nn.Conv2d(half_channel, half_channel, kernel_size=(1, kernel_size), padding=0, dilation=dilation)

        self.ln3 = nn.LayerNorm(half_channel, eps=1e-8)
        self.conv3 = nn.Conv2d(half_channel, out_channel, kernel_size=(1, 1), padding=0)

        self.dilation = dilation
        self.kernel_size = kernel_size

    def forward(self, x):  # x: [batch_size, seq_len, embed_size]

        out = F.relu(self.ln1(x))
        print(2.1)
        print(out.shape)
        out = out.permute(0, 2, 1).unsqueeze(2)
        print(2.2)
        print(out.shape)
        out = self.conv1(out).squeeze(2).permute(0, 2, 1)
        print(2.3)
        out2 = F.relu(self.ln2(out))
        out2 = self.conv_pad(out2, self.dilation)
        out2 = self.conv2(out2).squeeze(2).permute(0, 2, 1)

        out3 = F.relu(self.ln3(out2))
        out3 = out3.permute(0, 2, 1).unsqueeze(2)
        out3 = self.conv3(out3).squeeze(2).permute(0, 2, 1)
        return out3 + x

    def conv_pad(self, x, dilation):  # x: [batch_size, seq_len, embed_size]
        r""" Dropout-mask: To avoid the future information leakage problem, this paper proposed a masking-based dropout
        trick for the 1D dilated convolution to prevent the network from seeing the future items.
        Also the One-dimensional transformation is completed in this function.
        """
        inputs_pad = x.permute(0, 2, 1)  # [batch_size, embed_size, seq_len]
        inputs_pad = inputs_pad.unsqueeze(2)  # [batch_size, embed_size, 1, seq_len]
        pad = nn.ZeroPad2d(((self.kernel_size - 1) * dilation, 0, 0, 0))
        # padding operation  args：(left,right,top,bottom)
        inputs_pad = pad(inputs_pad)  # [batch_size, embed_size, 1, seq_len+(self.kernel_size-1)*dilations]
        return inputs_pad

    
class ResidualBlock_b_2mp_parallel(nn.Module):
    r"""
    Residual block (b) in the paper
    """

    def __init__(self, in_channel, out_channel, kernel_size=3, dilation=None, is_mp=False):
        super(ResidualBlock_b_2mp_parallel, self).__init__()

        self.conv1 = nn.Conv2d(in_channel, out_channel, kernel_size=(1, kernel_size), padding=0, dilation=dilation)
        self.ln1 = nn.LayerNorm(out_channel, eps=1e-8)
        self.conv2 = nn.Conv2d(out_channel, out_channel, kernel_size=(1, kernel_size), padding=0, dilation=dilation * 2)
        self.ln2 = nn.LayerNorm(out_channel, eps=1e-8)
        
        self.dilation = dilation
        self.kernel_size = kernel_size
        self.is_mp = is_mp
        self.rez = nn.Parameter(torch.FloatTensor([1]))
        if self.is_mp:
            self.mp1 = mp(in_channel)
            self.mp2 = mp(in_channel)

    def forward(self, x):  # x: [batch_size, seq_len, embed_size]
        x_pad = self.conv_pad(x, self.dilation)  # [batch_size, embeding_size, 1, seq_len+(self.kernel_size-1)*dilations]
        out = self.conv1(x_pad).squeeze(2).permute(0, 2, 1)
        # [batch_size, seq_len+(self.kernel_size-1)*dilations-kernel_size+1, embed_size]
        
        if self.is_mp:
            mp_out = self.mp1(x)
            out = mp_out + out
        out = F.relu(self.ln1(out))
        out_pad = self.conv_pad(out, self.dilation * 2)
        out2 = self.conv2(out_pad).squeeze(2).permute(0, 2, 1)
        if self.is_mp:
            mp_out2 = self.mp2(out)
            out2 = mp_out2 + out2
        out2 = F.relu(self.ln2(out2))
        return out2 * self.rez + x

    def conv_pad(self, x, dilation):
        r""" Dropout-mask: To avoid the future information leakage problem, this paper proposed a masking-based dropout
        trick for the 1D dilated convolution to prevent the network from seeing the future items.
        Also the One-dimensional transformation is completed in this function.
        """
        inputs_pad = x.permute(0, 2, 1)
        inputs_pad = inputs_pad.unsqueeze(2)
        pad = nn.ZeroPad2d(((self.kernel_size - 1) * dilation, 0, 0, 0))
        inputs_pad = pad(inputs_pad)
        return inputs_pad

class ResidualBlock_b_2mp_serial(nn.Module):
    r"""
    Residual block (b) in the paper
    """

    def __init__(self, in_channel, out_channel, kernel_size=3, dilation=None, is_mp=False):
        super(ResidualBlock_b_2mp_serial, self).__init__()

        self.conv1 = nn.Conv2d(in_channel, out_channel, kernel_size=(1, kernel_size), padding=0, dilation=dilation)
        self.ln1 = nn.LayerNorm(out_channel, eps=1e-8)
        self.conv2 = nn.Conv2d(out_channel, out_channel, kernel_size=(1, kernel_size), padding=0, dilation=dilation * 2)
        self.ln2 = nn.LayerNorm(out_channel, eps=1e-8)
        # self.mp = mp(in_channel)
        self.dilation = dilation
        self.kernel_size = kernel_size
        self.is_mp = is_mp
        if self.is_mp:
            self.mp1 = mp(in_channel)
            self.mp2 = mp(in_channel)

    def forward(self, x):  # x: [batch_size, seq_len, embed_size]
        x_pad = self.conv_pad(x, self.dilation)  # [batch_size, embed_size, 1, seq_len+(self.kernel_size-1)*dilations]
        out = self.conv1(x_pad).squeeze(2).permute(0, 2, 1)
        # [batch_size, seq_len+(self.kernel_size-1)*dilations-kernel_size+1, embed_size]
        if self.is_mp:
            mp_out = self.mp1(x)
            out = mp_out
        out = F.relu(self.ln1(out))
        out_pad = self.conv_pad(out, self.dilation * 2)
        out2 = self.conv2(out_pad).squeeze(2).permute(0, 2, 1)
        if self.is_mp:
            mp_out2 = self.mp2(out)
            out2 = mp_out2
        out2 = F.relu(self.ln2(out2))
        return out2 + x

    def conv_pad(self, x, dilation):
        r""" Dropout-mask: To avoid the future information leakage problem, this paper proposed a masking-based dropout
        trick for the 1D dilated convolution to prevent the network from seeing the future items.
        Also the One-dimensional transformation is completed in this function.
        """
        inputs_pad = x.permute(0, 2, 1)
        inputs_pad = inputs_pad.unsqueeze(2)
        pad = nn.ZeroPad2d(((self.kernel_size - 1) * dilation, 0, 0, 0))
        inputs_pad = pad(inputs_pad)
        return inputs_pad

class ResidualBlock_b_mp_serial(nn.Module):
    r"""
    Residual block (b) in the paper
    """

    def __init__(self, in_channel, out_channel, kernel_size=3, dilation=None, is_mp=False):
        super(ResidualBlock_b_mp_serial, self).__init__()

        self.conv1 = nn.Conv2d(in_channel, out_channel, kernel_size=(1, kernel_size), padding=0, dilation=dilation)
        self.ln1 = nn.LayerNorm(out_channel, eps=1e-8)
        self.conv2 = nn.Conv2d(out_channel, out_channel, kernel_size=(1, kernel_size), padding=0, dilation=dilation * 2)
        self.ln2 = nn.LayerNorm(out_channel, eps=1e-8)
        self.dilation = dilation
        self.kernel_size = kernel_size
        self.is_mp = is_mp
        if self.is_mp:
            self.mp = mp(in_channel)

    def forward(self, x):  # x: [batch_size, seq_len, embed_size]
        x_pad = self.conv_pad(x, self.dilation)  # [batch_size, embed_size, 1, seq_len+(self.kernel_size-1)*dilations]
        out = self.conv1(x_pad).squeeze(2).permute(0, 2, 1)
        # [batch_size, seq_len+(self.kernel_size-1)*dilations-kernel_size+1, embed_size]
        out = F.relu(self.ln1(out))
        out_pad = self.conv_pad(out, self.dilation * 2)
        out2 = self.conv2(out_pad).squeeze(2).permute(0, 2, 1)
        out2 = F.relu(self.ln2(out2))
        if self.is_mp:
            mp_out2 = self.mp(out)
            out2 = mp_out2
        return out2 + x

    def conv_pad(self, x, dilation):
        r""" Dropout-mask: To avoid the future information leakage problem, this paper proposed a masking-based dropout
        trick for the 1D dilated convolution to prevent the network from seeing the future items.
        Also the One-dimensional transformation is completed in this function.
        """
        inputs_pad = x.permute(0, 2, 1)
        inputs_pad = inputs_pad.unsqueeze(2)
        pad = nn.ZeroPad2d(((self.kernel_size - 1) * dilation, 0, 0, 0))
        inputs_pad = pad(inputs_pad)
        return inputs_pad

In [8]:
def get_model(args, linear_feature_columns=None, dnn_feature_columns=None, history_feature_list=None):
    return Peter4Coldstart(args)


In [9]:
def set_seed(seed, re=True):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # if you are using multi-GPU.
    if re:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    else:
        torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.deterministic = False

# Trainer

In [10]:
def Sequence_full_Validate(epoch, model, dataloader, writer, args, test=False):
    print("+" * 20, "Valid Epoch {}".format(epoch + 1), "+" * 20)
    model.eval()
    avg_metrics = {}
    i = 0
    with torch.no_grad():
        tqdm_dataloader = dataloader
        for data in tqdm_dataloader:
            data = [x.to(args['device']) for x in data]
            seqs, labels = data
            if test:
                scores = model.predict(seqs)
            else:
                scores = model(seqs)
            scores = scores.mean(1)
            metrics = recalls_and_ndcgs_for_ks(scores, labels, args['metric_ks'], args)
            i += 1
            for key, value in metrics.items():
                if key not in avg_metrics:
                    avg_metrics[key] = value
                else:
                    avg_metrics[key] += value
    for key, value in avg_metrics.items():
        avg_metrics[key] = value / i
    print(avg_metrics)
    for k in sorted(args['metric_ks'], reverse=True):
        writer.add_scalar('Train/NDCG@{}'.format(k), avg_metrics['NDCG@%d' % k], epoch)
    return avg_metrics


In [11]:
def recalls_and_ndcgs_for_ks(scores, labels, ks, args):
    metrics = {}

    answer_count = labels.sum(1)
    answer_count_float = answer_count.float()
    labels_float = labels.float()
    rank = (-scores).argsort(dim=0)
    cut = rank
    for k in sorted(ks, reverse=True):
       cut = cut[:, :k]
       hits = torch.gather(labels_float,1,cut)
       metrics['Recall@%d' % k] = (hits.sum(1) / answer_count_float).mean().item()

       position = torch.arange(2, 2+k)
       weights = 1 / torch.log2(position.float()).to(args['device'])
       dcg = (hits * weights).sum(1)
       idcg = torch.Tensor([weights[:min(n, k)].sum() for n in answer_count]).to(args['device'])
       ndcg = (dcg / idcg).mean()
       metrics['NDCG@%d' % k] = ndcg

    return metrics

In [12]:

def SequenceTrainer(epoch, model, dataloader, optimizer, writer, args): #schedular,
    print("+" * 20, "Train Epoch {}".format(epoch + 1), "+" * 20)
    model.train()
    running_loss = 0
    loss_fn = nn.CrossEntropyLoss(ignore_index=0)
    for data in dataloader:
        optimizer.zero_grad()
        data = [x.to(args['device']) for x in data]
        seqs, labels = data
        logits = model(seqs) # B x T x V
        logits = logits.mean(1)
        labels = labels.view(-1)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.detach().cpu().item()
    writer.add_scalar('Train/loss', running_loss / len(dataloader), epoch)
    print("Training CE Loss: {:.5f}".format(running_loss / len(dataloader)))
    return optimizer



In [13]:
def SeqTrain(epochs, model, train_loader, val_loader, writer, args):
    if args['is_pretrain'] == 0:
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                                     lr=args['lr'], weight_decay=args['weight_decay'])
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['weight_decay'])

    model = model.to(args['device'])
    if args['is_parallel']:
        model = torch.nn.parallel.DistributedDataParallel(model,  find_unused_parameters=True,device_ids=[args['local_rank']], output_device=[args['local_rank']])
    best_metric = 0
    all_time = 0
    val_all_time = 0
    for epoch in range(epochs):
        since = time.time()
        optimizer = SequenceTrainer(epoch, model, train_loader, optimizer, writer, args)
        tmp = time.time() - since
        print('one epoch train:', tmp)
        all_time += tmp
        val_since = time.time()
        metrics = Sequence_full_Validate(epoch, model, val_loader, writer, args)
        val_tmp = time.time() - val_since
        print('one epoch val:', val_tmp)
        val_all_time += val_tmp
        if args['is_pretrain'] == 0 and 'acc' in args['task_name']:
            if metrics['NDCG@20'] >= 0.0193:
                break
        i = 1
        current_metric = metrics['NDCG@5']
        if best_metric <= current_metric:
            best_metric = current_metric
            best_model = deepcopy(model)
            state_dict = model.state_dict()
            if 'life' in args['task_name']:
                torch.save(state_dict, os.path.join(args['save_path'],
                                                         '{}_{}_seed{}_task_{}_best_model.pth'.format('sequence',
                                                                                                      args['model_name'],
                                                                                                      args['seed'],
                                                                                                      args['task'])))
            else:
                torch.save(state_dict, os.path.join(args['save_path'], '{}_{}_seed{}_is_pretrain_{}_best_model_lr{}_wd{}_block{}_hd{}_emb{}.pth'.format(args['task_name'], args['model_name'], args['seed'], args['is_pretrain'],
                                                                                                                              args['lr'], args['weight_decay'], args['block_num'], args['hidden_size'], args['embedding_size'])))
        else:
            i += 1
            if i == 10:
                print('early stop!')
                break
    print('train_time:', all_time)
    print('val_time:', val_all_time)
    return best_model

# Model Trials

## Trial 1

- We will train this trial with (learning rate = 0.001, epochs = 10 and batch_size = 150).
- We will train the model with a sample of sbr_data_1M (1000000 samples), as it crashes a larger number of samples.

In [14]:
checkpoint_dir = "./checkpoint_trial1"
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
    
args = {
    'seed': 0,
    'task_name': '',
    'task_num': 4,
    'dataset_path': '',
    'pretrain_path': '',
    'source_path': '',
    'target_path': '',
    'train_batch_size': 150,
    'val_batch_size': 150,
    'test_batch_size': 150,
    'sample': 'random',
    'negsample_savefolder': './data/neg_data/',
    'negsample_size': 99,
    'max_len': 20,
    'item_min': 10,
    'save_path': checkpoint_dir,
    'task': -1,
    'valid_rate': 100,
    'model_name': 'Peter4Coldstart',
    'epochs': 10,
    're_epochs': 20,
    'lr': 0.001,
    'device': 'cuda',
    'is_parallel': False,
    'local_rank': None,
    'num_gpu': 1,
    'weight_decay': 0.0,
    'decay_step': 5,
    'gamma': 0.5,
    'num_users': 1,
    'num_items': 1,
    'num_embedding': 1,
    'num_labels': 1,
    'k': 20,
    'metric_ks': [5, 20],
    'best_metric': 'NDCG@10',
    'hidden_size': 128,
    'block_num': 2,
    'num_groups': 4,
    'num_heads': 4,
    'dropout': 0.3,
    'bert_mask_prob': 0.3,
    'factor_num': 128,
    'embedding_size': 128,
    'dilations': [1, 4],
    'kernel_size': 3,
    'is_mp': False,
    'pad_token': 0,
    'temp': 7,
    'l2_emb': 0.0,
    'mtl_task_num': 1,
    'test_method': 'ufo',
    'val_method': 'ufo',
    'test_size': 0.1,
    'val_size': 0.1111,
    'cand_num': 100,
    'sample_method': 'high-pop',
    'sample_ratio': 0.3,
    'num_ng': 4,
    'loss_type': 'BPR',
    'init_method': 'default',
    'optimizer': 'default',
    'early_stop': True,
    'reg_1': 0.0,
    'reg_2': 0.0,
    'context_window': 2,
    'rho': 0.5,
    'node_dropout': 0.1,
    'mess_dropout': 0.1,
    'hidden_size_list': [128, 128],
    'latent_dim': 128,
    'anneal_cap': 0.2,
    'total_anneal_steps': 1000,
    'kd': False,
    'alpha': 0.4,
    'add_num_times': 2,
    'is_pretrain': 1,
    'user_profile': 'gender',
    'prun_rate': 0,
    'll_max_itemnum': 0,
    'lifelong_eval': True,
    'task1_out': 0,
    'task2_out': 0,
    'task3_out': 0,
    'task4_out': 0,
    'eval': True,
    'ch':True,
}
if args['is_parallel']:
    torch.distributed.init_process_group(backend="nccl")
    torch.cuda.set_device(args['local_rank'])
device = torch.device(args['device'])
set_seed(args['seed'])
writer = SummaryWriter()
print('=============cold_start=============')
args['source_path'] = '/kaggle/input/tenrecdatasets/sbr_data_1M_sampled.csv'
args['target_path'] = '/kaggle/input/tenrecdata/cold_data_0.7.csv'
train_loader, val_loader, test_loader = get_data(args) #, user_noclick

+++user_history+++


100%|██████████| 38/38 [00:00<00:00, 51265.21it/s]


In [15]:
print("pretrain")
model = get_model(args)
SeqTrain(args['epochs'], model, train_loader, val_loader, writer, args) #, user_noclick
writer.close()

if args['eval']:
    model = get_model(args)
    best_weight = torch.load(os.path.join(args['save_path'],
                                              '{}_{}_seed{}_is_pretrain_{}_best_model_lr{}_wd{}_block{}_hd{}_emb{}.pth'.format(
                                                  args['task_name'], args['model_name'], args['seed'], 
                                                  args['is_pretrain'], args['lr'], args['weight_decay'], 
                                                  args['block_num'], args['hidden_size'], args['embedding_size'])))
    
    model.load_state_dict(best_weight)
    model = model.to(args['device'])
    metrics = Sequence_full_Validate(0, model, test_loader, writer, args)


pretrain
++++++++++++++++++++ Train Epoch 1 ++++++++++++++++++++
Training CE Loss: 7.18329
one epoch train: 5.381599187850952
++++++++++++++++++++ Valid Epoch 1 ++++++++++++++++++++
{'Recall@20': 0.014084506779909134, 'NDCG@20': tensor(0.0035, device='cuda:0'), 'Recall@5': 0.0, 'NDCG@5': tensor(0., device='cuda:0')}
one epoch val: 0.09839940071105957
++++++++++++++++++++ Train Epoch 2 ++++++++++++++++++++
Training CE Loss: 6.61068
one epoch train: 0.10537266731262207
++++++++++++++++++++ Valid Epoch 2 ++++++++++++++++++++
{'Recall@20': 0.0, 'NDCG@20': tensor(0., device='cuda:0'), 'Recall@5': 0.0, 'NDCG@5': tensor(0., device='cuda:0')}
one epoch val: 0.021085023880004883
++++++++++++++++++++ Train Epoch 3 ++++++++++++++++++++
Training CE Loss: 6.38680
one epoch train: 0.10496187210083008
++++++++++++++++++++ Valid Epoch 3 ++++++++++++++++++++
{'Recall@20': 0.014084506779909134, 'NDCG@20': tensor(0.0036, device='cuda:0'), 'Recall@5': 0.0, 'NDCG@5': tensor(0., device='cuda:0')}
one epoch 

## Results
### we can summarize the evaluation results on Testing as:

#### Recall@20 : 0.014084506779909134

#### NDCG@20 : 0.0061

#### Recall@5 : 0.014084506779909134

#### NDCG@5 : 0.0061

## Trial 2

- We will train this trial with (learning rate = 0.01, epochs = 5 and batch_size = 50).
- We will train the model with a sample of sbr_data_1M (1000000 samples), as it crashes a larger number of samples.

In [16]:
checkpoint_dir = "./checkpoint2"
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
args = {
    'seed': 0,
    'task_name': '',
    'task_num': 4,
    'dataset_path': '',
    'pretrain_path': '',
    'source_path': '',
    'target_path': '',
    'train_batch_size': 50,
    'val_batch_size': 50,
    'test_batch_size': 50,
    'sample': 'random',
    'negsample_savefolder': './data/neg_data/',
    'negsample_size': 99,
    'max_len': 20,
    'item_min': 10,
    'save_path': checkpoint_dir,
    'task': -1,
    'valid_rate': 100,
    'model_name': 'Peter4Coldstart',
    'epochs': 5,
    're_epochs': 20,
    'lr': 0.01,
    'device': 'cuda',
    'is_parallel': False,
    'local_rank': None,
    'num_gpu': 1,
    'weight_decay': 0.0,
    'decay_step': 5,
    'gamma': 0.5,
    'num_users': 1,
    'num_items': 1,
    'num_embedding': 1,
    'num_labels': 1,
    'k': 20,
    'metric_ks': [5, 20],
    'best_metric': 'NDCG@10',
    'hidden_size': 128,
    'block_num': 2,
    'num_groups': 4,
    'num_heads': 4,
    'dropout': 0.3,
    'bert_mask_prob': 0.3,
    'factor_num': 128,
    'embedding_size': 128,
    'dilations': [1, 4],
    'kernel_size': 3,
    'is_mp': False,
    'pad_token': 0,
    'temp': 7,
    'l2_emb': 0.0,
    'mtl_task_num': 1,
    'test_method': 'ufo',
    'val_method': 'ufo',
    'test_size': 0.1,
    'val_size': 0.1111,
    'cand_num': 100,
    'sample_method': 'high-pop',
    'sample_ratio': 0.3,
    'num_ng': 4,
    'loss_type': 'BPR',
    'init_method': 'default',
    'optimizer': 'default',
    'early_stop': True,
    'reg_1': 0.0,
    'reg_2': 0.0,
    'context_window': 2,
    'rho': 0.5,
    'node_dropout': 0.1,
    'mess_dropout': 0.1,
    'hidden_size_list': [128, 128],
    'latent_dim': 128,
    'anneal_cap': 0.2,
    'total_anneal_steps': 1000,
    'kd': False,
    'alpha': 0.4,
    'add_num_times': 2,
    'is_pretrain': 1,
    'user_profile': 'gender',
    'prun_rate': 0,
    'll_max_itemnum': 0,
    'lifelong_eval': True,
    'task1_out': 0,
    'task2_out': 0,
    'task3_out': 0,
    'task4_out': 0,
    'eval': True,
    'ch':True,
}
if args['is_parallel']:
    torch.distributed.init_process_group(backend="nccl")
    torch.cuda.set_device(args['local_rank'])
device = torch.device(args['device'])
set_seed(args['seed'])
writer = SummaryWriter()
print('=============cold_start=============')
args['source_path'] = '/kaggle/input/tenrecdatasets/sbr_data_1M_sampled.csv'
args['target_path'] = '/kaggle/input/tenrecdata/cold_data_0.7.csv'
train_loader, val_loader, test_loader = get_data(args) #, user_noclick

+++user_history+++


100%|██████████| 38/38 [00:00<00:00, 39697.02it/s]


In [17]:
print("pretrain")
model = get_model(args)
SeqTrain(args['epochs'], model, train_loader, val_loader, writer, args) #, user_noclick
writer.close()

if args['eval']:
    model = get_model(args)
    best_weight = torch.load(os.path.join(args['save_path'],
                                              '{}_{}_seed{}_is_pretrain_{}_best_model_lr{}_wd{}_block{}_hd{}_emb{}.pth'.format(
                                                  args['task_name'], args['model_name'], args['seed'], 
                                                  args['is_pretrain'], args['lr'], args['weight_decay'], 
                                                  args['block_num'], args['hidden_size'], args['embedding_size'])))
    
    model.load_state_dict(best_weight)
    model = model.to(args['device'])
    metrics = Sequence_full_Validate(0, model, test_loader, writer, args)


pretrain
++++++++++++++++++++ Train Epoch 1 ++++++++++++++++++++
Training CE Loss: 10.14783
one epoch train: 0.16315174102783203
++++++++++++++++++++ Valid Epoch 1 ++++++++++++++++++++
{'Recall@20': 0.029999999329447746, 'NDCG@20': tensor(0.0153, device='cuda:0'), 'Recall@5': 0.009999999776482582, 'NDCG@5': tensor(0.0100, device='cuda:0')}
one epoch val: 0.03486299514770508
++++++++++++++++++++ Train Epoch 2 ++++++++++++++++++++
Training CE Loss: 6.86433
one epoch train: 0.15017223358154297
++++++++++++++++++++ Valid Epoch 2 ++++++++++++++++++++
{'Recall@20': 0.009999999776482582, 'NDCG@20': tensor(0.0024, device='cuda:0'), 'Recall@5': 0.0, 'NDCG@5': tensor(0., device='cuda:0')}
one epoch val: 0.02913689613342285
++++++++++++++++++++ Train Epoch 3 ++++++++++++++++++++
Training CE Loss: 6.26988
one epoch train: 0.17031502723693848
++++++++++++++++++++ Valid Epoch 3 ++++++++++++++++++++
{'Recall@20': 0.0, 'NDCG@20': tensor(0., device='cuda:0'), 'Recall@5': 0.0, 'NDCG@5': tensor(0., devic

## Results
### we can summarize the evaluation results on Testing as:

#### Recall@20 : 0.009999999776482582

#### NDCG@20 : 0.0036

#### Recall@5 : 0.0

#### NDCG@5 : 0.
- As we can notice the performance of privous tunning is better than this one.