In [509]:
import pandas as pd
import os
TRAIN_SUFFIX = '.train.csv'
VALIDATION_SUFFIX = '.validation.csv'
TEST_SUFFIX = '.test.csv'  # 测试集文件后缀

INFO_SUFFIX = '.info.json'  # 数据集统计信息文件后缀
USER_SUFFIX = '.user.csv'  # 数据集用户特征文件后缀
ITEM_SUFFIX = '.item.csv'  # 数据集物品特征文件后缀

TRAIN_POS_SUFFIX = '.train_pos.csv'  # 训练集用户正向交互按uid合并之后的文件后缀
VALIDATION_POS_SUFFIX = '.validation_pos.csv'  # 验证集用户正向交互按uid合并之后的文件后缀
TEST_POS_SUFFIX = '.test_pos.csv'  # 测试集用户正向交互按uid合并之后的文件后缀

TRAIN_NEG_SUFFIX = '.train_neg.csv'  # 训练集用户负向交互按uid合并之后的文件后缀
VALIDATION_NEG_SUFFIX = '.validation_neg.csv'  # 验证集用户负向交互按uid合并之后的文件后缀
TEST_NEG_SUFFIX = '.test_neg.csv'  # 测试集用户负向交互按uid合并之后的文件后缀

class DataLoader(object):
    
    def __init__(self, path, dataset, label, sep='\t'):
        self.dataset = dataset
        self.path = os.path.join(path, dataset)
        
        self.train_file = os.path.join(self.path, dataset + TRAIN_SUFFIX)
        self.validation_file = os.path.join(self.path, dataset + VALIDATION_SUFFIX)
        self.test_file = os.path.join(self.path, dataset + TEST_SUFFIX)
        
        self.info_file = os.path.join(self.path, dataset + INFO_SUFFIX)
        self.user_file = os.path.join(self.path, dataset + USER_SUFFIX)
        self.item_file = os.path.join(self.path, dataset + ITEM_SUFFIX)
        
        self.train_pos_file = os.path.join(self.path, dataset + TRAIN_POS_SUFFIX)
        self.validation_pos_file = os.path.join(self.path, dataset + VALIDATION_POS_SUFFIX)
        self.test_pos_file = os.path.join(self.path, dataset + TEST_POS_SUFFIX)
        
        self.train_neg_file = os.path.join(self.path, dataset + TRAIN_NEG_SUFFIX)
        self.validation_neg_file = os.path.join(self.path, dataset + VALIDATION_NEG_SUFFIX)
        self.test_neg_file = os.path.join(self.path, dataset + TEST_NEG_SUFFIX)
        
        self.label = label
        
        self.train_df, self.validation_df, self.test_df = None, None, None
        self.load_user_item()
        self.load_data()
        self.load_his()
        self.load_info()
        #self.save_info()
    
    def load_user_item(self):
        self.user_df, self.item_df = None, None
        if os.path.exists(self.user_file):
            self.user_df = pd.read_csv(self.user_file, sep='\t')
        if os.path.exists(self.item_file):
            self.item_df = pd.read_csv(self.item_file, sep='\t')
        
    def load_data(self):
        self.train_df = pd.read_csv(self.train_file, sep='\t')
        self.validation_df = pd.read_csv(self.validation_file, sep='\t')
        self.test_df = pd.read_csv(self.test_file, sep='\t')
        
    def load_his(self):
        # 把 df [uid, iids] 变成 dict {1: [iid, iid, ...] 2: [iid, iid, ...]}
        def build_his(df):
            uids = df['uid'].tolist()
            
            iids = df['iids'].astype(str).str.split(',').values
            
            iids = [[int(j) for j in i] for i in iids]
            user_his = dict(zip(uids, iids))
            return user_his
        # 把 df [uid, iids] 变成 dict {1: [iid, iid, ...] 2: [iid, iid, ...]}
        self.train_pos_df = pd.read_csv(self.train_pos_file, sep='\t')
        self.train_user_pos = build_his(self.train_pos_df)
        
        self.validation_pos_df = pd.read_csv(self.validation_pos_file, sep='\t')
        self.validation_user_pos = build_his(self.validation_pos_df)
        
        self.test_pos_df = pd.read_csv(self.test_pos_file, sep='\t')
        self.test_user_pos = build_his(self.test_pos_df)
        
        self.train_neg_df = pd.read_csv(self.train_neg_file, sep='\t')
        self.train_user_neg = build_his(self.train_neg_df)
        
        self.validation_neg_df = pd.read_csv(self.validation_neg_file, sep='\t')
        self.validation_user_neg = build_his(self.validation_neg_df)
        
        self.test_neg_df = pd.read_csv(self.test_neg_file, sep='\t')
        self.test_user_neg = build_his(self.test_neg_df)
        
    def append_his(self, max_his=10):
        # 包含了 train, validation, test 中的所有的 uid 和 其对应的 iids
        # 正样本是 iid, 负样本是 -iid
        his_dict = {}
        
        for df in [self.train_df, self.validation_df, self.test_df]:
            
            history = [] # 最后加入到 df 中
            
            uids, iids, labels = df['uid'].tolist(), df['iid'].tolist(), df['label'].tolist()
            
            for i, uid in enumerate(uids):
                iid, label = iids[i], labels[i]
                
                if uid not in his_dict:
                    his_dict[uid] = []
                
                tmp_his = his_dict[uid] if max_his <= 0 else his_dict[uid][-max_his:]
#                 print(tmp_his)
                # 去除 [] 第一个元素是 ‘’， history中的元素是 str 类型
                history.append(str(tmp_his).replace(' ', '')[1:-1])
                
                if label <= 0:
                    his_dict[uid].append(-iid)
                else:
                    his_dict[uid].append(iid)
            df['history'] = history
    def load_info(self):
        max_dict, min_dict = {}, {}
        for df in [self.train_df, self.validation_df, self.test_df]:
            for c in df.columns:
                if c not in max_dict:
                    max_dict[c] = df[c].max()
                else:
                    max_dict[c] = max(df[c].max(), max_dict[c])
                
                if c not in min_dict:
                    min_dict[c] = df[c].min()
                else:
                    min_dict[c] = min(df[c].min(), min_dict[c])
                
        self.column_max = max_dict
        self.column_min = min_dict
        
        self.user_num, self.item_num = 0, 0
        if 'uid' in self.column_max:
            self.user_num = self.column_max['uid'] + 1
        if 'iid' in self.column_max:
            self.item_num = self.column_max['iid'] + 1
        
                    
    def drop_neg(self):
        self.train_df = self.train_df[self.train_df['label'] > 0].reset_index(drop=True)
        self.validation_df = self.validation_df[self.validation_df['label'] > 0].reset_index(drop=True)
        self.test_df = self.test_df[self.test_df['label'] > 0].reset_index(drop=True)

In [510]:
dataloader = DataLoader('./dataset', 'ml100k01-1-5', 'label')
# dataloader.append_his()
# dataloader.drop_neg()

In [511]:
dataloader.train_df['label'].value_counts()

1    53514
0    41266
Name: label, dtype: int64

In [361]:
class DataProcessor(object):
    data_columns = ['uid', 'iid', 'x']
    info_columns = ['sample_id', 'time']
    
    def __init__(self, data_loader):
        self.data_loader = data_loader
#         self.train_sample_n = train_sample_n
#         self.test_sample_n = test_sample_n
        self.train_data = None
        
        self.train_history_pos = defaultdict(set)
        for uid in data_loader.train_user_pos.keys():
            self.train_history_pos[uid] = set(data_loader.train_user_pos[uid])
            
        self.validation_history_pos = defaultdict(set)
        for uid in data_loader.validation_user_pos.keys():
            self.validation_history_pos[uid] = set(data_loader.validation_user_pos[uid])

        self.test_history_pos = defaultdict(set)
        for uid in data_loader.test_user_pos.keys():
            self.test_history_pos[uid] = set(data_loader.test_user_pos[uid])

        self.train_history_neg = defaultdict(set)
        for uid in data_loader.train_user_neg.keys():
            self.train_history_neg[uid] = set(data_loader.train_user_neg[uid])

        self.validation_history_neg = defaultdict(set)
        for uid in data_loader.validation_user_neg.keys():
            self.validation_history_neg[uid] = set(data_loader.validation_user_neg[uid])

        self.test_history_neg = defaultdict(set)
        for uid in data_loader.test_user_neg.keys():
            self.test_history_neg[uid] = set(data_loader.test_user_neg[uid])
        
    def get_train_data(self, epoch):
        if self.train_data is None:
            self.generate_x_samples(self.data_loader.train_df)
#             self.train_data = self.format_data_dict(self.data_loader.train_df)
#             self.train_data['sample_id'] = np.arange(0, len(self.train_data['y']))
        
        if epoch >= 0:
            np.random.seed(10) # 保证所有的column shuffle的顺序一样
            rng_state = np.random.get_state()
            for d in self.train_data:
                np.random.set_state(rng_state)
                np.random.shuffle(self.train_data[d])
                
#         return self.train_data
    
    # 每个 uid iid 产生 5 个 正样本 和 5 个负样本
    def generate_x_samples(self, df, x_number=5, stage='train'):
        x_samples_list = []
        
        if stage == 'train':
            user_pos = self.train_history_pos
            user_neg = self.train_history_neg
        elif stage == 'validation':
            user_pos = self.validation_history_pos
            user_neg = self.validation_history_neg
        elif stage == 'test':
            user_pos = self.test_history_pos
            user_neg = self.test_history_neg
            
        for i, uid in enumerate(df['uid'].values):

            iid = df['iid'].values[i]
            
            # get x_number positive samples
            pos_iids = user_pos[uid]
            if iid in pos_iids:
                pos_iids.remove(iid)
                if x_number < len(pos_iids):
                    pos_samples = np.random.choice(list(pos_iids), x_number, replace=False)
                else:
                    pos_samples = np.random.choice(list(pos_iids), x_number, replace=True)
                pos_iids.add(iid)
            else:
                if x_number < len(pos_iids):
                    pos_samples = np.random.choice(list(pos_iids), x_number, replace=False)
                else:
                    pos_samples = np.random.choice(list(pos_iids), x_number, replace=True)
                pos_iids.add(iid)
            
            # get x_number negative samples
            neg_iids = user_neg[uid]
            if iid in neg_iids:
                neg_iids.remove(iid)
                if x_number < len(neg_iids):
                    neg_samples = np.random.choice(list(neg_iids), x_number, replace=False)
                else:
                    neg_samples = np.random.choice(list(neg_iids), x_number, replace=True)
                neg_iids.add(iid)
            else:
                if x_number < len(neg_iids):
                    neg_samples = np.random.choice(list(neg_iids), x_number, replace=False)
                else:
                    neg_samples = np.random.choice(list(neg_iids), x_number, replace=True)
                neg_iids.add(iid)

            x_samples = np.concatenate([pos_samples, neg_samples]).tolist()
            x_samples_list.append(x_samples)

        df['x'] = x_samples_list
        
        return df
    
    def format_data_dict(self, df):
        
#         df = df[df['history'].apply(lambda x: len(x) > 0)]
        
        data_loader = self.data_loader
        data = {}
        
        if 'uid' in df:
            data['uid'] = df['uid'].values
        if 'iid' in df:
            data['iid'] = df['iid'].values
        if 'time' in df:
            data['time'] = df['time'].values
        if 'label' in df:
            data['y'] = np.array(df['label'], dtype=np.float32)
        

            
    
    # data 是 format_data_dict 处理后的字典
    def prepare_batches(self, data, batch_size, train):
        num_example = len(data['y'])
        total_batch = int((num_example + batch_size - 1)/batch_size)
        neg_data = None
        if train:
            neg_data = self.generate_neg_data(data, 
                                              self.data_loader.train_df, 
                                              sample_n=self.train_sample_n,
                                              train=True)
        batches = []
        
        for batch in tqdm(range(total_batch), leave=False, ncols=100, mininterval=1, desc='Prepare Batches'):
            batches.append(self.get_feed_dict(data=data, batch_start=batch * batch_size, batch_size= batch_size,
                                             train=train, neg_data=neg_data))
            
        return batches
    
    def get_feed_dict(self, data, batch_start, batch_size, train, neg_data=None):
        
        total_data_num = len(data['sample_id'])
        batch_end = min(len(data['uid']), batch_start + batch_size)
        real_batch_size = batch_end - batch_start
        total_batch_size = real_batch_size * (self.train_sample_n + 1)
        
        feed_dict = {'train': train, 'real_batch_size': real_batch_size, 'total_batch_size': total_batch_size}
        
        feed_dict['y'] = self.numpy_to_torch(data['y'][batch_start:batch_start + real_batch_size])
        for c in ['uid', 'iid', 'x', 'sample_id', 'time', 'history']:
            d = data[c][batch_start: batch_start + real_batch_size]
            if train:
                neg_d = np.concatenate([neg_data[c][total_data_num * i + batch_start: total_data_num * i + batch_start + real_batch_size]
                                       for i in range(self.train_sample_n)])
        
                d = np.concatenate([d, neg_d])
            feed_dict[c] = d
        
        for c in ['uid', 'iid', 'x', 'history']:
            feed_dict[c] = self.numpy_to_torch(feed_dict[c])
        
        return feed_dict
        
    def numpy_to_torch(self, d, gpu=False, requires_grad=True):
        t = torch.from_numpy(d)
        if d.dtype is np.float:
            t.requires_grad = requires_grad
        if gpu and torch.cuda.device_count() > 0:
            t = t.cuda()
        return t
        
        # 确保每个feature值都对应一个独特的 value
#         base = 0
#         for feature in ui_id.columns:
#             ui_id[feature] = ui_id[feature].apply(lambda x: x + base)
#             base += int(data_loader.column_max[feature] + 1)
        
#         data['x'] = ui_id.values.astype(int)
        
        # 把字符串转化成 list
#         data['history'] = df['history'].apply(lambda x: eval('[' + x + ']'))
        return data
    
#     def generate_neg_data(self, data, feature_df, sample_n, train):
#         inter_df = pd.DataFrame()
#         for c in ['uid', 'iid', 'y', 'time']:
#             if c in data:
#                 inter_df[c] = data[c]
        
#         neg_df = self.generate_neg_df(inter_df=inter_df,
#                                      feature_df=feature_df,
#                                      sample_n=sample_n, train=train)
        
#         neg_data = self.format_data_dict(neg_df)
        
# #         neg_data['sample_id'] = np.arange(0, len(neg_data['y'])) + len(data['sample_id'])
        
#         return neg_data
                                          
    
#     def generate_neg_df(self, inter_df, feature_df, sample_n, train):
#         other_columns = [c for c in inter_df.columns if c not in ['uid', 'y']]
        
#         neg_df = self._sample_neg_from_uid_list(uids=inter_df['uid'].tolist(),
#                                                labels=inter_df['y'].tolist(),
#                                                sample_n=sample_n,
#                                                train=train,
#                                                 # other_infos : {'iid', [], 'time': []}
#                                                other_infos=inter_df[other_columns].to_dict('list'))
#         # neg_df 和 train_df 具有相同的 history
#         neg_df = pd.merge(neg_df, feature_df, on=['uid'] + other_columns, how='left')
        
#         neg_df = neg_df.drop(columns=['iid'])
#         neg_df = neg_df.rename(columns={'iid_neg': 'iid'})
        
#         neg_df = neg_df[feature_df.columns]
#         neg_df['label'] = 0
#         return neg_df
    
#     def _sample_neg_from_uid_list(self, uids, labels, sample_n, train, other_infos):
        
#         # 负样本集合，一个uid对应一个 unknown_iid_list
#         iid_list = []
        
#         other_info_list = {}
#         for info in other_infos:
#             other_info_list[info] = []
        
#         item_num = self.data_loader.item_num
#         for index, uid in enumerate(uids):
#             if labels[index] > 0:
#                 train_history = self.train_history_pos
#                 validation_hisotry, test_history = self.validation_history_pos, self.test_history_pos
#                 known_train = self.train_history_neg
#             else:
#                 assert train #?
#                 train_history = self.train_history_neg
#                 validation_hisotry, test_history = self.validation_history_neg, self.test_history_neg
#                 known_train = self.train_history_pos
            
#             if train:
#                 inter_iids = train_history[uid]
#             else:
#                 inter_iids = train_history[uid] | validation_hisotry[uid] | test_history[uid]
            
#             remain_iids_num = item_num - len(inter_iids)
            
#             sampled = set()
#             unknown_iid_list = []
#             for i in range(sample_n):
#                 iid = np.random.randint(1, self, data_loader.item_num)
#                 while iid in inter_iids or iid in sampled:
#                     iid = np.random.randint(1, self.data_loader.item_num)
#                 unknown_iid_list.append(iid)
#                 sampled.add(iid)
            
#             iid_list.append(unknown_iid_list)
            
#         all_uid_list, all_iid_list = [], []
#         for i in range(sample_n):
#             for index, uid in enumerate(uids):
#                 all_uid_list.append(uid)
#                 all_iid_list.append(iid_list[index][i])
            
#             for info in other_infos:
#                 other_info_list[info].append(other_infos[info][index])
       
#         neg_df = pd.DataFrame(data=list(zip(all_uid_list, all_iid_list)), columns=['uid', 'iid_neg'])
#         for info in other_infos:
#             neg_df[info] = other_info_list[info]
#         return neg_df
            
                
        

In [374]:
class DataProcessor(object):
    data_columns = ['uid', 'iid', 'x']
    info_columns = ['sample_id', 'time']
    
    def __init__(self, data_loader):
        self.data_loader = data_loader
#         self.train_sample_n = train_sample_n
#         self.test_sample_n = test_sample_n
        self.train_data = None
        
        self.train_history_pos = defaultdict(set)
        for uid in data_loader.train_user_pos.keys():
            self.train_history_pos[uid] = set(data_loader.train_user_pos[uid])
            
        self.validation_history_pos = defaultdict(set)
        for uid in data_loader.validation_user_pos.keys():
            self.validation_history_pos[uid] = set(data_loader.validation_user_pos[uid])

        self.test_history_pos = defaultdict(set)
        for uid in data_loader.test_user_pos.keys():
            self.test_history_pos[uid] = set(data_loader.test_user_pos[uid])

        self.train_history_neg = defaultdict(set)
        for uid in data_loader.train_user_neg.keys():
            self.train_history_neg[uid] = set(data_loader.train_user_neg[uid])

        self.validation_history_neg = defaultdict(set)
        for uid in data_loader.validation_user_neg.keys():
            self.validation_history_neg[uid] = set(data_loader.validation_user_neg[uid])

        self.test_history_neg = defaultdict(set)
        for uid in data_loader.test_user_neg.keys():
            self.test_history_neg[uid] = set(data_loader.test_user_neg[uid])
        
    def get_train_data(self, epoch):
        if self.train_data is None:
            self.generate_x_samples(self.data_loader.train_df)
#             self.train_data = self.format_data_dict(self.data_loader.train_df)
#             self.train_data['sample_id'] = np.arange(0, len(self.train_data['y']))
        
        if epoch >= 0:
            np.random.seed(10) # 保证所有的column shuffle的顺序一样
            rng_state = np.random.get_state()
            for d in self.train_data:
                np.random.set_state(rng_state)
                np.random.shuffle(self.train_data[d])
                
#         return self.train_data
    
    # 每个 uid iid 产生 5 个 正样本 和 5 个负样本
    def generate_x_samples(self, df, x_number=5, stage='train'):
        x_samples_list = []
        
        if stage == 'train':
            user_pos = self.train_history_pos
            user_neg = self.train_history_neg
        elif stage == 'validation':
            user_pos = self.validation_history_pos
            user_neg = self.validation_history_neg
        elif stage == 'test':
            user_pos = self.test_history_pos
            user_neg = self.test_history_neg
            
        for i, uid in enumerate(df['uid'].values):

            iid = df['iid'].values[i]
            
            # get x_number positive samples
            pos_iids = user_pos[uid]
            if iid in pos_iids:
                pos_iids.remove(iid)
                if x_number < len(pos_iids):
                    pos_samples = np.random.choice(list(pos_iids), x_number, replace=False)
                else:
                    pos_samples = np.random.choice(list(pos_iids), x_number, replace=True)
                pos_iids.add(iid)
            else:
                if x_number < len(pos_iids):
                    pos_samples = np.random.choice(list(pos_iids), x_number, replace=False)
                else:
                    pos_samples = np.random.choice(list(pos_iids), x_number, replace=True)
            
            # get x_number negative samples (true negative and non-seen movies)
            neg_iids = user_neg[uid]
            if iid in neg_iids:
                neg_iids.remove(iid)
                if x_number < len(neg_iids):
                    neg_samples = np.random.choice(list(neg_iids), x_number, replace=False)
                else:
                    
                neg_iids.add(iid)
            else:
                if x_number < len(neg_iids):
                    neg_samples = np.random.choice(list(neg_iids), x_number, replace=False)
                else:
                    print(uid)
                    neg_samples = np.random.choice(list(neg_iids), x_number, replace=True)

            x_samples = np.concatenate([pos_samples, neg_samples]).tolist()
            x_samples_list.append(x_samples)

        df['x'] = x_samples_list
        
        return df
    
    def format_data_dict(self, df):
        
#         df = df[df['history'].apply(lambda x: len(x) > 0)]
        
        data_loader = self.data_loader
        data = {}
        
        if 'uid' in df:
            data['uid'] = df['uid'].values
        if 'iid' in df:
            data['iid'] = df['iid'].values
        if 'time' in df:
            data['time'] = df['time'].values
        if 'label' in df:
            data['y'] = np.array(df['label'], dtype=np.float32)
    
    def prepare_batches():
        

In [375]:
dataprocessor = DataProcessor(dataloader)

In [376]:
dataprocessor.get_train_data(-1)
# dataprocessor.prepare_batches(train_data, 128, 1)

941
941
941
941
941
941
941
941
941
941
941
941
941
941
941
941
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
118
477


ValueError: 'a' cannot be empty unless no samples are taken

In [378]:
train_history_pos = defaultdict(set)
for uid in dataloader.train_user_neg.keys():
    train_history_pos[uid] = set(dataloader.train_user_neg[uid])

In [379]:
train_history_pos[477]

set()

In [381]:
dataloader.train_df[dataloader.train_df['uid'] == 477]

Unnamed: 0,uid,iid,label,time
7863,477,237,1,875940451
7864,477,294,1,875940693
7865,477,25,1,875940755
7866,477,756,1,875940755
7867,477,369,1,875940836
7868,477,280,1,875941022
7869,477,88,1,875941085
7870,477,724,1,875941086
7871,477,732,1,875941111
7872,477,794,1,875941111


In [382]:
dataloader.validation_df[dataloader.validation_df['uid'] == 477]

Unnamed: 0,uid,iid,label,time
113,477,546,1,875941972


In [384]:
dataloader.test_df[dataloader.test_df['uid'] == 477]

Unnamed: 0,uid,iid,label,time
114,477,846,1,875942042


In [174]:
import numpy as np
df = neg_df
neg_data = {}
# ['uid', 'iid']
out_columns = []

if 'uid' in df:
    out_columns.append('uid')
    neg_data['uid'] = df['uid'].values
if 'iid' in df:
    out_columns.append('iid')
    neg_data['iid'] = df['iid'].values
if 'time' in df:
    neg_data['time'] = df['time'].values

if dataloader.label in df.columns:
    neg_data['y'] = np.array(df[dataloader.label], dtype=np.float32)
else:
    neg_data['y'] = np.zeros(len(df), dtype=np.float32)

ui_id = df[out_columns]

base = 0
for feature in ui_id.columns:
    ui_id[feature] = ui_id[feature].apply(lambda x: x + base)
    base += int(dataloader.column_max[feature] + 1)

neg_data['x'] = ui_id.values.astype(int) # ui_id.values 把 dataframe 转换成了 list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [117]:
data

{'uid': array([259, 259, 259, ..., 729, 729, 729]),
 'iid': array([255, 286, 298, ..., 300, 333, 689]),
 'time': array([874724710, 874724727, 874724754, ..., 893286638, 893286638,
        893286638]),
 'y': array([1., 1., 1., ..., 1., 1., 1.], dtype=float32),
 'x': array([[ 259, 1199],
        [ 259, 1230],
        [ 259, 1242],
        ...,
        [ 729, 1244],
        [ 729, 1277],
        [ 729, 1633]])}

In [118]:
inter_df = pd.DataFrame()
for c in ['uid', 'iid', 'y', 'time']:
    if c in data:
        inter_df[c] = data[c]
    else:
        assert c == 'time'

# neg_df = self.generate_neg_df(inter_df=inter_df,
#                              feature_df=feature_df,
#                              sample_n=sample_n, train=train)

In [130]:
other_columns = [c for c in inter_df.columns if c not in ['uid', 'y']]
other_infos = inter_df[other_columns].to_dict('list')

In [131]:
other_info.keys()

dict_keys(['iid', 'time'])

In [None]:
uids, labels, sample_n, train, other_infos

In [134]:
from collections import defaultdict
train_history_pos = defaultdict(set)
for uid in dataloader.train_user_pos.keys():
    train_history_pos[uid] = set(dataloader.train_user_pos[uid])

validation_history_pos = defaultdict(set)
for uid in dataloader.validation_user_pos.keys():
    validation_history_pos[uid] = set(dataloader.validation_user_pos[uid])

test_history_pos = defaultdict(set)
for uid in dataloader.test_user_pos.keys():
    test_history_pos[uid] = set(dataloader.test_user_pos[uid])

train_history_neg = defaultdict(set)
for uid in dataloader.train_user_neg.keys():
    train_history_neg[uid] = set(dataloader.train_user_neg[uid])

validation_history_neg = defaultdict(set)
for uid in dataloader.validation_user_neg.keys():
    validation_history_neg[uid] = set(dataloader.validation_user_neg[uid])

test_history_neg = defaultdict(set)
for uid in dataloader.test_user_neg.keys():
    test_history_neg[uid] = set(dataloader.test_user_neg[uid])

In [162]:
uids = inter_df['uid'].tolist()
labels = inter_df['y'].tolist()
sample_n = 1
train = True


iid_list = []
other_info_list = {}
for info in other_infos:
    other_info_list[info] = []

item_num = dataloader.item_num
for index, uid in enumerate(uids):
    if labels[index] > 0:
        train_history = train_history_pos
        validation_hisotry, test_history = validation_history_pos, test_history_pos

    if train:
        inter_iids = train_history[uid]
    else:
        inter_iids = train_history[uid] | validation_hisotry[uid] | test_history[uid]

    remain_iids_num = item_num - len(inter_iids)

    sampled = set()
    unknown_iid_list = []
    for i in range(sample_n):
        iid = np.random.randint(1, dataloader.item_num)
        while iid in inter_iids or iid in sampled:
            iid = np.random.randint(1, dataloader.item_num)
        unknown_iid_list.append(iid)
        sampled.add(iid)

    iid_list.append(unknown_iid_list)

all_uid_list, all_iid_list = [], []
for i in range(sample_n):
    for index, uid in enumerate(uids):
        all_uid_list.append(uid)
        all_iid_list.append(iid_list[index][i])

        for info in other_infos:
            other_info_list[info].append(other_infos[info][index])

neg_df = pd.DataFrame(data=list(zip(all_uid_list, all_iid_list)), columns=['uid', 'iid_neg'])
for info in other_infos:
    neg_df[info] = other_info_list[info]
    

In [169]:
neg_df = neg_df.drop(columns=['iid'])
neg_df['label'] = 0
neg_df = neg_df.rename(columns={'iid_neg': 'iid'})

Unnamed: 0,uid,iid,time,label
0,259,1101,874724710,0
1,259,361,874724727,0
2,259,391,874724754,0
3,259,458,874724781,0
4,259,1049,874724843,0
...,...,...,...,...
53509,729,471,893286637,0
53510,729,646,893286637,0
53511,729,712,893286638,0
53512,729,23,893286638,0


In [189]:
data

{'uid': array([259, 259, 259, ..., 729, 729, 729]),
 'iid': array([1101,  361,  391, ...,  712,   23,  164]),
 'time': array([874724710, 874724727, 874724754, ..., 893286638, 893286638,
        893286638]),
 'y': array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 'x': array([[ 259, 1199],
        [ 259, 1230],
        [ 259, 1242],
        ...,
        [ 729, 1244],
        [ 729, 1277],
        [ 729, 1633]]),
 'sample_id': array([    0,     1,     2, ..., 53511, 53512, 53513])}

In [196]:
data['y'] = np.ones(len(data['uid']))

In [177]:
data['sample_id'] = np.arange(0, len(data['y']))

In [178]:
neg_data['sample_id'] = np.arange(0, len(neg_data['y'])) + len(data['sample_id'])

In [183]:
import torch
def numpy_to_torch(d, gpu=False, requires_grad=True):
    t = torch.from_numpy(d)
    if d.dtype is np.float:
        t.requires_grad = requires_grad
    if gpu and torch.cuda.device_count() > 0:
        t = t.cuda()
    return t

In [198]:
batch_start = 0
batch_size = 128
total_data_num = len(data['sample_id'])
batch_end = min(len(data['uid']), batch_start + batch_size)
real_batch_size = batch_end - batch_start
total_batch_size = real_batch_size * (1 + 1)

feed_dict = {'train': train, 'real_batch_size': real_batch_size, 'total_batch_size': total_batch_size}

feed_dict['y'] = numpy_to_torch(data['y'][batch_start:batch_start + real_batch_size])
for c in ['uid', 'iid', 'x', 'sample_id', 'time', 'history']:
    d = data[c][batch_start: batch_start + real_batch_size]
    
    if train:
        neg_d = np.concatenate([neg_data[c][total_data_num * i + batch_start: total_data_num * i + batch_start + real_batch_size]
                               for i in range(1)])

        d = np.concatenate([d, neg_d])
    feed_dict[c] = d

for c in ['uid', 'iid', 'x']:
    feed_dict[c] = numpy_to_torch(feed_dict[c])

In [199]:
len(feed_dict['uid'])

256

In [252]:
feed_dict.keys()

dict_keys(['train', 'real_batch_size', 'total_batch_size', 'y', 'uid', 'iid', 'x', 'sample_id', 'time'])

In [212]:
df = dataloader.train_df[dataloader.train_df['history'].apply(lambda x: len(x) > 0)]

array([list([255]), list([255, 286]), list([255, 286, 298]), ...,
       list([-879, -751, -294, -338, -901, -683, -894, 354, 322, 362]),
       list([-751, -294, -338, -901, -683, -894, 354, 322, 362, 300]),
       list([-294, -338, -901, -683, -894, 354, 322, 362, 300, 333])],
      dtype=object)

In [215]:
his = df['history'].apply(lambda x: eval('[' + x + ']'))

In [None]:
torch.Tensor(vector1.size()[:-1]).uniform_(0, 1).bernoulli()

TypeError: 'int' object is not callable

In [223]:
from torch import tensor
tensor1 = tensor([[ 1.4718e-02, -3.7019e-03, -6.8673e-03, -5.5302e-03,
          4.1877e-03, -4.9621e-03],
        [-3.2222e-03, -1.2830e-02, -8.6446e-03, 7.7203e-03,
         -1.5384e-02, -5.6847e-03],
        [ 4.0225e-03,  1.1947e-02, -2.4000e-02, 6.9259e-03,
          2.3533e-02, -1.5013e-02]])

In [227]:
len(tensor1.size())

2

In [226]:
tensor1.size()

torch.Size([3, 6])

In [248]:
t1 = tensor([[1, 2, 2, 2], [4, 5, 5, 5]])
t1.shape

torch.Size([2, 4])

In [235]:
t2 = tensor([[1, 2, 3, 7], [4, 5, 6, 8]])
t2.shape

torch.Size([2, 4])

In [236]:
t1.expand_as(t2)

RuntimeError: The expanded size of the tensor (4) must match the existing size (3) at non-singleton dimension 1.  Target sizes: [2, 4].  Tensor sizes: [2, 3]

In [245]:
r12 = torch.Tensor(2).uniform_(0, 1).bernoulli().unsqueeze(-1)

In [250]:
r12 * t2 + (1 - r12) * t1

tensor([[1., 2., 3., 7.],
        [4., 5., 5., 5.]])

In [249]:
(1 - r12) * t1

tensor([[0., 0., 0., 0.],
        [4., 5., 5., 5.]])

In [251]:
torch.cat((t1, t2), dim=-1)

tensor([[1, 2, 2, 2, 1, 2, 3, 7],
        [4, 5, 5, 5, 4, 5, 6, 8]])

In [387]:
t1 = torch.Tensor([[1, 2, 3, 4]])

In [267]:
pos_neg = torch.Tensor([[1, -1, 1]])

In [273]:
(torch.Tensor([[0, 0, 0, 0]])).softmax(dim=1)

tensor([[0.2500, 0.2500, 0.2500, 0.2500]])

In [277]:
((t1 * pos_neg) * (t1 * pos_neg).softmax(dim=0)).sum(dim=0)

tensor([ 1., -2.,  3.])

In [278]:
(t1 * pos_neg) * (t1 * pos_neg).softmax(dim=0)

tensor([[ 1., -2.,  3.]])

In [393]:
t1.expand_as(t2)

tensor([[1., 2., 3., 4.],
        [1., 2., 3., 4.]])

In [497]:
t1

tensor([[[1., 2.],
         [3., 4.]],

        [[2., 3.],
         [1., 2.]]])

In [498]:
t2

tensor([[[1., 2.],
         [3., 4.]],

        [[4., 5.],
         [6., 7.]]])

In [495]:
t1 = torch.Tensor([[[1, 2], [3, 4]], [[2, 3], [1, 2]]])

In [502]:
import torch.nn.functional as F
result = F.cosine_similarity(t1, torch.from_numpy(np.zeros(t1.size())), dim=-1)

In [506]:
torch.from_numpy(np.zeros(t1.size()))

tensor([[[0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.]]], dtype=torch.float64)

In [507]:
t1

tensor([[[1., 2.],
         [3., 4.]],

        [[2., 3.],
         [1., 2.]]])

In [503]:
result.sum()

tensor(0., dtype=torch.float64)

In [494]:
t2[torch.randperm(t2.size()[0])]

tensor([[[4., 5.],
         [6., 7.]],

        [[1., 2.],
         [3., 4.]]])

In [488]:
t2.shape

torch.Size([2, 2, 2])

In [490]:
t2

tensor([[[1., 2.],
         [3., 4.]],

        [[4., 5.],
         [6., 7.]]])

In [489]:
t2.norm(dim=2)

tensor([[2.2361, 5.0000],
        [6.4031, 9.2195]])

In [491]:
t2.norm(dim=2).sum()

tensor(22.8587)

In [453]:
t1 = t1.view([-1, 1, 2])

In [461]:
t1 = t1.expand_as(t2)

In [466]:
t1.shape

torch.Size([2, 2])

In [478]:
t = (t1 / t1.sum(dim=-1).view([2, 1])).view([2, -1, 1])

In [479]:
t

tensor([[[0.3333],
         [0.6667]],

        [[0.1667],
         [0.8333]]])

In [480]:
t.shape

torch.Size([2, 2, 1])

In [484]:
t2

tensor([[[1., 2.],
         [3., 4.]],

        [[4., 5.],
         [6., 7.]]])

In [483]:
(t2 * t).sum(dim=1)

tensor([[[0.3333, 0.6667],
         [2.0000, 2.6667]],

        [[0.6667, 0.8333],
         [5.0000, 5.8333]]])

In [485]:
(t2 * t).sum(dim=1)

tensor([[2.3333, 3.3333],
        [5.6667, 6.6667]])

In [458]:
t1.shape

torch.Size([2, 1, 2])

In [455]:
t1

tensor([[[1., 2.]],

        [[1., 5.]]])

In [457]:
t2.shape

torch.Size([2, 2, 2])

In [459]:
t2

tensor([[[1., 2.],
         [3., 4.]],

        [[4., 5.],
         [6., 7.]]])

In [449]:
import torch.nn.functional as F

In [460]:
F.cosine_similarity(t1, t2, dim=-1)

tensor([[1.0000, 0.9839],
        [0.8882, 0.8721]])

In [464]:
F.cosine_similarity(t1, t2, dim=-1) + 1

tensor([[2.0000, 1.9839],
        [1.8882, 1.8721]])

In [447]:
t2.shape

torch.Size([2, 2, 2])

In [445]:
t2

tensor([[[1., 2.],
         [3., 4.]],

        [[4., 5.],
         [6., 7.]]])

In [425]:
inter = torch.nn.Linear(4, 2)

In [429]:
res = inter(t3)

In [430]:
res.view([2, -1])

tensor([[[0.3612, 0.2748],
         [0.8749, 0.9574]],

        [[1.4696, 1.6543],
         [1.9834, 2.3369]]], grad_fn=<AddBackward0>)

In [433]:
res = res.view([2, -1])

In [437]:
l2 = torch.norm(res, 1, -1)

In [444]:
l2.shape

torch.Size([2])

In [442]:
res.shape

torch.Size([2, 4])

In [None]:
res

In [443]:
l2

tensor([2.4683, 7.4442], grad_fn=<NormBackward1>)

In [441]:
res / l2.view([-1, 1])

tensor([[0.1463, 0.1113, 0.3545, 0.3879],
        [0.1974, 0.2222, 0.2664, 0.3139]], grad_fn=<DivBackward0>)

In [517]:
t1

tensor([[[1., 2.],
         [3., 4.]],

        [[2., 3.],
         [1., 2.]]])

In [513]:
t1.size()

torch.Size([2, 2, 2])

In [516]:
t2 = torch.Tensor([[[0.1], [0.2]], [[0.3], [0.4]]])
t2.size()

torch.Size([2, 2, 1])

In [518]:
t2

tensor([[[0.1000],
         [0.2000]],

        [[0.3000],
         [0.4000]]])

In [522]:
(t1 * t2).sum(dim=1)

tensor([[0.7000, 1.0000],
        [1.0000, 1.7000]])

In [524]:
(t1 * t2).sum(dim=1).size()

torch.Size([2, 2])

In [525]:
t1 * t2

tensor([[[0.1000, 0.2000],
         [0.6000, 0.8000]],

        [[0.6000, 0.9000],
         [0.4000, 0.8000]]])

In [426]:
t3 = torch.cat((t2, t1), dim=-1)

In [427]:
t3

tensor([[[1., 2., 1., 2.],
         [3., 4., 1., 2.]],

        [[4., 5., 1., 5.],
         [6., 7., 1., 5.]]])

In [394]:
t2 = torch.Tensor([[[1, 2], [3, 4]],
                  [[4, 5], [6, 7]]])

In [407]:
t2.shape

torch.Size([2, 2, 2])