/
datasets.py
125 lines (105 loc) · 4.42 KB
/
datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import torch
import copy
import torch.utils.data as data
from random import sample
def train_collate_fn(batch_dataset: list):
    """Collate ``(uid, pv, buy)`` examples into three column tensors.

    Each example is indexed at positions 0, 1 and 2, and each of those
    fields is iterated; every element is wrapped in its own one-element row.
    ``MultiDataset.__getitem__`` yields one-element lists per field, so a
    batch of ``N`` such examples produces tensors of shape ``(N, 1)``
    (assuming the ids are plain ints).

    Args:
        batch_dataset: Examples to merge into a single batch.

    Returns:
        A 3-tuple ``(uid, pv, buy)`` of ``torch.long`` tensors.  An empty
        batch yields three ``(0, 1)`` tensors so the column dimension is
        present regardless of batch size.

    Raises:
        IndexError: If an example has fewer than three fields.
    """
    columns = ([], [], [])
    for example in batch_dataset:
        for position, column in enumerate(columns):
            column.extend([element] for element in example[position])

    return tuple(
        torch.tensor(column, dtype=torch.long)
        if column
        # torch.tensor([]) would be 1-D; keep the trailing column axis.
        else torch.empty((0, 1), dtype=torch.long)
        for column in columns
    )
class MultiDataset(data.Dataset):
    """Dataset of ``[uid, pv_iid, buy_iid]`` training triples.

    Three feature lists are built once at construction time and
    ``train_mode`` selects which one ``__len__`` / ``__getitem__`` expose:

    * ``"both"`` -> ``features``
    * ``"buy"``  -> ``features_buy``
    * anything else (including the default ``"pv"``) -> ``features_pv``

    "pv" and "buy" are the two behaviour types of the training data
    (presumably page-view and purchase -- confirm against the data loader).

    Args:
        user_num: Number of users; stored as-is.
        item_num: Number of items; stored as-is.
        train_data_dict_pv: Mapping ``uid -> list of pv item ids``.  ``None``
            is treated as empty.
        train_data_dict_buy: Mapping ``uid -> list of buy item ids``.
            ``None`` is treated as empty.
        train_mat_pv, train_mat_buy, test_data_dict_pv, test_data_dict_buy,
        test_mat_pv, test_mat_buy, user_pos_dict_pv, user_pos_dict_buy:
            Stored on the instance unchanged; not read by this class.
        test_pv_or_buy: Stored on the instance unchanged.
        train_mode: Which feature list indexing and ``len`` operate on.
        is_pretrain: Selects the feature-building strategy, see
            :meth:`build_features`.
    """

    def __init__(self,
                 user_num,
                 item_num,
                 train_data_dict_pv=None,
                 train_data_dict_buy=None,
                 train_mat_pv=None,
                 train_mat_buy=None,
                 test_data_dict_pv=None,
                 test_data_dict_buy=None,
                 test_mat_pv=None,
                 test_mat_buy=None,
                 user_pos_dict_pv=None,
                 user_pos_dict_buy=None,
                 test_pv_or_buy="buy",
                 train_mode="pv",
                 is_pretrain=False):
        super().__init__()
        self.test_pv_or_buy = test_pv_or_buy
        self.train_mode = train_mode
        self.is_pretrain = is_pretrain
        self.user_num = user_num
        self.item_num = item_num

        self.train_data_dict_pv = train_data_dict_pv
        self.train_mat_pv = train_mat_pv
        self.train_data_dict_buy = train_data_dict_buy
        self.train_mat_buy = train_mat_buy

        self.test_data_dict_pv = test_data_dict_pv
        self.test_mat_pv = test_mat_pv
        self.test_data_dict_buy = test_data_dict_buy
        self.test_mat_buy = test_mat_buy

        # Only populated by the non-pretrain branch of build_features().
        self.rest_pv_dict = None
        self.user_pos_dict_pv = user_pos_dict_pv
        self.user_pos_dict_buy = user_pos_dict_buy

        self.features = None
        self.features_pv = None
        self.features_buy = None
        self.build_features()

    def build_features(self):
        """(Re)build ``features``, ``features_pv`` and ``features_buy``.

        Every feature has the format ``[uid, pv_iid, buy_iid]``.

        Pretrain mode (``is_pretrain`` is true):
            * ``features_buy``: every buy item, paired with one randomly
              sampled pv item of the same user.
            * ``features_pv``: every pv item, paired with one randomly
              sampled buy item of the same user.
            * Users that have no items for the other behaviour are skipped,
              because no triple can be formed for them.

        Otherwise:
            * ``features``: ``[uid, buy_iid, buy_iid]`` for every buy item.
            * ``rest_pv_dict``: a copy of the pv dict with one occurrence of
              each buy item removed per user.
            * ``features_pv``: ``[uid, pv_iid, pv_iid]`` for every item left
              in ``rest_pv_dict``.
            * ``features_buy`` stays empty in this mode.

        The dictionaries passed to the constructor are never modified.
        """
        self.features = []      # examples that are positive for both behaviours
        self.features_pv = []   # pv positives (pretrain: plus a sampled buy positive)
        self.features_buy = []  # buy positives (pretrain: plus a sampled pv positive)

        if self.is_pretrain:
            self._build_pretrain_features()
        else:
            self._build_joint_features()

    def _build_pretrain_features(self):
        """Pair each positive of one behaviour with a sampled positive of the other."""
        pv_by_user = self.train_data_dict_pv
        if pv_by_user is None:
            pv_by_user = {}
        buy_by_user = self.train_data_dict_buy
        if buy_by_user is None:
            buy_by_user = {}

        for uid, buy_list in buy_by_user.items():
            pv_list = pv_by_user.get(uid)
            if not pv_list:
                # Nothing to sample from for this user.
                continue
            for buy_iid in buy_list:
                pv_iid = sample(pv_list, 1)[0]
                self.features_buy.append([uid, pv_iid, buy_iid])

        for uid, pv_list in pv_by_user.items():
            buy_list = buy_by_user.get(uid)
            if not buy_list:
                # Nothing to sample from for this user.
                continue
            for pv_iid in pv_list:
                buy_iid = sample(buy_list, 1)[0]
                self.features_pv.append([uid, pv_iid, buy_iid])

    def _build_joint_features(self):
        """Emit buy items into ``features`` and the remaining pv items into ``features_pv``."""
        # Deep copy: the per-user lists are mutated below and the caller's
        # data must stay intact.
        rest_pv = copy.deepcopy(self.train_data_dict_pv)
        if rest_pv is None:
            rest_pv = {}
        buy_by_user = self.train_data_dict_buy
        if buy_by_user is None:
            buy_by_user = {}

        for uid, buy_list in buy_by_user.items():
            pv_list = rest_pv.get(uid)
            for buy_iid in buy_list:
                self.features.append([uid, buy_iid, buy_iid])
                # Drop one occurrence so the item is not emitted again as a
                # pv-only example.
                if pv_list is not None and buy_iid in pv_list:
                    pv_list.remove(buy_iid)

        self.rest_pv_dict = rest_pv
        for uid, pv_list in rest_pv.items():
            for pv_iid in pv_list:
                self.features_pv.append([uid, pv_iid, pv_iid])

    def _active_features(self):
        """Return the feature list selected by ``train_mode``.

        Shared by ``__len__`` and ``__getitem__`` so the reported length and
        the indexed list always agree.  Unrecognised modes fall back to
        ``features_pv``, matching the constructor default ``"pv"``.
        """
        if self.train_mode == "both":
            return self.features
        if self.train_mode == "buy":
            return self.features_buy
        return self.features_pv

    def __len__(self):
        """Return the number of examples in the active feature list."""
        return len(self._active_features())

    def __getitem__(self, idx):
        """Return example ``idx`` as ``([uid], [pv_iid], [buy_iid])``."""
        uid, pv, buy = self._active_features()[idx]
        return [uid], [pv], [buy]