In [11]:
import pandas as pd
import torch
import json

from tqdm import tqdm
from typing import Tuple,List, Dict

In [6]:
df = pd.read_csv('../data/train_dataset.csv')
print('共{}个用户，{}本图书，{}条记录'.format(max(df['user_id'])+1, max(df['item_id'])+1, len(df)))

共53424个用户，10000本图书，5869631条记录


In [7]:
df

Unnamed: 0,user_id,item_id
0,0,257
1,0,267
2,0,5555
3,0,3637
4,0,1795
...,...,...
5869626,49801,4655
5869627,49801,5091
5869628,49801,5294
5869629,49801,5608


In [8]:
# 定义窗口大小和填充值
window_size = 50
fill_value = 10000
min_sample = 10     # 不足窗口大小的数据至少的划分数（不包括验证和测试部分）

def full_windows(df: pd.DataFrame, window_size: int, fill_value: int, 
                 min_sample: int=10) -> Tuple[List[Dict], List[Dict], List[Dict]]:
    r"""划分窗口数据集, 同时划分出训练集、验证集和测试集"""
    # 获取唯一用户ID
    user_ids = df['user_id'].unique()
    user_ids.sort()
    windows = []
    valids = []
    tests = []

    # PS：由于填充的存在，这里其实还可以生成更多/更少的样本
    for user_id in tqdm(user_ids):
        # 获取当前用户的所有交互
        user_interactions = df[df['user_id'] == user_id]['item_id'].tolist()
        n = len(user_interactions)
    
        # 如果交互数不足最小划分数
        if n <= min_sample + 1:
            raise RuntimeError(f"min_sample should be set more largily than {n}!")
        
        # 补全不足的交互
        if n < window_size + min_sample + 1:
            user_interactions = [fill_value] * (window_size+min_sample+1-n) + user_interactions
            masks = [0] * (window_size+min_sample+1-n) + [1] * n
            n = window_size + min_sample + 1
        else:
            masks = [1] * n
            
       # 采样训练集
        for i in range(n - window_size - 1):
            window = user_interactions[i: i+window_size]
            mask = masks[i: i+window_size]
            target = user_interactions[i+window_size]
            windows.append({'user_id': int(user_id), 'window': window, 'mask': mask, 'target': int(target)})
            
        # 采样验证集
        window = user_interactions[n-window_size-1: -1]
        mask = masks[n-window_size-1: -1]
        target = user_interactions[-1]
        valids.append({'user_id': int(user_id), 'window': window, 'mask': mask, 'target': int(target)})
            
        # 采样测试集
        window = user_interactions[n-window_size:]
        mask = masks[n-window_size:]
        tests.append({'user_id': int(user_id), 'window': window, 'mask': mask})
    
    return windows, valids, tests

# 生成训练集和测试集
windows, valids, tests = full_windows(df, window_size, fill_value, min_sample)

100%|██████████| 53424/53424 [07:23<00:00, 120.38it/s]


In [9]:
# 一个中间步骤，保存预处理后的数据（不建议保存，因为信息量太大）
# with open('../data/train_raw.json', 'w', encoding='utf-8') as fp:
#     json.dump(windows, fp, indent=2, ensure_ascii=False)
    
# with open('../data/valid_raw.json', 'w', encoding='utf-8') as fp:
#     json.dump(valids, fp, indent=2, ensure_ascii=False)
    
# with open('../data/test_raw.json', 'w', encoding='utf-8') as fp:
#     json.dump(tests, fp, indent=2, ensure_ascii=False)

In [15]:
def data_process(data: List[Dict]) -> torch.LongTensor:
    r"""数据集进一步处理, 得到tensor张量"""
    X = []
    y = []
    for da in tqdm(data):
        X.append(da['window'])
        y.append(da['target'])
    return torch.tensor(X).long(), torch.tensor(y).long()

def test_process(data: List[Dict]) -> torch.LongTensor:
    r"""数据集进一步处理, 得到tensor张量"""
    X = []
    for da in tqdm(data):
        X.append(da['window'])
    return torch.tensor(X).long()

X_train, y_train = data_process(windows)
X_valid, y_valid = data_process(valids)
X_test = test_process(tests)

100%|██████████| 3172593/3172593 [00:01<00:00, 3167471.34it/s]
100%|██████████| 53424/53424 [00:00<00:00, 1112870.61it/s]
100%|██████████| 53424/53424 [00:00<00:00, 1618523.58it/s]


In [21]:
X_train.shape

torch.Size([3172593, 50])

In [22]:
# 保存张量
torch.save(X_train, '../data/X_train.pt')
torch.save(y_train, '../data/y_train.pt')
torch.save(X_valid, '../data/X_valid.pt')
torch.save(y_valid, '../data/y_valid.pt')
torch.save(X_test, '../data/X_test.pt')