## Save train, valid, test datasets

In [14]:
from recbole.quick_start import load_data_and_model
import pandas as pd
import os

def save_interaction_for_training(inter_feat, dataset, save_path):
    df = {}
    for field in inter_feat.interaction.keys():
        if field == dataset.uid_field:
            df['user_id:token'] = dataset.id2token(field, inter_feat[field].numpy().tolist())
        elif field == dataset.iid_field:
            df['item_id:token'] = dataset.id2token(field, inter_feat[field].numpy().tolist())
        elif field == config['TIME_FIELD']:
            df['timestamp:float'] = inter_feat[field].numpy()
        elif field == config['LABEL_FIELD']:
            df['label:float'] = inter_feat[field].numpy()
        else:
            df[field] = inter_feat[field].numpy()
    pd.DataFrame(df).to_csv(save_path, sep='\t', index=False)

# 加载模型与数据
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(
    model_file='../checkpoint_saved/ml-1m/BPR-Jun-08-2025_14-26-19.pth'
)

# 获取 interaction 对象
train_inter = train_data.dataset.inter_feat
valid_inter = valid_data.dataset.inter_feat
test_inter  = test_data.dataset.inter_feat

# 保存路径
save_dir = 'split_datasets/ml-1m/'
os.makedirs(save_dir, exist_ok=True)

# 保存为 .inter 文件
save_interaction_for_training(train_inter, dataset, os.path.join(save_dir, 'ml-1m.train.inter'))
save_interaction_for_training(valid_inter, dataset, os.path.join(save_dir, 'ml-1m.valid.inter'))
save_interaction_for_training(test_inter, dataset, os.path.join(save_dir, 'ml-1m.test.inter'))
print("Train, valid, test datasets saved successfully.")

08 Jun 14:44    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 42
state = INFO
reproducibility = True
data_path = datasets/ml-1m
checkpoint_dir = ../checkpoint_saved/ml-1m/
show_progress = True
save_dataset = True
dataset_save_path = None
save_dataloaders = True
dataloaders_save_path = None
log_wandb = True

Training Hyper Parameters:
epochs = 100
train_batch_size = 1024
learner = adam
learning_rate = 0.0005
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [0.8, 0.1, 0.1]}, 'order': 'RO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}
repeatable = True
metrics = ['Recall', 'Precision', 'Hit', 'NDCG', 'ItemCoverage', 'AveragePopularity', 'GiniIndex', 'ShannonEntropy', 'TailPercentage']
topk = [10]
valid_metric = NDCG@10
valid_me

Train, valid, test datasets saved successfully.


### Group item with popularity based on train dataset

In [6]:
import numpy as np
import pandas as pd
import os

# 读取数据
user_df = pd.read_csv('atomic_datasets/ml-1m/ml-1m.user', sep='\t')
inter_df = pd.read_csv('split_datasets/ml-1m/ml-1m.train.inter', sep='\t')

# 统计用户交互数
user_inter_count = inter_df['user_id'].value_counts().reset_index()
user_inter_count.columns = ['user_id', 'count']

# 检查 user_df 列名是否为 'user_id:token'
if 'user_id:token' in user_df.columns:
    user_inter_count.rename(columns={'user_id': 'user_id:token'}, inplace=True)
    merge_key = 'user_id:token'
else:
    merge_key = 'user_id'

# 中位数阈值
threshold = user_inter_count['count'].median()

# 活跃度分组
user_inter_count['activity_group:token'] = np.where(
    user_inter_count['count'] >= threshold, 'H', 'L'
)

# 合并并填充空值
user_df = user_df.merge(
    user_inter_count[[merge_key, 'activity_group:token']],
    on=merge_key,
    how='left'
)
user_df['activity_group:token'] = user_df['activity_group:token'].fillna('L')

# 保存
save_path = 'split_datasets/ml-1m/ml-1m.user'
os.makedirs(os.path.dirname(save_path), exist_ok=True)
user_df.to_csv(save_path, sep='\t', index=False)

print(f"保存成功：{save_path}")

保存成功：split_datasets/ml-1m/ml-1m.user


In [4]:
import numpy as np
import pandas as pd
import os

# 读取数据
item_df = pd.read_csv('atomic_datasets/ml-1m/ml-1m.item', sep='\t')
inter_df = pd.read_csv('split_datasets/ml-1m/ml-1m.train.inter', sep='\t')

# 检查字段名
item_col = 'item_id:token' if 'item_id:token' in item_df.columns else 'item_id'
inter_col = 'item_id' if 'item_id' in inter_df.columns else 'item_id:token'

# 统计 item 被点击次数
item_inter_count = inter_df[inter_col].value_counts().reset_index()
item_inter_count.columns = [item_col, 'count']

# 排序
item_inter_count = item_inter_count.sort_values(by='count', ascending=False).reset_index(drop=True)

# 分组边界
n = len(item_inter_count)
head_cutoff = int(n * 0.2)
tail_cutoff = int(n * 0.8)

# 分配 popularity_group
popularity_group = ['H'] * head_cutoff + ['M'] * (tail_cutoff - head_cutoff) + ['L'] * (n - tail_cutoff)
item_inter_count['popularity_group:token'] = popularity_group

# 合并进 item_df
item_df = item_df.merge(item_inter_count[[item_col, 'popularity_group:token']], on=item_col, how='left')
item_df['popularity_group:token'] = item_df['popularity_group:token'].fillna('L')  # 无交互默认为冷门

# 保存新文件
save_path = 'split_datasets/ml-1m/ml-1m.item'
os.makedirs(os.path.dirname(save_path), exist_ok=True)
item_df.to_csv(save_path, sep='\t', index=False)

print(f"保存成功：{save_path}")

保存成功：split_datasets/ml-1m/ml-1m.item


In [18]:
def fix_header(file_path, new_header):
    import pandas as pd
    df = pd.read_csv(file_path, sep='\t')
    df.columns = new_header
    df.to_csv(file_path, sep='\t', index=False)

# 举例修改 interaction 文件
fix_header(
    'datasets/ml-1m/ml-1m.train.inter',
    ['user_id:token', 'item_id:token', 'rating:float', 'timestamp:float']
)

FileNotFoundError: [Errno 2] No such file or directory: 'datasets/ml-1m/ml-1m.train.inter'