# 基于rating2inter.ipynb生成的5-core交互图，Train/Validation/Test data splitting
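- Based on the generated interactions, split each user's interactions into train/validation/test sets and store the result in the `x_label` column (0 = train, 1 = validation, 2 = test).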


In [1]:
import os, csv
import pandas as pd

In [2]:
# Directory holding the dataset files. The default is the original location;
# set the MMREC_DATA_DIR environment variable to run the notebook elsewhere
# without editing this cell.
DATA_DIR = os.environ.get('MMREC_DATA_DIR', '/home/bjf/bjf_projects/MMRec/data/pet')
os.chdir(DATA_DIR)
os.getcwd()

'/home/bjf/bjf_projects/MMRec/data/pet'

## 直接加载现成的, Load interactions

In [3]:
# Load the tab-separated interaction table from the working directory.
rslt_file = 'pet.inter'
df = pd.read_table(rslt_file)  # read_table uses '\t' as its default separator
print(f'shape: {df.shape}')
df.head(4)

shape: (157836, 5)


Unnamed: 0,userID,itemID,rating,timestamp,x_label
0,0,0,3.0,1294790400,0
1,1,0,5.0,1379116800,0
2,2,0,4.0,1355875200,0
3,3,0,4.0,1305158400,0


In [4]:
import random
import numpy as np

In [5]:

# Fixed seed so the shuffle -- and therefore the train/valid/test assignment
# derived from row order further down -- is reproducible on a fresh kernel.
SPLIT_SEED = 42

# Shuffle all interactions, then bring each user's rows together.
# A stable sort (mergesort) keeps the shuffled order inside every user, so each
# user's interactions end up in random order while users appear in ascending
# userID order. The index keeps the shuffled positions, as before.
df = (
    df.sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
    .sort_values(by=['userID'], kind='mergesort')
)
df[:20]

Unnamed: 0,userID,itemID,rating,timestamp,x_label
132239,0,7,2.0,1274054400,0
4062,0,3724,1.0,1332115200,0
60556,0,3897,3.0,1383264000,0
14748,0,0,3.0,1294790400,0
94721,0,5770,4.0,1397692800,1
9468,0,1461,2.0,1373760000,0
142521,0,6628,5.0,1373155200,0
116275,1,3220,5.0,1368144000,0
148864,1,6263,4.0,1367280000,0
115729,1,766,5.0,1366329600,0


In [6]:
# Column names of the user and item identifiers.
uid_field, iid_field = 'userID', 'itemID'

# Item column grouped by user; iterating it yields (userID, Series of itemIDs).
uid_freq = df.groupby(uid_field)[iid_field]
# Map every userID to the list of itemIDs found in that user's rows.
u_i_dict = {user: list(user_items) for user, user_items in uid_freq}
u_i_dict

{0: [7, 3724, 3897, 0, 5770, 1461, 6628],
 1: [3220, 6263, 766, 3459, 7987, 1199, 1173, 2522, 0, 5571, 7150, 6651],
 2: [5955, 1366, 1722, 4370, 2186, 0, 227],
 3: [1832, 8174, 1199, 5914, 0, 6719, 3848, 704, 1366, 3952, 6728],
 4: [0,
  5061,
  7712,
  7932,
  3862,
  8110,
  4622,
  7333,
  4665,
  7403,
  5545,
  7344,
  8085,
  4993,
  6313,
  6978,
  4656],
 5: [1845, 2370, 4763, 2669, 1, 1433, 2436],
 6: [1, 5026, 2792, 2977, 1710],
 7: [4815,
  7112,
  4341,
  8150,
  1,
  2193,
  713,
  7389,
  3415,
  3217,
  2979,
  3071,
  4548,
  660],
 8: [433, 4972, 4279, 2925, 1],
 9: [938,
  2812,
  3801,
  5089,
  4445,
  8017,
  65,
  1,
  630,
  422,
  4530,
  2912,
  3772,
  4480,
  6868,
  3969,
  1049,
  4883],
 10: [2636, 2011, 5129, 2483, 3112, 2, 8412],
 11: [1722, 2228, 2, 617, 1134, 1857],
 12: [1427, 2961, 2, 2001, 1385],
 13: [7063, 7062, 2, 4559, 2374, 923],
 14: [885,
  7958,
  2219,
  3641,
  1681,
  6734,
  6997,
  3655,
  2288,
  1554,
  7466,
  4825,
  8161,
  7145,
 

In [7]:
# Split codes written to the x_label column.
TRAIN, VALID, TEST = 0, 1, 2
# Users with fewer interactions than this get exactly one validation and one test item.
SMALL_USER_THRESHOLD = 10
# Fraction of a larger user's interactions held out for validation + test together.
HOLDOUT_RATIO = 0.2


def _split_labels(n_items):
    """Return the split labels (0=train, 1=valid, 2=test) for one user.

    The returned list always has exactly ``n_items`` entries, ordered
    train -> valid -> test.
    """
    if n_items < 2:
        # Nothing can be held out. The former expression
        # `[0] * (n_items - 2) + [1] + [2]` produced two labels in this case,
        # which no longer matches the number of rows; keep such rows in training.
        return [TRAIN] * n_items
    if n_items < SMALL_USER_THRESHOLD:
        # Small users: everything is training except one validation and one test item.
        return [TRAIN] * (n_items - 2) + [VALID] + [TEST]
    # Larger users: roughly 8:1:1. When the held-out count is odd, the extra
    # item goes to the test set.
    val_test_len = int(n_items * HOLDOUT_RATIO)
    val_len = val_test_len // 2
    test_len = val_test_len - val_len
    train_len = n_items - val_test_len
    return [TRAIN] * train_len + [VALID] * val_len + [TEST] * test_len


# Labels are concatenated user by user in ascending userID order. They are later
# assigned to df by position, so df must be sorted by userID at that point.
new_label = []
u_ids_sorted = sorted(u_i_dict.keys())

for u in u_ids_sorted:
    new_label.extend(_split_labels(len(u_i_dict[u])))

new_label[:100]

[0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [8]:
# Replace the split column with the freshly built labels (matched to rows by position).
df = df.assign(x_label=new_label)
df.head(20)

Unnamed: 0,userID,itemID,rating,timestamp,x_label
132239,0,7,2.0,1274054400,0
4062,0,3724,1.0,1332115200,0
60556,0,3897,3.0,1383264000,0
14748,0,0,3.0,1294790400,0
94721,0,5770,4.0,1397692800,0
9468,0,1461,2.0,1373760000,1
142521,0,6628,5.0,1373155200,2
116275,1,3220,5.0,1368144000,0
148864,1,6263,4.0,1367280000,0
115729,1,766,5.0,1366329600,0


In [9]:
# Preview the file name with its trailing '.inter' (6 characters) removed.
rslt_file[:-len('.inter')]

'pet'

In [10]:
# Write the labelled interactions to '<dataset>_supplies.inter'
# (tab-separated, without the DataFrame index).
dataset_name = os.path.splitext(rslt_file)[0]  # 'pet.inter' -> 'pet'
new_labeled_file = dataset_name + '_supplies.inter'
df.to_csv(os.path.join('./', new_labeled_file), sep='\t', index=False)
print('done!!!')

done!!!


## Reload

In [11]:
# Read the file that was just written back in and show its first rows.
indexed_df = pd.read_csv(new_labeled_file, delimiter='\t')
print(f'shape: {indexed_df.shape}')
indexed_df.head(20)

shape: (157836, 5)


Unnamed: 0,userID,itemID,rating,timestamp,x_label
0,0,7,2.0,1274054400,0
1,0,3724,1.0,1332115200,0
2,0,3897,3.0,1383264000,0
3,0,0,3.0,1294790400,0
4,0,5770,4.0,1397692800,0
5,0,1461,2.0,1373760000,1
6,0,6628,5.0,1373155200,2
7,1,3220,5.0,1368144000,0
8,1,6263,4.0,1367280000,0
9,1,766,5.0,1366329600,0


In [12]:
# Summarise the ID ranges of the reloaded table.
# NOTE: the printed labels say "learners"/"courses"; the values come from the
# userID and itemID columns respectively.
u_id_str, i_id_str = 'userID', 'itemID'
u_uni, c_uni = (indexed_df[col].unique() for col in (u_id_str, i_id_str))

print(f'# of unique learners: {len(u_uni)}')
print(f'# of unique courses: {len(c_uni)}')

print('min/max of unique learners: {0}/{1}'.format(u_uni.min(), u_uni.max()))
print('min/max of unique courses: {0}/{1}'.format(c_uni.min(), c_uni.max()))


# of unique learners: 19856
# of unique courses: 8510
min/max of unique learners: 0/19855
min/max of unique courses: 0/8509
