# 从ratings_Musical_Instruments.csv文件中提取U-I交互图, 5-core后重新编号
- Extracting U-I interactions and performing 5-core, re-indexing
- dataset located at: http://jmcauley.ucsd.edu/data/amazon/links.html, rating only file in "Small" subsets for experimentation

In [1]:
import os, csv
import pandas as pd

In [2]:
os.chdir('/home/bjf/bjf_projects/MMRec/data/pet')# 切换到音乐数据目录
os.getcwd()# 获取当前工作目录，这里可以用来验证是否成功切换目录

'/home/bjf/bjf_projects/MMRec/data/pet'

## 先5-core过滤
## 5-core filtering

In [3]:
df = pd.read_csv('ratings_Pet_Supplies.csv', names=['userID', 'itemID', 'rating', 'timestamp'], header=None)# 读取CSV文件，包含用户ID、项目ID、评分和时间戳四列数据
print(f'shape: {df.shape}')# 打印数据框的形状，以了解数据的行数和列数
df[:5]

shape: (1235316, 4)


Unnamed: 0,userID,itemID,rating,timestamp
0,A3PG0KS1YE8MR4,615553605,5.0,1354838400
1,A363P047LR5XI6,615553605,4.0,1373932800
2,ABZ8CQXD42H4,615553605,1.0,1386028800
3,A3J8QW1MV1OP01,615583474,5.0,1347321600
4,A3ISA8Z0NB0ILH,615583474,5.0,1358899200


In [4]:
k_core = 5
learner_id, course_id, tmstmp_str = 'userID', 'itemID', 'timestamp'# 定义变量learner_id, course_id, tmstmp_str分别代表用户ID，课程ID和时间戳字符串

df.dropna(subset=[learner_id, course_id, tmstmp_str], inplace=True)# 移除数据框df中learner_id, course_id, tmstmp_str任一为空的行
df.drop_duplicates(subset=[learner_id, course_id, tmstmp_str], inplace=True)# 移除数据框df中的重复行，基于learner_id, course_id, tmstmp_str这三个字段进行判断
print(f'After dropped: {df.shape}')# 打印处理后的数据框df的形状，以查看数据清理后的效果
df[:3]

After dropped: (1235316, 4)


Unnamed: 0,userID,itemID,rating,timestamp
0,A3PG0KS1YE8MR4,615553605,5.0,1354838400
1,A363P047LR5XI6,615553605,4.0,1373932800
2,ABZ8CQXD42H4,615553605,1.0,1386028800


In [5]:
from collections import Counter
import numpy as np

min_u_num, min_i_num = 5, 5

def get_illegal_ids_by_inter_num(df, field, max_num=None, min_num=None):#根据交互次数获取非法ID集合。
    if field is None:# 如果指定字段为空，则直接返回空集合
        return set()
    if max_num is None and min_num is None:# 如果min_num和max_num都未指定，则返回空集合
        return set()

    max_num = max_num or np.inf# 未指定max_num时，默认设置为正无穷
    min_num = min_num or -1# 未指定min_num时，默认设置为-1

    ids = df[field].values# 提取指定字段的值
    inter_num = Counter(ids)# 统计每个ID的出现次数
    ids = {id_ for id_ in inter_num if inter_num[id_] < min_num or inter_num[id_] > max_num}# 根据交互次数筛选出非法ID集合
    print(f'{len(ids)} illegal_ids_by_inter_num, field={field}')

    return ids


def filter_by_k_core(df):#过滤数据框(df)中的交互记录，确保每个用户和项目都至少有指定的最小交互次数(min_u_num和min_i_num)
    while True:
        ban_users = get_illegal_ids_by_inter_num(df, field=learner_id, max_num=None, min_num=min_u_num)
        ban_items = get_illegal_ids_by_inter_num(df, field=course_id, max_num=None, min_num=min_i_num)
        if len(ban_users) == 0 and len(ban_items) == 0:
            return

        dropped_inter = pd.Series(False, index=df.index)
        if learner_id:
            dropped_inter |= df[learner_id].isin(ban_users)
        if course_id:
            dropped_inter |= df[course_id].isin(ban_items)
        print(f'{len(dropped_inter)} dropped interactions')
        df.drop(df.index[dropped_inter], inplace=True)



## k-core

In [6]:
filter_by_k_core(df)
print(f'k-core shape: {df.shape}')
print(f'shape after k-core: {df.shape}')
df[:2]

707298 illegal_ids_by_inter_num, field=userID
72026 illegal_ids_by_inter_num, field=itemID
1235316 dropped interactions
5413 illegal_ids_by_inter_num, field=userID
16382 illegal_ids_by_inter_num, field=itemID
249174 dropped interactions
6309 illegal_ids_by_inter_num, field=userID
835 illegal_ids_by_inter_num, field=itemID
197718 dropped interactions
581 illegal_ids_by_inter_num, field=userID
1274 illegal_ids_by_inter_num, field=itemID
172485 dropped interactions
1030 illegal_ids_by_inter_num, field=userID
147 illegal_ids_by_inter_num, field=itemID
165612 dropped interactions
120 illegal_ids_by_inter_num, field=userID
240 illegal_ids_by_inter_num, field=itemID
161017 dropped interactions
227 illegal_ids_by_inter_num, field=userID
31 illegal_ids_by_inter_num, field=itemID
159605 dropped interactions
33 illegal_ids_by_inter_num, field=userID
48 illegal_ids_by_inter_num, field=itemID
158579 dropped interactions
45 illegal_ids_by_inter_num, field=userID
8 illegal_ids_by_inter_num, field=ite

Unnamed: 0,userID,itemID,rating,timestamp
121,A14CK12J7C7JRK,1223000893,3.0,1294790400
134,A39QHP5WLON5HV,1223000893,5.0,1379116800


## Re-index

In [7]:
df.reset_index(drop=True, inplace=True)# 重置DataFrame的索引，将原有索引丢弃，新索引从0开始计数

In [8]:
'生成i_id_mapping.csv以及u_id_mapping.csv'
# 定义用户和项目ID映射文件路径
i_mapping_file = 'i_id_mapping.csv'
u_mapping_file = 'u_id_mapping.csv'
# 数据集划分比例，分别为训练集、验证集和测试集的比例
splitting = [0.8, 0.1, 0.1]
uid_field, iid_field = learner_id, course_id# 定义用户ID和项目ID的字段名称
# 获取数据集中唯一用户ID和项目ID
uni_users = pd.unique(df[uid_field])
uni_items = pd.unique(df[iid_field])

# start from 0# 为用户ID和项目ID创建从0开始的整数映射
u_id_map = {k: i for i, k in enumerate(uni_users)}
i_id_map = {k: i for i, k in enumerate(uni_items)}
# 使用映射将原始ID转换为从0开始的整数
df[uid_field] = df[uid_field].map(u_id_map)
df[iid_field] = df[iid_field].map(i_id_map)
# 将ID字段的类型转换为整数
df[uid_field] = df[uid_field].astype(int)
df[iid_field] = df[iid_field].astype(int)

# dump# 将ID映射保存到CSV文件
rslt_dir = './'
# 将映射表转换为DataFrame并保存
u_df = pd.DataFrame(list(u_id_map.items()), columns=['user_id', 'userID'])
i_df = pd.DataFrame(list(i_id_map.items()), columns=['asin', 'itemID'])
# 使用os.path.join确保文件路径格式兼容不同操作系统
u_df.to_csv(os.path.join(rslt_dir, u_mapping_file), sep='\t', index=False)
i_df.to_csv(os.path.join(rslt_dir, i_mapping_file), sep='\t', index=False)
print(f'mapping dumped...')# 打印映射保存完成的消息

mapping dumped...


In [9]:

# =========2. splitting
print(f'splitting ...')
tot_ratio = sum(splitting)# 计算比例总和
# remove 0.0 in ratios
ratios = [i for i in splitting if i > .0]# 去除比例列表中的0.0值
ratios = [_ / tot_ratio for _ in ratios]# 计算累积比例，除去最后一个值
split_ratios = np.cumsum(ratios)[:-1]

#df[tmstmp_str] = df[tmstmp_str].map(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ"))
split_ratios# 输出分割比例

splitting ...


array([0.8, 0.9])

In [10]:
'生成music.inter文件'
ts_id = 'timestamp'# 定义时间戳字段
# 计算分割时间戳
split_timestamps = list(np.quantile(df[ts_id], split_ratios))
# get df training dataset unique users/items
df_train = df.loc[df[ts_id] < split_timestamps[0]].copy()# 根据时间戳筛选训练数据集
df_val = df.loc[(split_timestamps[0] <= df[ts_id]) & (df[ts_id] < split_timestamps[1])].copy()# 根据时间戳筛选验证数据集
df_test = df.loc[(split_timestamps[1] <= df[ts_id])].copy()# 根据时间戳筛选测试数据集

x_label, rslt_file = 'x_label', 'pet.inter'# 定义x轴标签和结果文件名
# 为训练、验证和测试数据集设置x轴标签
df_train[x_label] = 0
df_val[x_label] = 1
df_test[x_label] = 2
temp_df = pd.concat([df_train, df_val, df_test])# 合并三个数据集
temp_df = temp_df[[learner_id, course_id, 'rating', ts_id, x_label]]# 选择所需的列
print(f'columns: {temp_df.columns}')# 打印列信息

temp_df.columns = [learner_id, course_id, 'rating', ts_id, x_label]# 重命名列

temp_df.to_csv(os.path.join(rslt_dir, rslt_file), sep='\t', index=False)# 保存到CSV文件
temp_df[:5]
#print('done!')

columns: Index(['userID', 'itemID', 'rating', 'timestamp', 'x_label'], dtype='object')


Unnamed: 0,userID,itemID,rating,timestamp,x_label
0,0,0,3.0,1294790400,0
1,1,0,5.0,1379116800,0
2,2,0,4.0,1355875200,0
3,3,0,4.0,1305158400,0
4,4,0,3.0,1330905600,0


## Reload

In [11]:
indexed_df = pd.read_csv(rslt_file, sep='\t')# 从CSV文件中读取数据，使用制表符作为分隔符
print(f'shape: {indexed_df.shape}')# 打印数据框的形状，即行数和列数
indexed_df[:4]

shape: (157836, 5)


Unnamed: 0,userID,itemID,rating,timestamp,x_label
0,0,0,3.0,1294790400,0
1,1,0,5.0,1379116800,0
2,2,0,4.0,1355875200,0
3,3,0,4.0,1305158400,0


In [12]:
u_uni = indexed_df[learner_id].unique()
c_uni = indexed_df[course_id].unique()

print(f'# of unique learners: {len(u_uni)}')
print(f'# of unique courses: {len(c_uni)}')

print('min/max of unique learners: {0}/{1}'.format(min(u_uni), max(u_uni)))
print('min/max of unique courses: {0}/{1}'.format(min(c_uni), max(c_uni)))


# of unique learners: 19856
# of unique courses: 8510
min/max of unique learners: 0/19855
min/max of unique courses: 0/8509
