In [3]:
import pandas as pd

# 1. 读取 u.data 文件
data = pd.read_csv('ml-1m/ratings.dat', sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

# 2. 创建用户和物品的映射
user_mapping = {org_id: remap_id for remap_id, org_id in enumerate(data['user_id'].unique())}
item_mapping = {org_id: remap_id for remap_id, org_id in enumerate(data['item_id'].unique())}

# 3. 映射用户和物品 ID
data['user_remap_id'] = data['user_id'].map(user_mapping)
data['item_remap_id'] = data['item_id'].map(item_mapping)

# 4. 创建空的训练集和测试集
train_dict = {user: [] for user in user_mapping.values()}
test_dict = {user: [] for user in user_mapping.values()}

# 5. 按照用户进行分组，并划分训练集和测试集
for user, group in data.groupby('user_remap_id'):
    group = group.sort_values('timestamp')  # 按时间排序
    if len(group) > 1:
        # 最后一个交互作为测试集
        train_dict[user] = group.iloc[:-1]['item_remap_id'].tolist()
        test_dict[user] = group.iloc[-1:]['item_remap_id'].tolist()
    else:
        # 如果只有一个交互，全部作为训练集
        train_dict[user] = group['item_remap_id'].tolist()

# 6. 保存 train.txt 文件
with open('train.txt', 'w') as train_file:
    for user, items in train_dict.items():
        line = f"{user} " + " ".join(map(str, items))
        train_file.write(line + "\n")

# 7. 保存 test.txt 文件
with open('test.txt', 'w') as test_file:
    for user, items in test_dict.items():
        if items:  # 只有在存在测试集数据时写入
            line = f"{user} " + " ".join(map(str, items))
            test_file.write(line + "\n")

# 8. 保存用户和物品映射文件
user_list = pd.DataFrame(list(user_mapping.items()), columns=['org_id', 'remap_id'])
item_list = pd.DataFrame(list(item_mapping.items()), columns=['org_id', 'remap_id'])

user_list.to_csv('user_list.txt', sep='\t', index=False, header=False)
item_list.to_csv('item_list.txt', sep='\t', index=False, header=False)


In [1]:
def convert_data(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            numbers = line.strip().split()
            user_id = numbers[0]
            item_ids = numbers[1:]
            
            for item_id in item_ids:
                outfile.write(f"{user_id},{item_id}\n")


input_file = "ml-1m/train.txt"
output_file = "ml-1m/ml-1m.train"
convert_data(input_file, output_file)
print(f"转换完成。结果已保存到 {output_file}")

转换完成。结果已保存到 ml-1m/ml-100k.train


In [2]:
import pandas as pd

# 读取 test.txt 文件
input_file = 'ml-1m/test.txt'
output_file = 'ml-1m/ml-1m.test'

data = []
with open(input_file, 'r') as f:
    for line in f:
        items = line.strip().split()
        user = items[0]  # 获取用户编号
        for item in items[1:]:  # 获取物品编号
            data.append([user, item])  # 将用户与物品组合

# 将数据转换为 DataFrame
df = pd.DataFrame(data, columns=['user', 'item'])

# 将 DataFrame 导出为逗号分隔的文件
df.to_csv(output_file, sep=',', header=False, index=False)

print(f"文件已保存为 {output_file}")


文件已保存为 ml-1m/ml-1m.test


In [14]:
def extract_predictions(result_file, test_file, output_file):
    # Read the result.txt file into a dictionary of user ratings (as a list of floats)
    result_dict = {}
    with open(result_file, 'r') as rf:
        current_user_id = None
        current_ratings = []

        for line in rf:
            line = line.strip()  # Remove leading/trailing whitespace

            # Check if the line contains user ID and ratings start
            if '[' in line:
                if current_user_id is not None:  # Save the previous user if we were reading one
                    result_dict[current_user_id] = current_ratings

                # Split the line on the first '['
                user_id_part, ratings_part = line.split('[', 1)
                current_user_id = int(user_id_part.strip())
                current_ratings = [float(x) for x in ratings_part.strip().rstrip(']').split()]
            elif current_user_id is not None:  # Continue collecting ratings for the current user
                current_ratings.extend(float(x) for x in line.strip().rstrip(']').split())

            # Check if we reach the end of the ratings for the current user
            if ']' in line:
                result_dict[current_user_id] = current_ratings
                current_user_id = None
                current_ratings = []

        # Handle the last user if the file doesn't end with a closing bracket
        if current_user_id is not None:
            result_dict[current_user_id] = current_ratings

    # Debug: print the populated result_dict
    print("Populated result_dict:", result_dict)

    # Open the test.txt file and prediction.txt file
    with open(test_file, 'r') as tf, open(output_file, 'w') as pf:
        for line in tf:
            user_id, item_id = map(int, line.strip().split())
            print(f"Checking user_id: {user_id}, item_id: {item_id}")  # Debugging output
            
            # Check if the user exists in result_dict and extract the item rating
            if user_id in result_dict:
                if item_id < len(result_dict[user_id]):
                    item_rating = result_dict[user_id][item_id]
                    pf.write(f'{user_id} {item_id} {item_rating}\n')
                else:
                    pf.write(f'{user_id} {item_id} -\n')  # Item ID out of range
            else:
                pf.write(f'{user_id} {item_id} -\n')  # User is missing

# Define file paths
result_file = 'result.txt'
test_file = 'ml-100k/test.txt'
output_file = 'prediction.txt'

# Call the function
extract_predictions(result_file, test_file, output_file)


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [1]:
import pandas as pd
import numpy as np

# 1. 读取 u.data 文件
data = pd.read_csv('ml-1m/ratings.dat', sep='::', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

# 2. 创建用户和物品的映射
user_mapping = {org_id: remap_id for remap_id, org_id in enumerate(data['user_id'].unique())}
item_mapping = {org_id: remap_id for remap_id, org_id in enumerate(data['item_id'].unique())}

# 3. 映射用户和物品 ID
data['user_remap_id'] = data['user_id'].map(user_mapping)
data['item_remap_id'] = data['item_id'].map(item_mapping)

# 4. 创建空的训练集和测试集
train_dict = {user: [] for user in user_mapping.values()}
test_dict = {user: [] for user in user_mapping.values()}

# 5. 按照用户进行分组，并划分训练集和测试集（80% 训练集，20% 测试集）
for user, group in data.groupby('user_remap_id'):
    group = group.sort_values('timestamp')  # 按时间排序
    n_interactions = len(group)
    split_point = int(n_interactions * 0.8)  # 80% 的数据作为训练集
    
    if n_interactions > 1:
        train_items = group.iloc[:split_point]['item_remap_id'].tolist()
        test_items = group.iloc[split_point:]['item_remap_id'].tolist()
        train_dict[user] = train_items
        test_dict[user] = test_items
    else:
        # 如果只有一个交互，全部作为训练集
        train_dict[user] = group['item_remap_id'].tolist()

# 6. 保存 train.txt 文件
with open('train.txt', 'w') as train_file:
    for user, items in train_dict.items():
        line = f"{user} " + " ".join(map(str, items))
        train_file.write(line + "\n")

# 7. 保存 test.txt 文件
with open('test.txt', 'w') as test_file:
    for user, items in test_dict.items():
        if items:  # 只有在存在测试集数据时写入
            line = f"{user} " + " ".join(map(str, items))
            test_file.write(line + "\n")

# 8. 保存用户和物品映射文件
user_list = pd.DataFrame(list(user_mapping.items()), columns=['org_id', 'remap_id'])
item_list = pd.DataFrame(list(item_mapping.items()), columns=['org_id', 'remap_id'])

user_list.to_csv('user_list.txt', sep='\t', index=False, header=False)
item_list.to_csv('item_list.txt', sep='\t', index=False, header=False)


  data = pd.read_csv('ml-1m/ratings.dat', sep='::', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])
