In [1]:
import pandas as pd
import random

In [2]:
# Load the ratings table and drop its timestamp column in a single expression
data = pd.read_csv('ratings.csv').drop(columns='timestamp')
data.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [3]:
# Report the number of distinct users, then the number of distinct movies
for label, column in (('User数量：', 'userId'), ('Movie数量：', 'movieId')):
    print(label, len(data[column].unique()))

User数量： 7120
Movie数量： 14026


In [4]:
# Record every user's rating for each movie in a nested dict:
# {user1: {movie1: rating1, movie2: rating2, ...}, user2: {...}, ...}
user_movie_rating = dict()
# Extract the first three columns (user, movie, rating) as arrays once and walk
# them in lockstep; this avoids one scalar .iloc lookup per cell per row.
rating_columns = (data.iloc[:, col].to_numpy() for col in range(3))
for userid, movieid, rating in zip(*rating_columns):
    # setdefault keeps the first rating seen if a (user, movie) pair repeats
    user_movie_rating.setdefault(userid, {}).setdefault(movieid, rating)

In [5]:
def train_test_split(user_movie_rating, ratio=0.2, random_seed=666):
    '''
    Randomly assign every (user, movie, rating) entry to a train or a test set.

    user_movie_rating: dict, {user: {movie: rating, ...}, ...}
    ratio: float, probability that a single rating is sent to the test set
    random_seed: seed of the private random generator, so the split is repeatable
    return: (train_data, test_data), two dicts with the same nested layout as
            the input; a user appears in a split only if at least one of
            their ratings landed there
    '''
    train_data = dict()
    test_data = dict()
    # A private generator produces the same draws as random.seed(random_seed)
    # followed by random.random(), but leaves the global generator untouched.
    rng = random.Random(random_seed)
    for user, movie_ratings in user_movie_rating.items():
        for movie, rating in movie_ratings.items():
            # One draw per rating decides which split receives it
            target = test_data if rng.random() < ratio else train_data
            target.setdefault(user, {})[movie] = rating
    return train_data, test_data

In [6]:
def dic_to_csv(dic, mode='train'):
    '''
    Flatten a nested rating dict and write it to '<mode>.csv' in the working directory.

    dic: train_dic or test_dic, {user: {movie: rating, ...}, ...}
    mode: str, 'train' or 'test'; used as the stem of the output file name
    return: None
    '''
    # Column names are local so the function does not depend on a global
    # DataFrame; they never reach the file because header=False below.
    columns = ['userId', 'movieId', 'rating']
    ratings = [
        [user, movie, rating]
        for user, movie_ratings in dic.items()
        for movie, rating in movie_ratings.items()
    ]
    df = pd.DataFrame(ratings, columns=columns)
    # Note: no header or index is needed for the later conversion to libsvm format
    df.to_csv(mode + '.csv', header=False, index=False)
    return None

In [7]:
# Split the ratings once, then write each part to its own CSV file
train_data, test_data = train_test_split(user_movie_rating, ratio=0.2, random_seed=666)
for split_name, split_dict in (('train', train_data), ('test', test_data)):
    dic_to_csv(split_dict, mode=split_name)