In [1]:
# 加载数据
import pandas as pd
import numpy as np
import config
import pickle as pkl

args = config.args_initialization()

movie_names = ['movie_id', 'movie_title', 'movie_type']
movie = pd.read_table(args.path + 'movies.dat', sep='::', header=None,
                      names=movie_names, engine='python', encoding="ISO-8859-1")
user_names = ['user_id', 'user_gender', 'user_age', 'user_job', 'zip']
user = pd.read_table(args.path + 'users.dat', sep='::', header=None,
                     names=user_names, engine='python')
rating_names = ['user_id', 'movie_id', 'rank', 'timestamp']
rating = pd.read_table(args.path + 'ratings.dat', sep='::',
                       header=None, names=rating_names, engine='python')

In [2]:
# 剔除无关列
user = user.drop(['zip'], axis=1)
rating = rating.drop(['timestamp'], axis=1)

In [3]:
users_raw_dict = {}
movies_raw_dict = {}
for i in range(len(user)):
    user_raw_dict = {'User_ID': user['user_id'][i], 'Gender': user['user_gender'][i], 'Age': user['user_age'][i],
                     'Job': user['user_job'][i]}
    users_raw_dict[user['user_id'][i]] = user_raw_dict

for j in range(len(movie)):
    movie_raw_dict = {'Movie_ID': movie['movie_id'][j], 'Movie_Title': movie['movie_title'][j],
                      'Movie_Type': movie['movie_type'][j]}
    movies_raw_dict[movie['movie_id'][j]] = movie_raw_dict

pkl.dump(users_raw_dict, open(args.path + 'users_raw.pkl', 'wb'))
pkl.dump(movies_raw_dict, open(args.path + 'movies_raw.pkl', 'wb'))

In [4]:
# User 相关数据处理
user['user_gender'] = user['user_gender'].apply(lambda x: [1, 0] if x == 'F' else [0, 1])


def convert_age_to_One_Hot(age):
    if age == 1:
        return [1, 0, 0, 0, 0, 0, 0]
    elif age == 18:
        return [0, 1, 0, 0, 0, 0, 0]
    elif age == 25:
        return [0, 0, 1, 0, 0, 0, 0]
    elif age == 35:
        return [0, 0, 0, 1, 0, 0, 0]
    elif age == 45:
        return [0, 0, 0, 0, 1, 0, 0]
    elif age == 50:
        return [0, 0, 0, 0, 0, 1, 0]
    else:
        return [0, 0, 0, 0, 0, 0, 1]


def convert_job_to_One_Hot(job):
    jobs = [0] * 21
    jobs[job] += 1
    return jobs


user['user_age'] = user['user_age'].apply(convert_age_to_One_Hot)
user['user_job'] = user['user_job'].apply(convert_job_to_One_Hot)

In [5]:
# Movie 相关数据处理
# 电影名称索引化
args.max_length = 16
movie_title_word2id = {'pad': 0}
for i in range(len(movie['movie_title'])):
    words = movie['movie_title'][i].split(' ')
    del words[-1]  # 去除年份
    movie_title_id = []
    for word in words:
        if word not in movie_title_word2id:
            movie_title_word2id[word] = len(movie_title_word2id)
        movie_title_id.append(movie_title_word2id[word])
    movie_title_id.extend([0] * (args.max_length - len(words)))  # 填充
    movie['movie_title'].loc[i] = movie_title_id
args.vocabulary_size = len(movie_title_word2id)
# 电影类型索引化
movie_types = ['Action', 'Adventure', 'Animation', "Children's", 'Comedy',
               'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
               'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
               'Thriller', 'War', 'Western']
for i in range(len(movie['movie_type'])):
    types = movie['movie_type'][i].split('|')
    type_id = []
    for j in range(len(movie_types)):
        if movie_types[j] in types:
            type_id.append(1)
        else:
            type_id.append(0)
    movie['movie_type'].loc[i] = type_id

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [6]:
# 融合三个表格
tmp = pd.merge(rating, user)
data = pd.merge(tmp, movie)

In [7]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, random_state=2021, test_size=10000)

In [8]:
movies_train = {}
users_train = {}
for i in range(len(train)):
    sample = train.iloc[i]
    if sample['user_id'] not in users_train.keys():
        users_train[sample['user_id']] = {'uid': sample['user_id'], 'gender': sample['user_gender'],
                                    'age': sample['user_age'], 'job': sample['user_job']}
    if sample['movie_id'] not in movies_train.keys():
        movies_train[sample['movie_id']] = {'mid': sample['movie_id'], 'mtype': sample['movie_type'],
                                      'mtext': sample['movie_title']}


In [9]:
users_train = dict(sorted(users_train.items(), key=lambda ele: ele[0], reverse=False))
movies_train = dict(sorted(movies_train.items(), key=lambda ele: ele[0], reverse=False))
user_index_to_uid = list(users_train.keys())
movie_index_to_mid = list(movies_train.keys())

pkl.dump(user_index_to_uid, open(args.path + 'user_index_to_uid.pkl', 'wb'))
pkl.dump(movie_index_to_mid, open(args.path + 'movie_index_to_mid.pkl', 'wb'))
pkl.dump(train, open(args.path + 'train.pkl', 'wb'))
pkl.dump(test, open(args.path + 'test.pkl', 'wb'))

In [10]:
choice_matrix = pd.DataFrame(np.zeros([6040, 3706], dtype=float))
rank_matrix_initial = pd.DataFrame(np.zeros([6040, 3706], dtype=float))

for i in range(len(train)):
    sample = train.iloc[i]
    uid = sample['user_id']
    mid = sample['movie_id']
    rank = sample['rank']
    user_index = user_index_to_uid.index(uid)
    movie_index = movie_index_to_mid.index(mid)
    choice_matrix[movie_index][user_index] = 1
    rank_matrix_initial[movie_index][user_index] = rank

pkl.dump(choice_matrix, open(args.path + 'choice_matrix.pkl', 'wb'))
pkl.dump(rank_matrix_initial, open(args.path + 'rank_matrix_initial.pkl', 'wb'))