In [1]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

In [2]:
# Load the three raw tables; the ratings table is used without its TimeStamp column.
user_df = pd.read_csv("C:\\Users\\maoru\\Desktop\\Yao Wang\\data\\users.csv")
movie_df = pd.read_csv("C:\\Users\\maoru\\Desktop\\Yao Wang\\data\\movies.csv")
ratings_raw = pd.read_csv("C:\\Users\\maoru\\Desktop\\Yao Wang\\data\\ratings.csv")
rating_df = ratings_raw.drop(columns="TimeStamp")
rating_df.head()

Unnamed: 0,UserID,MovieID,Rating,Month
0,1,1193,5,9
1,1,661,3,9
2,1,914,3,9
3,1,3408,4,9
4,1,2355,5,9


# Preprocess users' side information

In [3]:
# Peek at the first rows of the user table before any preprocessing.
user_df.head()

Unnamed: 0,UserID,Gender,Age,0,1,2,3,4,5,6,...,11,12,13,14,15,16,17,18,19,20
0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,56,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,1,25,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,1,45,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1,25,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
# (largest UserID, number of distinct UserIDs) in the user table.
user_ids = user_df["UserID"]
int(user_ids.max()), user_ids.nunique()

(6040, 6040)

In [5]:
# (largest UserID, number of distinct UserIDs) among the ratings.
rated_user_ids = rating_df["UserID"]
int(rated_user_ids.max()), rated_user_ids.nunique()

(6040, 6040)

rating与user中包含的UserID**一致且连续**。

In [6]:
# Min-max scale Age so the smallest age maps to 0 and the largest to 1.
age = user_df["Age"]
age_min, age_max = age.min(), age.max()
user_df["Age"] = (age - age_min) / (age_max - age_min)
user_df.head()

Unnamed: 0,UserID,Gender,Age,0,1,2,3,4,5,6,...,11,12,13,14,15,16,17,18,19,20
0,1,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,1,0.436364,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,1,0.8,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1,0.436364,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
# Order users by UserID so that row k of the matrix belongs to the k-th smallest id,
# then drop the first column by position and convert the rest to a NumPy array.
# NOTE(review): which column sits at position 0 depends on how users.csv was read;
# the item-side cell drops two leading columns instead of one, so confirm that
# UserID is not being kept here as a feature.
users_sorted = user_df.sort_values(by="UserID")
user_matrix = users_sorted.iloc[:, 1:].to_numpy()
user_matrix.shape

(6040, 23)

In [8]:
# Persist the user side-information matrix as a comma-separated text file.
np.savetxt(
    fname="C:\\Users\\maoru\\Desktop\\Yao Wang\\data\\user_matrix.csv",
    X=user_matrix,
    delimiter=",",
)

# Preprocess items' side information

In [9]:
# (largest MovieID, number of distinct MovieIDs) in the movie table.
movie_ids = movie_df["MovieID"]
int(movie_ids.max()), movie_ids.nunique()

(3952, 3883)

In [10]:
# (largest MovieID, number of distinct MovieIDs) among the ratings.
rated_movie_ids = rating_df["MovieID"]
int(rated_movie_ids.max()), rated_movie_ids.nunique()

(3952, 3706)

In [None]:
# Sanity check: report every rated MovieID that has no row in the movie table.
# The lookup set is built once; rebuilding it inside the loop made the check
# quadratic in the number of movies.
known_movie_ids = set(movie_df["MovieID"])
for movie_id in set(rating_df["MovieID"]):
    if movie_id not in known_movie_ids:
        print("Missing!")

rating与movie中包含的MovieID不一致且不连续。具体而言，rating中出现的movie更少，且rating中出现的所有movie都能在movies.csv中找到对应的信息。因此，**只需要考虑rating中出现的movie即可**，且为了存储方便，应该建立起原始MovieID与存储索引之间的映射关系。

In [None]:
# Map every MovieID that occurs in the ratings to its rank in ascending id order
# (movie_id -> order), which yields gap-free 0-based indices.
id_list = sorted(set(rating_df["MovieID"]))
id2order_dict = {id_: i for i, id_ in enumerate(id_list)}

In [None]:
# Preview a few (MovieID -> order) pairs instead of dumping the whole mapping
# into the notebook output.
dict(list(id2order_dict.items())[:10])

In [None]:
# Keep only the movies that occur at least once in the rating data.
rated_movie_ids = set(rating_df["MovieID"])
movie_df = movie_df[movie_df["MovieID"].isin(rated_movie_ids)]
movie_df.shape

In [None]:
# Order movies by MovieID so that row k belongs to the k-th smallest id, then drop
# the first two columns by position and convert the rest to a NumPy array.
# NOTE(review): presumably the two dropped columns are a saved index and MovieID;
# verify against movies.csv.
movies_sorted = movie_df.sort_values(by="MovieID")
item_matrix = movies_sorted.iloc[:, 2:].to_numpy()
item_matrix.shape

In [None]:
# Save the item side-information ndarray as a comma-separated .csv file.
np.savetxt("C:\\Users\\maoru\\Desktop\\Yao Wang\\data\\item_matrix.csv", item_matrix, delimiter=",")

# Convert into a rating tensor

In [None]:
# tmp = rating_df[rating_df["UserID"] <= 10]
# tmp.sort_values(["Month", "UserID", "MovieID"])
# tmp.sort_values(["Month", ]).\
#     groupby(["Month", "UserID"])["Rating"].apply(np.array)

In [None]:
# user_list = list(set(rating_df["UserID"]))
# movie_list = list(set(rating_df["MovieID"]))
# month_list = list(set(rating_df["Month"]))

# tmp_df = pd.DataFrame(columns=["UserID", "MovieID", "Month"])
# for month in tqdm(month_list):
#     for user in user_list[:1]:
#         for movie in movie_list:
#             row = pd.DataFrame([dict(UserID=user, MovieID=movie,
#                                     Month=month)])
#             tmp_df.append(row, ignore_index=True)

In [3]:
# Distinct users, movies and months that occur in the ratings. They are sorted
# because later cells slice these lists by position, and set iteration order is
# not guaranteed.
user_list = sorted(set(rating_df["UserID"]))
movie_list = sorted(set(rating_df["MovieID"]))
month_list = sorted(set(rating_df["Month"]))
BATCH_SIZE = 3      # number of months per batch (batches are split along time)
# rating dataframe restricted to the corresponding time window
# batch_rating_df = rating_df[rating_df["Month"] < BATCH_SIZE]

In [None]:
# Build one dense (month, user, movie) rating tensor per batch of BATCH_SIZE
# months and write it to disk; cells without a rating are NaN.
N_MONTHS = 36  # loop bound kept from the original code — TODO confirm against rating_df["Month"]
n_users, n_movies = len(user_list), len(movie_list)

# The user and movie frames do not depend on the batch, so build them once.
user_keys = pd.DataFrame({"UserID": user_list, "key": 1})
movie_keys = pd.DataFrame({"MovieID": movie_list, "key": 1})

for i in tqdm(range(N_MONTHS // BATCH_SIZE)):
    first_month, stop_month = i * BATCH_SIZE, (i + 1) * BATCH_SIZE
    batch_rating_df = rating_df[(rating_df["Month"] >= first_month) & (rating_df["Month"] < stop_month)]

    # Use the months of *this* batch. Always taking the first BATCH_SIZE months
    # made every later batch join against the wrong months and come out all-NaN.
    month_keys = pd.DataFrame({"Month": list(range(first_month, stop_month)), "key": 1})

    # Cartesian product of users, movies and months within the batch.
    batch_dec = user_keys.merge(movie_keys, on="key").merge(month_keys, on="key").drop("key", axis=1)

    # Right-join the observed ratings onto the full grid; missing cells become NaN.
    tmp_rating_df = batch_rating_df.merge(
        batch_dec,
        on=["UserID", "MovieID", "Month"],
        how="right",
    ).sort_values(["Month", "UserID", "MovieID"])

    # After sorting, the ratings are laid out month-major, then user, then movie,
    # so a plain reshape gives (batch_NT, N_u, N_v).
    tmp_rating_tensor = tmp_rating_df["Rating"].to_numpy().reshape((BATCH_SIZE, n_users, n_movies))

    # np.savetxt only accepts 1-D or 2-D arrays, so the months are stacked along
    # the rows; reload with .reshape(BATCH_SIZE, n_users, n_movies).
    # The rating tensor is saved here (the original wrote item_matrix by mistake).
    np.savetxt(
        "C:\\Users\\maoru\\Desktop\\Yao Wang\\data\\rating_tensor\\rating_tensor_%d.csv" % (i),
        tmp_rating_tensor.reshape((BATCH_SIZE * n_users, n_movies)),
        delimiter=",",
    )

  0%|                                                                                           | 0/12 [00:00<?, ?it/s]

In [None]:
# a = pd.DataFrame({'UserID': user_list, 
#                   'key':[1 for _ in range(len(user_list))]})
# b = pd.DataFrame({"MovieID": movie_list,\
#                   'key':[1 for _ in range(len(movie_list))]})
# c = pd.DataFrame({"Month": month_list[:BATCH_SIZE], 
#                   'key': [1 for _ in range(BATCH_SIZE)]})

# # batch上user,movie和time的笛卡尔积
# batch_dec = a.merge(b, on='key').merge(c, on="key").drop("key", axis=1)

In [None]:
# # 将两者合并，使用Nan对空缺的地方进行填充
# tmp_rating_df = batch_rating_df.merge(batch_dec, 
#                                      on=["UserID", "MovieID", "Month"],
#                                      how="right").sort_values(
#     ["Month", "UserID", "MovieID"])
# tmp_rating_df

In [None]:
# Turn the (Month, UserID, MovieID)-sorted rating frame into a dense tensor:
# one list of per-movie ratings for every (month, user) pair.
per_user_ratings = tmp_rating_df.groupby(["Month", "UserID"])["Rating"].apply(list)
tmp_rating_tensor = np.array(list(per_user_ratings))
# reshape as (batch_NT, N_u, N_v); the sizes come from the data instead of the
# hard-coded 6040 / 3706
tmp_rating_tensor = tmp_rating_tensor.reshape((BATCH_SIZE, len(user_list), len(movie_list)))

In [None]:
# Display the dense rating tensor (NaN marks cells without a rating).
tmp_rating_tensor

In [None]:
# Shape check: the tensor was reshaped to (batch_NT, N_u, N_v).
tmp_rating_tensor.shape

In [None]:
# Observation mask: 1 where a rating is present, 0 where the entry is NaN.
# It must be computed before the NaNs in the tensor are overwritten.
is_observed = ~np.isnan(tmp_rating_tensor)
tmp_rating_mask = is_observed.astype(int)
tmp_rating_mask

In [None]:
# Overwrite the missing ratings with 0 in place.
nan_positions = np.isnan(tmp_rating_tensor)
tmp_rating_tensor[nan_positions] = 0
tmp_rating_tensor

Below are some experiments I conducted using the Tucker decomposition from the tensorly package.

In [None]:
from tensorly.decomposition import _tucker

In [None]:
# Tucker decomposition with missing values: the mask passed alongside the tensor
# is 0 at the entries that had no rating.
# One rank per tensor mode, in the order (time, user, movie).
tucker_ranks = [2, 23, 17]
# G is the core tensor; factor_matrics holds the factor matrices C, U, V for
# time, user and movie respectively.
G, factor_matrics = _tucker.tucker(
    tensor=tmp_rating_tensor,
    rank=tucker_ranks,
    mask=tmp_rating_mask,
)

In [None]:
# Shape of the Tucker core tensor.
G.shape

In [None]:
# Unpack the factor matrices for the time, user and movie modes and show their shapes.
C, U, V = factor_matrics
tuple(factor.shape for factor in (C, U, V))

In [None]:
# Display the factor matrix of the time mode.
C