In [1]:
import os
import scipy.sparse as sp
import pandas as pd
import numpy as np

In [58]:
class Dataset:
    """Train/test recommendation split loaded from three tab-separated files.

    Attributes:
        trainMatrix: sparse matrix holding 1.0 at every (userid, itemid) pair
            found in the training file.
        testRatings: list of (userid, itemid) tuples read from the test file.
        testNegatives: one list per row of the negatives file, containing every
            column except the first.
        num_users, num_items: the two dimensions of ``trainMatrix``.
    """

    def __init__(self, path):
        """Load ``path`` + ``.train.rating`` / ``.test.rating`` / ``.test.negative``."""
        train_file = path + ".train.rating"
        test_file = path + ".test.rating"
        negative_file = path + ".test.negative"

        self.trainMatrix = self.load_rating_file_as_matrix(train_file)
        self.testRatings = self.load_rating_file_as_list(test_file)
        self.testNegatives = self.load_negative_file(negative_file)
        # Every test rating must come with its own row of negatives.
        assert len(self.testRatings) == len(self.testNegatives)

        n_rows, n_cols = self.trainMatrix.shape
        self.num_users, self.num_items = n_rows, n_cols

    def load_rating_file_as_list(self, filename):
        """Return the file's ``userid``/``itemid`` columns as a list of tuples."""
        ratings = pd.read_csv(filename, sep="\t")
        users = ratings["userid"].tolist()
        items = ratings["itemid"].tolist()
        return [(user, item) for user, item in zip(users, items)]

    def load_negative_file(self, filename):
        """Return each row of the file as a list, leaving out the first column."""
        negatives = pd.read_csv(filename, sep="\t")
        without_first_column = negatives.iloc[:, 1:]
        return without_first_column.values.tolist()

    def load_rating_file_as_matrix(self, filename):
        """Return a float32 DOK matrix with 1.0 at each (userid, itemid) in the file.

        The matrix is sized ``(max userid + 1, max itemid + 1)`` so that the ids
        can be used directly as row and column indices.
        """
        ratings = pd.read_csv(filename, sep="\t")
        shape = (ratings["userid"].max() + 1, ratings["itemid"].max() + 1)
        matrix = sp.dok_matrix(shape, dtype=np.float32)
        # Entries are written in file order, one (user, item) pair at a time.
        for pair in ratings[["userid", "itemid"]].values.tolist():
            user, item = pair
            matrix[user, item] = 1.
        return matrix

In [59]:
# Location of the data files: <datadir>/<dataname>.{train.rating,test.rating,test.negative}
datadir = "Data_Javier"
dataname = "ml-1m"
dataset = Dataset(os.path.join(datadir, dataname))

# Pull the three pieces out of the dataset under shorter names.
train = dataset.trainMatrix
testRatings = dataset.testRatings
testNegatives = dataset.testNegatives

In [60]:
# Dimensions of the training interaction matrix as (users, items).
train.shape

In [61]:
# Preview the first ten (userid, itemid) test pairs.
testRatings[:10]

In [62]:
# Row count and column count of the training matrix, i.e. users and items.
n_users, n_items = train.shape

In [63]:
# Display the user and item counts taken from the training matrix shape.
n_users, n_items

In [74]:
# Inspect the raw training ratings file. The path is built from datadir/dataname
# so it always points at the same files the Dataset was loaded from.
filename = os.path.join(datadir, dataname) + ".train.rating"
df = pd.read_csv(filename, sep="\t")
df.head()

In [78]:
# Keep only the rows whose userid is 0 and preview them.
u_0 = df.loc[df["userid"] == 0]
u_0.head()

In [82]:
# Item ids found in user 0's rows, as a plain Python list.
u0_items = u_0["itemid"].tolist()

In [87]:
# Display every item id collected for user 0.
# NOTE(review): this prints the whole list; a slice keeps the output short.
u0_items

In [88]:
# Number of item ids collected for user 0.
len(u0_items)

In [83]:
# First ten item ids collected for user 0.
u0_items[:10]

In [85]:
# Spot check: does item id 3184 appear among user 0's items?
3184 in u0_items

In [86]:
# Spot check: does item id 3478 appear among user 0's items?
3478 in u0_items

In [69]:
# Inspect the raw test-negatives file. The path is built from datadir/dataname
# so it always points at the same files the Dataset was loaded from.
# Note: this rebinds `df`, which no longer refers to the training ratings afterwards.
filename = os.path.join(datadir, dataname) + ".test.negative"
df = pd.read_csv(filename, sep="\t")
df.head()

In [71]:
# Drop the first column and turn every remaining row into a plain Python list,
# then preview the first two rows.
negativeList = df.iloc[:, 1:].to_numpy().tolist()
negativeList[:2]

In [64]:
# Number of negative items sampled for each positive interaction.
n_neg = 4

In [65]:
def get_train_instances(train, n_items, n_neg, testNegatives):
    """Build (user, item, label) training triples with random negative sampling.

    Every key ``(u, i)`` of ``train`` is emitted once with label 1, followed by
    ``n_neg`` items drawn uniformly from ``range(n_items)`` with label 0. A draw
    is rejected and repeated while the item is one the user already has in
    ``train`` or is listed in ``testNegatives[u]``.

    Sampling uses the global ``np.random`` state; call ``np.random.seed``
    beforehand for reproducible output.

    Args:
        train: sparse interaction matrix whose ``keys()`` yields
            ``(user, item)`` pairs (e.g. a ``scipy.sparse.dok_matrix``).
        n_items: exclusive upper bound for sampled item ids.
        n_neg: number of negatives to draw per positive pair.
        testNegatives: sequence indexed by user id; ``testNegatives[u]`` holds
            the item ids that must not be sampled for user ``u``.

    Returns:
        Three NumPy arrays of equal length: user ids, item ids and 0/1 labels.

    Raises:
        ValueError: if negatives are requested for a user for whom every id in
            ``range(n_items)`` is excluded, since the rejection loop could
            never terminate.
    """
    # Group the observed items per user once, so each rejection test below is a
    # single set lookup instead of a matrix-key lookup plus a linear list scan.
    seen_by_user = {}
    for u, i in train.keys():
        seen_by_user.setdefault(u, set()).add(i)

    blocked_by_user = {}
    user, item, labels = [], [], []
    for (u, i) in train.keys():
        # Observed interaction: label 1.
        user.append(u)
        item.append(i)
        labels.append(1)

        if n_neg <= 0:
            continue

        blocked = blocked_by_user.get(u)
        if blocked is None:
            # Items the user interacted with, plus the items held out as this
            # user's test negatives, may not be used as training negatives.
            blocked = seen_by_user[u] | set(testNegatives[u])
            if len(blocked) >= n_items and blocked.issuperset(range(n_items)):
                raise ValueError(
                    "no item left to sample as a negative for user %r" % (u,)
                )
            blocked_by_user[u] = blocked

        # Draw random items until one is found that is not blocked: label 0.
        for _ in range(n_neg):
            j = np.random.randint(n_items)
            while j in blocked:
                j = np.random.randint(n_items)
            user.append(u)
            item.append(j)
            labels.append(0)
    return np.array(user), np.array(item), np.array(labels)

In [66]:
# get_train_instances requires the per-user test negatives as its fourth
# argument so that they are never sampled as training negatives.
user, item, labels = get_train_instances(train, n_items, n_neg, testNegatives)

In [67]:
# Preview the first 100 labels (1 = observed interaction, 0 = sampled negative).
labels[:100]