In [1]:
import os
import scipy.sparse as sp
import pandas as pd
import numpy as np

In [58]:
class Dataset(object):
    """Loader for a rating dataset stored as three tab-separated files.

    Given a common path prefix, the constructor reads
    ``<prefix>.train.rating``, ``<prefix>.test.rating`` and
    ``<prefix>.test.negative`` and exposes the results as:

    * ``trainMatrix``   -- sparse DOK matrix with 1.0 at every (userid, itemid)
      pair found in the training file.
    * ``testRatings``   -- list of ``(userid, itemid)`` tuples from the test file.
    * ``testNegatives`` -- list of rows, one per line of the negative file.
    * ``num_users`` / ``num_items`` -- the two dimensions of ``trainMatrix``.
    """

    def __init__(self, path):
        """Read the three files that share the prefix ``path``.

        Raises:
            AssertionError: if the test-rating and test-negative files do not
                contain the same number of rows.
        """
        train_file = path + ".train.rating"
        test_file = path + ".test.rating"
        negative_file = path + ".test.negative"

        self.trainMatrix = self.load_rating_file_as_matrix(train_file)
        self.testRatings = self.load_rating_file_as_list(test_file)
        self.testNegatives = self.load_negative_file(negative_file)
        # Row k of the negatives is later paired with row k of the test
        # ratings, so the two lists must line up one-to-one.
        assert len(self.testRatings) == len(self.testNegatives)

        shape = self.trainMatrix.shape
        self.num_users = shape[0]
        self.num_items = shape[1]

    def load_rating_file_as_list(self, filename):
        """Return the ``userid``/``itemid`` columns as a list of 2-tuples.

        ``filename`` is read as a tab-separated file with a header row; the
        tuples keep the row order of the file.
        """
        ratings = pd.read_csv(filename, sep="\t")
        user_ids = ratings["userid"].tolist()
        item_ids = ratings["itemid"].tolist()
        return [(user_id, item_id) for user_id, item_id in zip(user_ids, item_ids)]

    def load_negative_file(self, filename):
        """Return every column except the first as a list of row lists.

        ``filename`` is read as a tab-separated file with a header row; the
        leading column is dropped and the remaining cells of each row are
        returned in file order.
        """
        table = pd.read_csv(filename, sep="\t")
        without_first_column = table.iloc[:, 1:]
        return without_first_column.values.tolist()

    def load_rating_file_as_matrix(self, filename):
        """Build a float32 DOK matrix marking every observed interaction.

        The matrix has shape ``(max userid + 1, max itemid + 1)`` and holds
        1.0 at each ``(userid, itemid)`` pair listed in ``filename`` (a
        tab-separated file with a header row). The rating value itself is
        not used.
        """
        ratings = pd.read_csv(filename, sep="\t")
        n_rows = ratings["userid"].max() + 1
        n_cols = ratings["itemid"].max() + 1
        matrix = sp.dok_matrix((n_rows, n_cols), dtype=np.float32)
        # Entries are written one at a time, in file order, so the key order
        # of the resulting matrix follows the order of the rows in the file.
        for row, col in ratings[["userid", "itemid"]].values.tolist():
            matrix[row, col] = 1.0
        return matrix

In [59]:
# Path prefix shared by the three ml-1m files read by Dataset.
datadir, dataname = "Data_Javier", "ml-1m"
dataset = Dataset(os.path.join(datadir, dataname))

# Unpack the loaded pieces into the short names used by the cells below.
train = dataset.trainMatrix
testRatings = dataset.testRatings
testNegatives = dataset.testNegatives

In [60]:
# Dimensions of the training interaction matrix: (max userid + 1, max itemid + 1).
train.shape

(6040, 3706)

In [61]:
# First ten (userid, itemid) pairs read from the test-rating file.
testRatings[:10]

[(0, 47),
 (1, 1737),
 (2, 1900),
 (3, 1774),
 (4, 279),
 (5, 583),
 (6, 2891),
 (7, 3033),
 (8, 2102),
 (9, 2062)]

In [62]:
# Row and column counts of the training matrix; n_items is the upper bound
# used when sampling negative item ids further down.
n_users, n_items = train.shape

In [63]:
# Show the two counts unpacked from train.shape.
n_users, n_items

(6040, 3706)

In [74]:
# Peek at the raw training ratings. The path is derived from datadir/dataname
# (rather than repeated as a literal) so it cannot drift from the files that
# Dataset loads.
filename = os.path.join(datadir, dataname + ".train.rating")
df = pd.read_csv(filename, sep="\t")
df.head()

Unnamed: 0,userid,itemid,rating,timestamp
0,0,2969,4,978300019
1,0,1178,5,978300055
2,0,1574,4,978300055
3,0,957,5,978300055
4,0,2147,3,978300103


In [78]:
# Keep only the training rows whose userid is 0, then preview them.
u_0 = df.loc[df["userid"].eq(0)]
u_0.head()

Unnamed: 0,userid,itemid,rating,timestamp
0,0,2969,4,978300019
1,0,1178,5,978300055
2,0,1574,4,978300055
3,0,957,5,978300055
4,0,2147,3,978300103


In [82]:
# Item ids that user 0 interacted with in the training file, as a plain list.
u0_items = u_0["itemid"].tolist()

In [87]:
# Display every training item id collected for user 0.
u0_items

[2969,
 1178,
 1574,
 957,
 2147,
 1658,
 3177,
 2599,
 1117,
 1104,
 689,
 253,
 858,
 593,
 2488,
 1781,
 1848,
 2889,
 877,
 970,
 1782,
 1838,
 144,
 963,
 1025,
 853,
 1195,
 2592,
 2557,
 1154,
 639,
 2710,
 517,
 2898,
 2586,
 2128,
 964,
 1107,
 580,
 2205,
 1421,
 513,
 581,
 2483,
 708,
 574,
 0,
 2162,
 2102,
 740,
 1439,
 1727]

In [88]:
# Number of training rows found for user 0.
len(u0_items)

52

In [83]:
# First ten training item ids of user 0.
u0_items[:10]

[2969, 1178, 1574, 957, 2147, 1658, 3177, 2599, 1117, 1104]

In [85]:
# Sanity check: 3184 is the first item (item_n0) listed for user 0 in the
# test-negative file, so it should not appear among user 0's training items.
3184 in u0_items

False

In [86]:
# Same check for 3478, the second item (item_n1) listed for user 0 in the
# test-negative file.
3478 in u0_items

False

In [69]:
# Peek at the raw test-negative file. The path is derived from
# datadir/dataname (rather than repeated as a literal) so it cannot drift from
# the files that Dataset loads. Note that `df` now refers to this table, no
# longer to the training ratings.
filename = os.path.join(datadir, dataname + ".test.negative")
df = pd.read_csv(filename, sep="\t")
df.head()

Unnamed: 0,positive,item_n0,item_n1,item_n2,item_n3,item_n4,item_n5,item_n6,item_n7,item_n8,...,item_n89,item_n90,item_n91,item_n92,item_n93,item_n94,item_n95,item_n96,item_n97,item_n98
0,"(0, 47)",3184,3478,1694,3377,340,2111,2833,382,2152,...,2486,428,416,799,2623,3346,956,294,2309,1867
1,"(1, 1737)",683,15,369,1630,777,899,2465,2906,2175,...,235,3515,3255,1878,2017,144,2000,2894,27,1146
2,"(2, 1900)",478,3184,2477,1061,2196,416,976,1623,3263,...,1492,1994,2294,3431,2735,2996,3442,1587,277,2588
3,"(3, 1774)",1085,2309,1203,3366,3661,3241,2349,2363,2903,...,1768,752,2217,1222,86,1469,3074,479,838,1824
4,"(4, 279)",476,1002,596,2478,2382,2791,704,1097,624,...,1527,78,958,2780,3703,3261,3343,1358,3610,2382


In [71]:
# Drop the first column and turn the remaining cells of every row into a
# plain Python list (the same transformation Dataset.load_negative_file uses).
negativeList = df[df.columns[1:]].values.tolist()
negativeList[:2]

[[3184,
  3478,
  1694,
  3377,
  340,
  2111,
  2833,
  382,
  2152,
  597,
  1653,
  715,
  888,
  1662,
  141,
  1815,
  1018,
  3613,
  3563,
  2959,
  1580,
  3245,
  1164,
  3604,
  1309,
  83,
  3536,
  1234,
  1961,
  2337,
  903,
  1880,
  2730,
  1364,
  3474,
  386,
  1835,
  297,
  3239,
  3466,
  1796,
  660,
  1537,
  1337,
  3695,
  2073,
  3257,
  857,
  1351,
  2901,
  3252,
  218,
  1460,
  3394,
  627,
  3107,
  1926,
  270,
  551,
  1468,
  884,
  2314,
  2684,
  3009,
  3063,
  1909,
  3046,
  2414,
  3488,
  713,
  2423,
  2367,
  370,
  3649,
  2143,
  3301,
  826,
  3199,
  624,
  2184,
  223,
  1674,
  2463,
  419,
  2801,
  3031,
  656,
  2141,
  1258,
  2486,
  428,
  416,
  799,
  2623,
  3346,
  956,
  294,
  2309,
  1867],
 [683,
  15,
  369,
  1630,
  777,
  899,
  2465,
  2906,
  2175,
  3348,
  3106,
  601,
  403,
  3113,
  3604,
  406,
  3667,
  1676,
  1486,
  255,
  1622,
  3678,
  951,
  3452,
  281,
  547,
  2600,
  2138,
  1210,
  2227,
  1415,
  

In [64]:
# Number of negative (label 0) items sampled for each observed interaction
# when building the training instances below.
n_neg = 4

In [65]:
def get_train_instances(train, n_items, n_neg, testNegatives=None):
    """Build (user, item, label) training triples with sampled negatives.

    Every ``(u, i)`` key stored in ``train`` produces one positive triple
    (label 1) immediately followed by ``n_neg`` negative triples (label 0).
    A negative item ``j`` is drawn uniformly from ``range(n_items)`` with
    ``np.random.randint`` and redrawn until ``(u, j)`` is not a key of
    ``train`` and ``j`` is not listed in ``testNegatives[u]``.

    Args:
        train: sparse matrix whose ``keys()`` yields ``(user, item)`` pairs
            (for example a ``scipy.sparse.dok_matrix``).
        n_items: exclusive upper bound for sampled item ids.
        n_neg: number of negatives to sample per positive pair.
        testNegatives: optional sequence indexed by user id; the items in
            ``testNegatives[u]`` are never sampled as training negatives for
            ``u``. When omitted, only the pairs in ``train`` are excluded.

    Returns:
        Three equally long 1-D numpy arrays: user ids, item ids and labels.

    Raises:
        ValueError: if negatives are requested for a user that has no
            eligible item left in ``range(n_items)`` (the rejection loop
            could otherwise never terminate).
    """
    users, items, labels = [], [], []

    # Group the positive items per user once, so that each rejection test
    # below is a single set lookup.
    excluded_by_user = {}
    for u, i in train.keys():
        excluded_by_user.setdefault(u, set()).add(i)

    if n_neg > 0:
        for u, excluded in excluded_by_user.items():
            # Items reserved as test negatives for this user must not be
            # reused as training negatives.
            if testNegatives is not None:
                excluded.update(testNegatives[u])
            n_blocked = sum(1 for j in excluded if 0 <= j < n_items)
            if n_blocked >= n_items:
                raise ValueError(
                    "no item left to sample as a negative for user %s" % (u,)
                )

    for u, i in train.keys():
        # Observed interaction -> label 1.
        users.append(u)
        items.append(i)
        labels.append(1)
        # Draw random items until one is found that the user has neither
        # interacted with nor has listed among the test negatives -> label 0.
        excluded = excluded_by_user[u]
        for _ in range(n_neg):
            j = np.random.randint(n_items)
            while j in excluded:
                j = np.random.randint(n_items)
            users.append(u)
            items.append(j)
            labels.append(0)
    return np.array(users), np.array(items), np.array(labels)

In [66]:
# Build the training triples. testNegatives is passed so that items reserved
# as test negatives for a user are never sampled as training negatives.
user, item, labels = get_train_instances(train, n_items, n_neg, testNegatives)

In [67]:
# First 100 labels: each positive (1) is followed by n_neg sampled negatives (0).
labels[:100]

array([1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0])