In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
path = "/content/drive/My Drive/M2/AFMatriciel"

### **Data Compilation**

In [0]:
import sys
import os
from scipy.sparse import dok_matrix, csr_matrix
from scipy import io
import tarfile



#################################################
total_no_users = 2649429
total_no_movies = 17770

def process_content(content, D):
    lines = content.split("\n")
    id_movie = int(lines[0][:-1]) - 1
    for i in range(1, len(lines)):
        if lines[i] != '':
            line = lines[i].split(",")
            id_user = int(line[0]) - 1
            rating = int(line[1])
            D[id_user, id_movie] = rating
    return D


def rating_compiler(folder_name, out_path):
    D = dok_matrix((total_no_users, total_no_movies))
    res_listdir = os.listdir(folder_name)
    number = len(res_listdir)
    i = 0
    for f in res_listdir:
        if os.path.isfile(folder_name+f):
            print(i, " / ", number)
            myfile = open(folder_name+f)
            content = myfile.read()
            myfile.close()
            D = process_content(content, D)
        i += 1
    D = csr_matrix(D)             
    io.savemat(out_path, {'X' : D})


def rating_compiler2(tar_name, out_path):
    D = dok_matrix((total_no_users, total_no_movies))
    tar = tarfile.open(tar_name)
    res_getmembers = tar.getmembers()
    number = len(res_getmembers)
    i = 0
    for member in res_getmembers:
        f = tar.extractfile(member)
        if f is not None:    
            print(i, " / ", number)        
            content = f.read()
            f.close()
            D = process_content(content.decode(), D)
        i += 1
    tar.close()
    D = csr_matrix(D)             
    io.savemat(out_path, {'X' : D})


def extract_T_and_R(D_file_name, qualifying_file_name, out_T_path, out_R_path):
    D = io.loadmat(D_file_name)['X']
    myfile = open(qualifying_file_name)
    content = myfile.read()
    myfile.close()
    lines = content.split("\n")
    users, movies = set(), set()
    for line in lines:
        if line != '':
            line_split = line.split(",")
            if len(line_split) == 1:
                # Movie id
                movies.add(int(line_split[0][:-1]) - 1)
            else:
                # User id
                users.add(int(line_split[0]) - 1)
    T = D[list(users),:]
    T = T[:,list(movies)]    
    io.savemat(out_T_path, {'X' : T})
    
    movies2 = set(range(total_no_movies))
    movies2 = movies2.difference(movies)
    users2 = set(range(total_no_users))
    users2 = users2.difference(users)
    
    R = D[list(users2),:]
    R = R[:,list(movies2)]
    io.savemat(out_R_path, {'X' : R})
    



#################################################
if __name__ == "__main__":
    rating_compiler2(path+"/download/training_set.tar", path+"/D.mat")
    extract_T_and_R(path+"/D.mat", path+"/download/qualifying.txt", path+"/T.mat", path+"/R.mat")

### **1. Baseline Estimates**

In [60]:
from scipy import io, sparse
import numpy as np


#################################################
def compute_loss(mat, mu, bu, bi, l_reg=0.02):

  no_users_entries = np.array((mat != 0).sum(1)).T.ravel()
  bu_rep = np.repeat(bu, no_users_entries)

  no_movies_entries = np.array((mat != 0).sum(0)).ravel()
  bi_rep = np.repeat(bi, no_movies_entries)

  mat = sparse.csr_matrix(mat)
  mat.data[:] -= bi_rep
  mat = sparse.csc_matrix(mat)
  mat.data[:] -= bu_rep
  mat.data[:] -= mu

  loss = sum(mat.data[:] ** 2)

  reg = l_reg * (sum(bu**2) + sum(bi**2))  
  loss += reg
  return loss

def baseline_estimator(mat_file, l_reg=0.02):

  mat = io.loadmat(mat_file)['X']
  print(mat.shape)
  no_users = mat.shape[0]
  no_movies = mat.shape[1]
  bu = np.zeros(no_users)
  bi = np.zeros(no_movies)
  mu = mat.mean()

  print("compute loss...")
  loss = compute_loss(mat, mu, bu, bi, l_reg)
  print(loss)

  """n_iter = 200
  for it in range(n_iter):
    print(it)
    bu -= - 2 * """


#################################################
if __name__ == "__main__":
  
  baseline_estimator(path+"/T.mat")

(478615, 17470)
compute loss...
1392049133.237792


In [59]:
from scipy import io, sparse
import numpy as np

mat = io.loadmat(path+"/T.mat")['X']
print(mat.shape)

mat = mat[:1500,:1000]
print(mat.shape)
print(mat.count_nonzero())
print(mat.sum())

bu = np.ones(mat.shape[0])
bi = np.ones(mat.shape[1])

no_users_entries = np.array((mat != 0).sum(1)).T.ravel()
bu_rep = np.repeat(bu, no_users_entries)

no_movies_entries = np.array((mat != 0).sum(0)).ravel()
bi_rep = np.repeat(bi, no_movies_entries)

mat = sparse.csr_matrix(mat)
mat.data[:] -= bi_rep
mat = sparse.csc_matrix(mat)
mat.data[:] -= bu_rep

print("========")
sum(mat.data[:] ** 2)

(478615, 17470)
(1500, 1000)
15409
55043.0


55513.0

### **2. Correlation-Based Neighbourhood Model**

### **3. Correlation-Based Neighbourhood Model with Implicit Feedback**

### **4. SVD++**

### **5. Integrated Model**