In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
path = "/content/drive/My Drive/M2/AFMatriciel"

### **Data Compilation**

In [0]:
import sys
import os
from scipy.sparse import dok_matrix, csr_matrix
from scipy import io
import tarfile



#################################################
total_no_users = 2649429
total_no_movies = 17770

def process_content(content, D):
    lines = content.split("\n")
    id_movie = int(lines[0][:-1]) - 1
    for i in range(1, len(lines)):
        if lines[i] != '':
            line = lines[i].split(",")
            id_user = int(line[0]) - 1
            rating = int(line[1])
            D[id_user, id_movie] = rating
    return D


def rating_compiler(folder_name, out_path):
    D = dok_matrix((total_no_users, total_no_movies))
    res_listdir = os.listdir(folder_name)
    number = len(res_listdir)
    i = 0
    for f in res_listdir:
        if os.path.isfile(folder_name+f):
            print(i, " / ", number)
            myfile = open(folder_name+f)
            content = myfile.read()
            myfile.close()
            D = process_content(content, D)
        i += 1
    D = csr_matrix(D)             
    io.savemat(out_path, {'X' : D})


def rating_compiler2(tar_name, out_path):
    D = dok_matrix((total_no_users, total_no_movies))
    tar = tarfile.open(tar_name)
    res_getmembers = tar.getmembers()
    number = len(res_getmembers)
    i = 0
    for member in res_getmembers:
        f = tar.extractfile(member)
        if f is not None:    
            print(i, " / ", number)        
            content = f.read()
            f.close()
            D = process_content(content.decode(), D)
        i += 1
    tar.close()
    D = csr_matrix(D)             
    io.savemat(out_path, {'X' : D})


def extract_T_and_R(D_file_name, qualifying_file_name, out_T_path, out_R_path):
    D = io.loadmat(D_file_name)['X']
    myfile = open(qualifying_file_name)
    content = myfile.read()
    myfile.close()
    lines = content.split("\n")
    users, movies = set(), set()
    for line in lines:
        if line != '':
            line_split = line.split(",")
            if len(line_split) == 1:
                # Movie id
                movies.add(int(line_split[0][:-1]) - 1)
            else:
                # User id
                users.add(int(line_split[0]) - 1)
    T = D[list(users),:]
    T = T[:,list(movies)]    
    io.savemat(out_T_path, {'X' : T})
    
    movies2 = set(range(total_no_movies))
    movies2 = movies2.difference(movies)
    users2 = set(range(total_no_users))
    users2 = users2.difference(users)
    
    R = D[list(users2),:]
    R = R[:,list(movies2)]
    io.savemat(out_R_path, {'X' : R})
    



#################################################
if __name__ == "__main__":
    rating_compiler2(path+"/download/training_set.tar", path+"/D.mat")
    extract_T_and_R(path+"/D.mat", path+"/download/qualifying.txt", path+"/T.mat", path+"/R.mat")

### **1. Baseline Estimates**

In [3]:
from scipy import io, sparse
import numpy as np
from itertools import groupby
from operator import itemgetter

mat = io.loadmat(path+"/T.mat")['X']
# Pre-processing
print("Pre-processing...")
mat_nonzero = mat.nonzero()

print("   make bi...")
bi_index = []
for k, g in groupby(zip(mat_nonzero[0], mat_nonzero[1]), itemgetter(0)):
  to_add = list(map(lambda x:int(x[1]), list(g)))
  bi_index.append(to_add)    

print("   make bu...")
bu_index = []
indexes = np.argsort(mat_nonzero[1])
for k, g in groupby(zip(mat_nonzero[1][indexes], mat_nonzero[0][indexes]), itemgetter(0)):
  to_add = list(map(lambda x:int(x[1]), list(g)))
  bu_index.append(to_add)    

Pre-processing...
   make bi...
   make bu...


In [0]:
from scipy import io, sparse
import numpy as np
from itertools import groupby
from operator import itemgetter

#################################################
def compute_loss(mat, mu, bu, bi, l_reg=0.02):
  loss = 0

  no_users_entries = np.array((mat != 0).sum(1)).T.ravel()
  bu_rep = np.repeat(bu.ravel(), no_users_entries)

  no_movies_entries = np.array((mat != 0).sum(0)).ravel()
  bi_rep = np.repeat(bi.ravel(), no_movies_entries)

  temp_mat = sparse.csc_matrix(mat).copy()
  temp_mat.data[:] -= bi_rep
  temp_mat.data[:] -= mu
  temp_mat = sparse.coo_matrix(temp_mat)
  temp_mat = sparse.csr_matrix(temp_mat)
  temp_mat.data[:] -= bu_rep

  loss = (temp_mat.data[:] ** 2).sum()

  reg = l_reg * ((bu**2).sum() + (bi**2).sum())  
  loss += reg

  return loss

def baseline_estimator(mat_file, l_reg=0.02, learning_rate=0.000001):

  mat = io.loadmat(mat_file)['X']
  print(mat.shape)
  no_users = mat.shape[0]
  no_movies = mat.shape[1]
  
  bu = np.random.rand(no_users,1)  * 2 - 1
  bi = np.random.rand(1,no_movies) * 2 - 1
  #bu = np.zeros((no_users,1))
  #bi = np.zeros((1,no_movies))  

  mu = mat.data[:].mean()
  mat_sum1 = mat.sum(1)
  mat_sum0 = mat.sum(0)
  n = mat.data[:].shape[0]

  no_users_entries = np.array((mat != 0).sum(1))
  no_movies_entries = np.array((mat != 0).sum(0))

  # Train
  print("Train...")
  n_iter = 200
  for it in range(n_iter):

    #bi_sum = bi[bi_index].sum(1).reshape((no_users,1))
    #bu_sum = bu.ravel()[bu_index].sum(0).reshape((1,no_movies)) 

    bi_sum = np.array(list(map(lambda x:bi.ravel()[x].sum(), bi_index))).reshape((no_users,1))
    bu_sum = np.array(list(map(lambda x:bu.ravel()[x].sum(), bu_index))).reshape((1,no_movies))    

    bu_gradient = - 2.0 * (mat_sum1 - no_users_entries  * mu - no_users_entries  * bu - bi_sum) + 2.0 * l_reg * bu
    bu -= learning_rate * bu_gradient 

    bi_gradient = - 2.0 * (mat_sum0 - no_movies_entries * mu - no_movies_entries * bi - bu_sum) + 2.0 * l_reg * bi
    bi -= learning_rate * bi_gradient 

    print(it, "\ ", n_iter)    
    """print(bu.mean())    
    print(bi.mean())    
    print(bu_gradient.mean())
    print(bi_gradient.mean())"""
    if it % 10 == 0:
      """print(bu)
      print(bi)      
      print(bu_gradient)
      print(bi_gradient)"""
      print("compute loss...")
      print(compute_loss(mat, mu, bu, bi, l_reg=l_reg))


#################################################
if __name__ == "__main__":
  
  baseline_estimator(path+"/T.mat")

(478615, 17470)
Train...
0 \  200
compute loss...
175256610.21368694
1 \  200
2 \  200
3 \  200
4 \  200
5 \  200
6 \  200
7 \  200
8 \  200
9 \  200
10 \  200
compute loss...
150296200.52598408
11 \  200
12 \  200
13 \  200
14 \  200


In [11]:
from scipy import io, sparse
import numpy as np

o_mat = io.loadmat(path+"/T.mat")
mat = o_mat['X']
print(mat.shape)

temp = mat.data[:].copy()
print(mat.data[:])
print(temp)
mat.data[:] -= 5
print(mat.data[:])
print(temp)

input()

"""print(mat[:,0].nonzero())
input()

temp = np.array([[1,2,3],[4,5,5]])
temp = [[1,2,3],[4,5,6]]
bi = np.ones(mat.shape[1])
bu = np.ones(mat.shape[0])
bi[5] = 5"""



"""print(np.array(list(map(lambda x:bi[x].sum(), temp))))

input()

#print(temp.shape)
print(bi.shape)
print(bi[temp])
print(bi[temp].sum(1).reshape((2,1)))

input()"""

mat_nonzero = mat.nonzero()

from itertools import groupby
from operator import itemgetter

indexes = np.argsort(mat_nonzero[1])

uniquekeys = []
groups = []
for k, g in groupby(zip(mat_nonzero[1][indexes], mat_nonzero[0][indexes]), itemgetter(0)):
    print(list(g))
    input()
    to_add = list(map(lambda x:int(x[1]), list(g)))
    groups.append(to_add)    # Store group iterator as a list
    uniquekeys.append(k)

#groups = list(groupby(zip(mat_nonzero[0], mat_nonzero[1]), itemgetter(0)))

print(np.array(uniquekeys).shape)
print(groups[0])
print(bi[groups].sum(0))

"""mat = mat[:1500,:1000]
print(mat.shape)
print(mat.count_nonzero())
print(mat.sum())
print(mat.sum(0).shape)

bu = np.ones((mat.shape[0],1))
bi = np.ones(mat.shape[1])

print(bu.shape)
print((bu**2).sum())"""

"""no_users_entries = np.array((mat != 0).sum(1)).T.ravel()
bu_rep = np.repeat(bu, no_users_entries)

no_movies_entries = np.array((mat != 0).sum(0)).ravel()
bi_rep = np.repeat(bi, no_movies_entries)

mat = sparse.csr_matrix(mat)
mat.data[:] -= bi_rep
mat = sparse.csc_matrix(mat)
mat.data[:] -= bu_rep

print("========")
sum(mat.data[:] ** 2)"""



(478615, 17470)
[5. 3. 3. ... 3. 3. 3.]
[5. 3. 3. ... 3. 3. 3.]
[ 0. -2. -2. ... -2. -2. -2.]
[5. 3. 3. ... 3. 3. 3.]


KeyboardInterrupt: ignored

### **2. Correlation-Based Neighbourhood Model**

### **3. Correlation-Based Neighbourhood Model with Implicit Feedback**

### **4. SVD++**

### **5. Integrated Model**