In [1]:
import numpy as np
import pandas as pd
import torch 

# Data preparation
def read_data():
    r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
    df = pd.read_csv("q2.txt", sep="\t",names=r_cols,encoding="latin-1")

    # Data cleaning
    df = df.drop('timestamp', axis=1)
    return df
def get_data(split = 0.8):
    df = read_data()    
    train = df.head(int(split*df.__len__()))
    test = df.tail(int((1-split)*df.__len__()))
    return train, test

def get_matrix():
    df = read_data()
    rating_matrix = df.pivot_table(values='rating', index='user_id', columns='movie_id')
    rating_matrix = rating_matrix.fillna(0)
    R = rating_matrix
    return R

def in_matrix(user, item, matrix):
    if user+1 not in matrix:
        return False
    if item+1 not in matrix[user+1]:
        return False
    return True

def get_existence():
    result = []
    df = read_data()
    items = max(df['movie_id']) # Number of items
    users = max(df['user_id'])  # Number of users
    matrix = get_matrix()
    for u in range(users):    # items
        reviews = []
        for i in range(items):# users
            if not in_matrix(u, i, matrix):
                reviews.append(False)
                continue
            if matrix[u+1][i+1] == 0:
                reviews.append(False)
                continue
            reviews.append(True)
        result.append(reviews)
    return result

def prediction(user, item):
    return np.dot(Q[item-1], P.T[user-1])

def SE(y, y_bar):
    n = len(y) #finding total number of items in list
    summation = 0
    for i in range (0,n):  
        difference = y[i] - y_bar[i]  
        squared_difference = difference**2  
        summation = summation + squared_difference  
    SE = summation  
    return SE

def l2Norm(x):
    return np.sqrt(np.sum(np.square(x), axis=0))

def E(train):
    predictions = [prediction(d[1]['user_id'], d[1]['movie_id']) for d in train.T.iteritems()]
    cost = SE(predictions, train['rating'])
    normP = sum([l2Norm(u)**2 for u in P])
    normQ = sum([l2Norm(i)**2 for i in Q])
    normFactor = lam * (normP + normQ)
    cost += normFactor
    return cost

def single_gradient(user, item, user_tar, item_tar, exist):
    if not ((user_tar is None) ^ (item_tar is None)):
        return "Exactly one arg is needed"
    gradient = 0
    pu = P.T[user]
    qi = Q[item]
    pred = prediction(user, item)
    real = 0
    # if exist[item][user]:
    real = R[item+1][user+1]
    # else:
        # real = 0

    if user_tar is not None:
        elem = qi[user_tar]
    else:
        elem = pu[item_tar]
    gradient = 2*(real - pred) * elem
    print(real, pred, elem, gradient)
    return gradient

def user_gradient(user, feature, exist):
    result = 0
    divider = 0
    for item in range(m):
        if not exist[user][item]:
            continue
        result += single_gradient(user, item, feature, None, exist)
        divider += 1
    if divider == 0:
        return result
    return result/divider

def item_gradient(item, feature, exist):
    result = 0
    divider = 0
    for user in range(n):
        if not exist[user][item]:
            continue
        result += single_gradient(user, item, None, feature, exist)
        divider += 1
    if divider == 0:
        return result
    return result/n

def update_user(eta, exist):
    for i in range(n):
        for j in range(k):
            P[j][i] -= eta * user_gradient(i, j, exist)
        # if i%20 == 0:
            # print(i/n)

def update_item(eta, exist):
    for i in range(m):
        for j in range(k):
            Q[i][j] -= eta * item_gradient(i, j, exist)
        # if i%20 == 0:
            # print(i/m)


def learn(eta, exist):
    for i in range(iterations):
        print(i)
        print(E(train))
        update_user(eta, exist)
        update_item(eta, exist)



In [2]:
k = 20
lam = 0.1
eta = 0.1
iterations = 40
split = 1.0
df = read_data()
m = max(df['movie_id']) # Number of items
n = max(df['user_id'])  # Number of users
Q = np.random.uniform(low=0.1, high=0.9, size=(m,k)) # Item
P = np.random.uniform(low=0.1, high=0.9, size=(k,n)) # User
R = get_matrix()


# print(R[1000])
train, test = get_data(split)

exist = get_existence()


In [3]:

learn(eta, exist)

0
371608.18150952394
5.0 5.810852288429029 0.1464020443372272 -0.23742086536305762
3.0 5.068463124982097 0.1337266052025651 -0.55321710338109
3.0 5.331675090285741 0.7465108952800298 -3.481241718302706
5.0 5.970306269837204 0.7424316605371476 -1.440772190289682
3.0 6.136742373596761 0.380984430136302 -2.390100011378306
5.0 6.003529839547911 0.2623596489981077 -0.5265714729258344
5.0 5.424501158910158 0.6253919242121488 -0.530959193202222
5.0 5.7931388688034104 0.6272955946968288 -0.9950650367664109
0.0 6.538161365244378 0.26534959803217084 -3.4697969802741304
4.0 6.645384447099821 0.7304892271857922 -3.8648496807425254
4.0 5.196146923442281 0.2039769766254369 -0.4879728660871486
0.0 5.027218761448277 0.5713765814434137 -5.744870140169018
0.0 5.349558225159273 0.19162980979217076 -2.050269650318828
4.0 5.275337436411107 0.6115736674223189 -1.559925586373838
3.0 5.793725905888787 0.2666052811917934 -1.4896441614245557
3.0 6.916839402132625 0.7354871287820831 -5.761569931550111
2.0 5.6313

KeyError: 852