# Setting up Code

In [1]:
import csv
import numpy
import matplotlib.pyplot as plt
from numpy import genfromtxt
import math
from collections import defaultdict as defaultdict
from tqdm import tqdm_notebook as tqdm
from sklearn.neighbors import NearestNeighbors

In [2]:
# m_r: movie id -> list of [user id, rating, row[3]] entries, one per rating row
# (row[3] is presumably the timestamp column of u.data -- TODO confirm).
m_r = {}
# u_d: user id -> {movie id: rating}; all keys and values stay as strings.
u_d = defaultdict(dict)

In [3]:
# Read every rating row once and index it two ways: per movie (m_r) and
# per user (u_d). Values are kept as the raw strings from the file.
with open('ml-100k/u.data') as csvfile:
    u_data = csv.reader(csvfile, delimiter='\t')
    for row in u_data:
        # setdefault creates the per-movie list the first time a movie is seen.
        m_r.setdefault(row[1], []).append([row[0], row[2], row[3]])
        u_d[row[0]][row[1]] = row[2]

In [4]:
# Default value written into every cell of the training matrices before the
# known ratings are filled in (see the numpy.full calls below).
fill = 4

In [5]:
# Fold 1. Rows are users and columns are movies, both shifted to 0-based.
# The training matrix starts at `fill` everywhere; the test matrix starts at 0.
test_mat1 = numpy.zeros((len(u_d), len(m_r)))
sk_mat1 = numpy.full(test_mat1.shape, fill)

with open('ml-100k/u1.base') as csvfile:
    u1_base = csv.reader(csvfile, delimiter='\t')
    for row in u1_base:
        u_idx, m_idx = int(row[0]) - 1, int(row[1]) - 1
        sk_mat1[u_idx, m_idx] = row[2]

with open('ml-100k/u1.test') as csvfile:
    u1_test = csv.reader(csvfile, delimiter='\t')
    for row in u1_test:
        u_idx, m_idx = int(row[0]) - 1, int(row[1]) - 1
        test_mat1[u_idx, m_idx] = row[2]

In [6]:
# Fold 2. Rows are users and columns are movies, both shifted to 0-based.
# The training matrix starts at `fill` everywhere; the test matrix starts at 0.
test_mat2 = numpy.zeros((len(u_d), len(m_r)))
sk_mat2 = numpy.full(test_mat2.shape, fill)

with open('ml-100k/u2.base') as csvfile:
    u2_base = csv.reader(csvfile, delimiter='\t')
    for row in u2_base:
        u_idx, m_idx = int(row[0]) - 1, int(row[1]) - 1
        sk_mat2[u_idx, m_idx] = row[2]

with open('ml-100k/u2.test') as csvfile:
    u2_test = csv.reader(csvfile, delimiter='\t')
    for row in u2_test:
        u_idx, m_idx = int(row[0]) - 1, int(row[1]) - 1
        test_mat2[u_idx, m_idx] = row[2]

In [7]:
# Fold 3. Rows are users and columns are movies, both shifted to 0-based.
# The training matrix starts at `fill` everywhere; the test matrix starts at 0.
test_mat3 = numpy.zeros((len(u_d), len(m_r)))
sk_mat3 = numpy.full(test_mat3.shape, fill)

with open('ml-100k/u3.base') as csvfile:
    u3_base = csv.reader(csvfile, delimiter='\t')
    for row in u3_base:
        u_idx, m_idx = int(row[0]) - 1, int(row[1]) - 1
        sk_mat3[u_idx, m_idx] = row[2]

with open('ml-100k/u3.test') as csvfile:
    u3_test = csv.reader(csvfile, delimiter='\t')
    for row in u3_test:
        u_idx, m_idx = int(row[0]) - 1, int(row[1]) - 1
        test_mat3[u_idx, m_idx] = row[2]

In [8]:
# Fold 4. Rows are users and columns are movies, both shifted to 0-based.
# The training matrix starts at `fill` everywhere; the test matrix starts at 0.
test_mat4 = numpy.zeros((len(u_d), len(m_r)))
sk_mat4 = numpy.full(test_mat4.shape, fill)

with open('ml-100k/u4.base') as csvfile:
    u4_base = csv.reader(csvfile, delimiter='\t')
    for row in u4_base:
        u_idx, m_idx = int(row[0]) - 1, int(row[1]) - 1
        sk_mat4[u_idx, m_idx] = row[2]

with open('ml-100k/u4.test') as csvfile:
    u4_test = csv.reader(csvfile, delimiter='\t')
    for row in u4_test:
        u_idx, m_idx = int(row[0]) - 1, int(row[1]) - 1
        test_mat4[u_idx, m_idx] = row[2]

In [9]:
# Fold 5. Rows are users and columns are movies, both shifted to 0-based.
# The training matrix starts at `fill` everywhere; the test matrix starts at 0.
test_mat5 = numpy.zeros((len(u_d), len(m_r)))
sk_mat5 = numpy.full(test_mat5.shape, fill)

with open('ml-100k/u5.base') as csvfile:
    u5_base = csv.reader(csvfile, delimiter='\t')
    for row in u5_base:
        u_idx, m_idx = int(row[0]) - 1, int(row[1]) - 1
        sk_mat5[u_idx, m_idx] = row[2]

with open('ml-100k/u5.test') as csvfile:
    u5_test = csv.reader(csvfile, delimiter='\t')
    for row in u5_test:
        u_idx, m_idx = int(row[0]) - 1, int(row[1]) - 1
        test_mat5[u_idx, m_idx] = row[2]

In [10]:
# Plain mean rating of each movie, keyed by movie id.
first_avg = {
    movie_id: sum(int(entry[1]) for entry in entries) / len(entries)
    for movie_id, entries in m_r.items()
}

# Mean of the per-movie means (each movie counts once, however many ratings it has).
avg_total = sum(first_avg.values()) / len(first_avg)
print(avg_total)

3.0760445083251815


In [11]:
# Weighted (shrunk) rating per movie, using the iMDB weighted rating formula:
#   WR = (v / (v + m)) * R + (m / (v + m)) * C        -- 03/12/2019
#   https://help.imdb.com/article/imdb/track-movies-tv/ratings-faq/G67Y87TFYYP6TWAV#
# v = number of ratings for the movie, m = thresh, R = the movie's mean rating,
# C = avg_total. Each value is a one-element list [weighted rating].
m_r_avg = {}
for movie_id, entries in m_r.items():
    row = numpy.array(entries).astype(int)
    v = len(row)
    thresh = 6
    movie_mean = sum(row[:,1])/v
    weighted = (v/(v+thresh))*movie_mean + (thresh/(v+thresh)*avg_total)
    m_r_avg[movie_id] = [weighted]

In [12]:
def knn(k, mat):
    """Find the k nearest rows for every row of `mat` using Canberra distance.

    k   -- number of neighbours to return per row.
    mat -- 2-D array; each row is treated as one point.

    Returns the `(distances, indices)` pair produced by
    `NearestNeighbors.kneighbors()`. Because no query is passed, the
    neighbours of the fitted rows themselves are returned and a row is not
    reported as its own neighbour.
    """
    # n_neighbors must be passed by keyword: scikit-learn 1.0+ made the
    # constructor arguments keyword-only, so the positional form raises TypeError.
    func = NearestNeighbors(n_neighbors=k, metric='canberra')
    func.fit(mat)
    return func.kneighbors()

In [13]:
# (distances, indices) of the 100 nearest neighbours of every row of the fold-1 training matrix.
knn_mat = knn(100, sk_mat1)

In [14]:
def weight_collapse(sk_mat, knn_mat):
    """Predict every rating as an inverse-distance weighted mean over neighbours.

    sk_mat  -- 2-D rating matrix (rows indexed by the neighbour indices).
    knn_mat -- pair `(distances, indices)`, each of shape (rows, k), as
               returned by `knn`.

    Returns a (rows, columns) array whose row `a` is the weighted average of
    the `sk_mat` rows listed in `indices[a]`, with weights proportional to
    1/distance and normalised to sum to 1 per row.
    """
    distances = numpy.asarray(knn_mat[0], dtype=float)
    indices = knn_mat[1]

    # A neighbour at distance 0 (identical row) would give 1/0 = inf and then
    # inf/inf = nan after normalisation. Clamping to machine epsilon lets such
    # a neighbour dominate the average instead of poisoning it.
    distances = numpy.maximum(distances, numpy.finfo(float).eps)

    inv_dist = 1/distances
    inv_dist = inv_dist/inv_dist.sum(axis=1, keepdims=True)

    # ul_mat[a, b, c]: rating of item c by the b-th neighbour of row a.
    ul_mat = sk_mat[indices]
    u_r = numpy.einsum("abc,ab->ac", ul_mat, inv_dist)
    return u_r

In [15]:
# Predicted rating matrix for fold 1, built from the neighbours found above.
u_r = weight_collapse(sk_mat1, knn_mat)

In [16]:
m_names = {}
# The MovieLens 100k u.item file is Latin-1 (ISO-8859-1) encoded, not UTF-8;
# without an explicit encoding the read fails wherever the platform default is UTF-8.
with open('ml-100k/u.item', encoding='latin-1') as csvfile:
    u_item = csv.reader(csvfile, delimiter='|')

    for row in u_item:
        # row[0] is the movie id, row[1] its title.
        m_names[row[0]] = row[1]
        # Store the remaining columns (row[5:], read later as per-genre 0/1 flags)
        # right after the weighted rating. Slice assignment instead of append
        # keeps the entry as [rating, flags] when this cell is re-run.
        m_r_avg[row[0]][1:] = [row[5:]]
        

In [17]:
def rec_mov(user_id, amount):
    """Print the `amount` highest predicted ratings for a 1-based `user_id`.

    Reads the module-level prediction matrix `u_r` and prints one
    "title score" line per movie, best first.
    """
    predictions = u_r[user_id-1]
    ascending = predictions.argsort()
    # Take the tail of the ascending order and walk it backwards.
    for col in ascending[-amount:][::-1]:
        title = m_names[str(col+1)]  # m_names is keyed by 1-based id strings
        print(title, predictions[col])

In [18]:
# Top 10 predicted movies for user 5.
rec_mov(5, 10)

Star Wars (1977) 4.0703392900851565
Pulp Fiction (1994) 4.069134922451312
Titanic (1997) 4.0497978729788775
Casablanca (1942) 4.049735966606287
English Patient, The (1996) 4.041700101545392
Close Shave, A (1995) 4.040534570044467
Toy Story (1995) 4.040118800163246
Shawshank Redemption, The (1994) 4.0399888324631075
G.I. Jane (1997) 4.039672111004958
Schindler's List (1993) 4.039582547678476


In [19]:
def mae(base, test):
    """Mean absolute error of `base` against the non-zero entries of `test`.

    base -- 2-D array of predicted ratings.
    test -- 2-D array of the same shape; 0 marks "no rating" and is skipped.

    Returns the mean of |base - test| over the entries where test != 0.
    Raises ZeroDivisionError when `test` contains no non-zero entry.
    """
    base = numpy.asarray(base)
    test = numpy.asarray(test)

    # Boolean mask replaces the element-by-element Python double loop.
    rated = test != 0
    count = int(rated.sum())
    if count == 0:
        raise ZeroDivisionError("mae: test matrix has no non-zero entries")

    total = numpy.absolute(base[rated] - test[rated]).sum()
    return float(total/count)

In [20]:
# MAE of the fold-1 predictions against the fold-1 test ratings.
mae(u_r, test_mat1)

0.9100193635521082

# With extra User Features

In [21]:
# NOTE(review): `user_info` is rebound to a plain list in the next cell, so the
# array loaded here is discarded; `occupations` is not referenced anywhere else
# in the visible notebook. Presumably genfromtxt's default float parsing turns
# the text columns of these files into nan -- confirm before relying on either.
occupations = genfromtxt('ml-100k/u.occupation', delimiter=',')
user_info = genfromtxt('ml-100k/u.user', delimiter='|')

In [22]:
# One [age, gender, occupation] list per user, in file order, values kept as
# strings. The file's trailing zip-code column is not stored.
with open('ml-100k/u.user') as csvfile:
    u_user = csv.reader(csvfile, delimiter='|')
    user_info = [[row[1], row[2], row[3]] for row in u_user]

In [23]:
def personal_score(u1, u2):
    """Score how alike two users' stored attributes are.

    u1, u2 -- 0-based positions in the module-level `user_info` list.

    One point is given when the ages differ by at most `age_of_u1 // 6 + 1`
    and one point for every other attribute that matches exactly; the point
    total is divided by 30.
    """
    first, second = user_info[u1], user_info[u2]
    n_fields = len(user_info[0])
    total = 0

    # Field 0 is the age: compare with a tolerance that grows with u1's age.
    if n_fields > 0:
        age_gap = abs(int(first[0]) - int(second[0]))
        if age_gap <= (int(first[0])//6 + 1):
            total += 1

    # Remaining fields must match exactly.
    for field in range(1, n_fields):
        if first[field] == second[field]:
            total += 1

    return total/30
        

In [24]:
def opt_weights(mat):
    """Return a copy of a KNN result with `personal_score` added to each distance.

    mat -- pair `(distances, indices)`, each of shape (rows, k), as returned
           by `knn`.

    Returns a new `(distances, indices)` tuple. The input is left untouched:
    the distance array is copied first, so calling this again on the same
    `mat` does not keep accumulating the adjustment.

    NOTE(review): `personal_score` grows with user similarity, yet it is
    *added* to a distance, which pushes similar users further away. The MAE
    recorded in this notebook is marginally worse with the adjustment than
    without it -- confirm whether the score should be subtracted instead.
    """
    distances = numpy.array(mat[0], dtype=float)  # numpy.array copies by default
    indices = mat[1]
    for u in range(len(distances)):
        for w in range(len(distances[u])):
            distances[u][w] += personal_score(u, indices[u][w])
    return (distances, indices)

In [25]:
# Fold-1 neighbour result with the personal_score adjustment applied to the distances.
opt_knn_mat = opt_weights(knn_mat)

In [26]:
# Fold-1 predictions computed from the adjusted distances.
opt_u_r = weight_collapse(sk_mat1, opt_knn_mat)

In [27]:
# Baseline MAE (predictions made without the personal_score adjustment), shown again for comparison.
mae(u_r, test_mat1)

0.9100193635521082

In [28]:
# MAE of the predictions made with the personal_score adjustment.
mae(opt_u_r, test_mat1)

0.9100202610952507

# Full Error Calculation

In [29]:
# The five training matrices and their test matrices, paired by position (fold 1..5).
all_sk_mat = [sk_mat1, sk_mat2, sk_mat3, sk_mat4, sk_mat5]
all_test_mat = [test_mat1, test_mat2, test_mat3, test_mat4, test_mat5]

In [30]:
# for sk_mat in all_sk_mat:
#     for i in range(len(sk_mat)):
#         for j in range(len(sk_mat[i])):
#             if sk_mat[i][j] == 0:
#                 sk_mat[i][j] = m_r_avg[str(j+1)][0]

In [31]:
def full_alg(nn, opt=False, p_weight=6):
    '''Run the KNN recommender on every fold and return the mean MAE.

    nn       -- number of neighbours passed to `knn`.
    opt      -- when true, adjust the neighbour distances with `opt_weights`
                before predicting.
    p_weight -- currently unused; kept so existing calls remain valid.

    Folds are taken pairwise from the module-level `all_sk_mat` and
    `all_test_mat` lists.
    '''
    total = 0
    for sk_mat, test_mat in zip(all_sk_mat, all_test_mat):
        knn_mat = knn(nn, sk_mat)
        if opt:
            knn_mat = opt_weights(knn_mat)
        # Predict once per fold (a second, discarded weight_collapse call was removed).
        u_r = weight_collapse(sk_mat, knn_mat)
        total += mae(u_r, test_mat)

    # Average over however many folds are configured rather than a literal 5.
    return total/len(all_sk_mat)

In [32]:
# Mean MAE across all folds with 400 neighbours, without the personal_score adjustment.
print(full_alg(400, False))

0.8931143840450474


In [33]:
# Mean MAE across all folds with 400 neighbours, with the personal_score adjustment.
print(full_alg(400, True))

0.8931152695516437


# Best in Genre

In [None]:
# Genre names; a name's position in this list is used by best_in_genre as the
# index into each movie's genre-flag list.
genres = ['unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy','Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

In [None]:
def best_in_genre(genre):
    """Return the title(s) of the best weighted-rated movie(s) in `genre`.

    genre -- a name from the module-level `genres` list; any other value
             raises ValueError (from `list.index`).

    A movie belongs to the genre when its flag at the genre's position is 1.
    Returns a list of titles: every movie tied for the top weighted rating,
    or an empty list when no movie carries the genre.
    """
    g = genres.index(genre)
    best_so_far = []
    best_score = None
    for i in m_r_avg:
        # m_r_avg[i] is [weighted rating, genre flags].
        if int(m_r_avg[i][1][g]) != 1:
            continue
        score = m_r_avg[i][0]
        if best_score is None or score > best_score:
            best_so_far = [i]
            best_score = score
        elif score == best_score:
            # Tie with the current best: keep both. The tie branch previously
            # compared the score with a character of the movie id and could
            # never match.
            best_so_far.append(i)
    return [m_names[i] for i in best_so_far]

# KNN without libraries

In [None]:
def calc_sim(user1, user2):
    '''Similarity of two users over the movies both have rated.

    Returns `(similarity, n_common)` where similarity is
    1 / (1 + Euclidean distance between their ratings of shared movies)
    and n_common is the number of shared movies.
    '''
    mine = u_d[user1]
    # Movies rated by both users, in user1's order.
    shared = [movie for movie in mine if movie in u_d[user2]]
    squared_gap = sum(
        (int(mine[movie]) - int(u_d[user2][movie]))**2 for movie in shared
    )
    return (1/(1+math.sqrt(squared_gap)), len(shared))

In [None]:
def find_nn(user, k):
    """Return up to `k` most similar other users as `(user_id, (similarity, n_common))`.

    Only users sharing at least 8 rated movies with `user` are considered.
    Results are ordered by the `(similarity, n_common)` pair, highest first.
    """
    scored = ((other, calc_sim(user, other)) for other in u_d if other != user)
    sim_list = [pair for pair in scored if pair[1][1] >= 8]
    sim_list.sort(reverse=True, key=lambda x: x[1])
    return sim_list[:k]

In [None]:
def unseen(user):
    """Collect what the 200 nearest neighbours rated that `user` has not.

    Returns a dict: neighbour id -> list of [similarity, movie id, rating]
    for every movie the neighbour rated and `user` did not. Neighbours with
    no such movie are absent from the dict.
    """
    us_m = {}
    for neighbour, sim in find_nn(user, 200):
        for movie, rating in u_d[neighbour].items():
            if movie not in u_d[user]:
                us_m.setdefault(neighbour, []).append([sim[0], movie, rating])
    return us_m
    

In [None]:
def rate_unseen(user):
    """Score every movie `user` has not rated from their neighbours' ratings.

    Each neighbour contributes `similarity * rating` for a movie; a movie's
    score is the mean of all such contributions.

    Returns a list of `(movie id, score)` tuples sorted by score, highest first.
    """
    us_m = unseen(user)
    m_ar = {}
    for entries in us_m.values():
        for sim, movie, rating in entries:
            # Record a contribution from EVERY neighbour that rated the movie.
            # The append used to sit inside the "first time seen" check, so
            # only one neighbour ever counted.
            m_ar.setdefault(movie, []).append(sim * int(rating))

    m_li = [(movie, sum(scores)/len(scores)) for movie, scores in m_ar.items()]
    # Sort once, after all scores are known, instead of on every iteration.
    m_li.sort(reverse=True, key=lambda x: x[1])
    return m_li
        

In [None]:
def recommend(user, amount):
    """Return the titles of the `amount` best-scored unseen movies for `user`."""
    top_scored = rate_unseen(user)[:amount]
    return [m_names[movie_id] for movie_id, _score in top_scored]

In [None]:
# Best weighted-rated movie title(s) in the Adventure genre.
best_in_genre("Adventure")

In [None]:
# Ten recommendations for user '1' (user ids are strings in u_d).
recommend('1', 10)
