In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

ratings = pd.read_csv('ratings.csv')

In [2]:
def NNSrecommend(inputuser = 1):
    
    """Suggest movies based on NNS algorihm.
    
    Input:
    inputuser       -- "userId" of the selected user
    
    Output:
    recommendations -- "movieId"-s of recommended movies, numpy.ndarray format
    
    Error codes:
    -1              -- "inputuser" not found in database
    -2              -- There are no users, that have rated the same movies
    """
    
    if inputuser not in ratings['userId'].unique():
        return -1

    # Reduction of movies
    seenmovies = ratings[ratings['userId'] == inputuser]['movieId']
    smallratings = ratings[ratings['movieId'].isin(seenmovies)]

    # Reduction of users
    users = smallratings['userId'].value_counts() == len(seenmovies)
    users = users.index[users].tolist()
    smallratings = smallratings[smallratings['userId'].isin(users)]
        
    # Make pivot table and calculate distances
    table = pd.pivot_table(smallratings, values='rating', index='userId', columns='movieId')

    distances = pdist(table, metric='euclidean')
    distances = squareform(distances)
    
    # Select the distances that we need
    distances = pd.DataFrame(index = table.index, columns = table.index, data = distances)
    distances = distances[distances.index == inputuser].drop(inputuser, axis = 1)
    
    if distances.empty:
        return -2
    
    nearestuser = distances.idxmin(axis = 1).values[0]

    recommendations = ratings[(ratings['userId'] == nearestuser) & (ratings['movieId'].isin(seenmovies) == False)
                              & (ratings['rating'] >= 4)]['movieId'].values
    
    return recommendations

In [3]:
rec = NNSrecommend(1)

In [4]:
rec

array([     3,     14,     21, ..., 145839, 146656, 148626], dtype=int64)

# Coverage

In [12]:
suggested_movies = []
users = np.random.choice(ratings['userId'].unique(), size=100)
for i in users:
    suggested_movies.append(NNSrecommend(i))

In [56]:
list = []
for i in range(len(suggested_movies)):
    if type(suggested_movies[i]) == np.ndarray:
        for j in range(len(suggested_movies[i])):
            list.append(suggested_movies[i][j])

In [61]:
len(np.unique(list))

7799

In [64]:
ratings.movieId.nunique()

45115

# Personalization

In [69]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
suggested_movies = []
users = np.random.choice(ratings['userId'].unique(), size=100)
for i in users:
    tmp = NNSrecommend(i)
    if type(tmp) != int:
        suggested_movies.append(tmp)

In [34]:
list = []
for i in range(len(suggested_movies)):
    for j in range(len(suggested_movies[i])):
        list.append(suggested_movies[i][j])

list = np.unique(list)

In [61]:
data = np.zeros(shape=(len(suggested_movies), len(list)))
seen_movies = pd.DataFrame(data=data, columns=list)

In [62]:
for i in range(len(suggested_movies)):
    for j in suggested_movies[i]:
        if j in seen_movies.columns:
            seen_movies.loc[i,j] = 1

In [71]:
pd.DataFrame(cosine_similarity(seen_movies))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,76,77,78,79,80,81,82,83,84,85
0,1.000000,0.026273,0.072658,0.069304,0.091816,0.169767,0.074331,0.106547,0.081592,0.062357,...,0.055916,0.039382,0.072638,0.050467,0.077872,0.126833,0.018283,0.075556,0.216254,0.088666
1,0.026273,1.000000,0.226157,0.227147,0.255706,0.105927,0.249839,0.083293,0.264080,0.195203,...,0.112977,0.176151,0.179368,0.157281,0.223445,0.201233,0.067891,0.202482,0.066918,0.271626
2,0.072658,0.226157,1.000000,0.985502,0.347765,0.158027,0.318172,0.106703,0.395624,0.966651,...,0.195706,0.229242,0.960043,0.248235,0.981417,0.390736,0.141733,0.975242,0.211911,0.393662
3,0.069304,0.227147,0.985502,1.000000,0.350457,0.163559,0.328935,0.117701,0.404002,0.969657,...,0.204153,0.250127,0.967471,0.260490,0.988750,0.396897,0.160808,0.982292,0.221185,0.402113
4,0.091816,0.255706,0.347765,0.350457,1.000000,0.193264,0.343846,0.135706,0.435042,0.310503,...,0.145352,0.209067,0.312586,0.210502,0.346360,0.384577,0.054505,0.329990,0.204781,0.432637
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,0.126833,0.201233,0.390736,0.396897,0.384577,0.170075,0.296978,0.150784,0.377329,0.362989,...,0.133467,0.218788,0.364348,0.170945,0.391781,1.000000,0.059589,0.385412,0.203223,0.370620
82,0.018283,0.067891,0.141733,0.160808,0.054505,0.039553,0.089413,0.065207,0.113965,0.151764,...,0.084993,0.143396,0.133265,0.125750,0.166467,0.059589,1.000000,0.158401,0.055880,0.114557
83,0.075556,0.202482,0.975242,0.982292,0.329990,0.160652,0.316053,0.115246,0.389881,0.962016,...,0.210744,0.248866,0.962858,0.256022,0.980768,0.385412,0.158401,1.000000,0.222215,0.385877
84,0.216254,0.066918,0.211911,0.221185,0.204781,0.191388,0.233711,0.191390,0.312283,0.187632,...,0.256352,0.211555,0.208689,0.156083,0.227911,0.203223,0.055880,0.222215,1.000000,0.301484


In [80]:
# Test values
x = seen_movies.iloc[0]
y = seen_movies.iloc[1]

In [87]:
sum(x * y) / ( sum(x**2)**(1/2) * sum(y**2)**(1/2) )

0.02627336906007949

In [88]:
matrix = pd.DataFrame(cosine_similarity(seen_movies))

In [90]:
matrix = 1-matrix

In [93]:
personalization = []
for i in range(matrix.shape[0]):
    for j in range(matrix.shape[1]):
        if i < j:
            personalization.append(matrix.loc[i,j])

In [98]:
np.mean(personalization)

0.7730875771559563