## Filtrado Colavorativo

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [2]:
df_ratings = pd.read_csv("ml-latest-small/ratings.csv",sep=",")
df_movies = pd.read_csv("ml-latest-small/movies.csv",sep=",")

In [3]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
del df_ratings['timestamp']

In [5]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [6]:
df_ratings = pd.merge(df_ratings, df_movies, on='movieId')[['userId', 'title', 'movieId','rating']]

In [7]:
df_ratings.head()

Unnamed: 0,userId,title,movieId,rating
0,1,Toy Story (1995),1,4.0
1,5,Toy Story (1995),1,4.0
2,7,Toy Story (1995),1,4.5
3,15,Toy Story (1995),1,2.5
4,17,Toy Story (1995),1,4.5


In [8]:
ratings_mtx_df = df_ratings.pivot_table(values='rating', index='userId', columns='title')
ratings_mtx_df.fillna(0, inplace=True)
movie_index = ratings_mtx_df.columns
ratings_mtx_df.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#correlación de Pearson(PMCC) 
corr_matrix = np.corrcoef(ratings_mtx_df.T)
corr_matrix.shape

(9719, 9719)

In [10]:
favoured_movie_title = 'Batman Forever (1995)'
favoured_movie_index = list(movie_index).index(favoured_movie_title)
P = corr_matrix[favoured_movie_index]


In [11]:
P

array([ 0.07368255, -0.02057755, -0.02912497, ...,  0.07015806,
        0.08287597, -0.02057755])

In [12]:
import operator
list(movie_index[(P>0.4) & (P<1.0)])

['Ace Ventura: Pet Detective (1994)',
 'Addams Family Values (1993)',
 'Aladdin (1992)',
 'Apollo 13 (1995)',
 'Batman (1989)',
 'Clear and Present Danger (1994)',
 'Cliffhanger (1993)',
 'Crimson Tide (1995)',
 'Dances with Wolves (1990)',
 'Die Hard: With a Vengeance (1995)',
 'Dumb & Dumber (Dumb and Dumber) (1994)',
 'GoldenEye (1995)',
 'Interview with the Vampire: The Vampire Chronicles (1994)',
 'Judge Dredd (1995)',
 'Mask, The (1994)',
 'Net, The (1995)',
 'Outbreak (1995)',
 'Star Trek: Generations (1994)',
 'Stargate (1994)',
 'True Lies (1994)',
 'Waterworld (1995)',
 'While You Were Sleeping (1995)']

In [13]:
def get_movie_similarity(movie_title):
    '''Devuelve el vector de correlación para una película'''
    movie_idx = list(movie_index).index(movie_title)
    return corr_matrix[movie_idx]

def get_movie_recommendations(user_movies):
    '''Dado un grupo de películas, devolver las mas similares'''
    movie_similarities = np.zeros(corr_matrix.shape[0])
    for movie_id in user_movies:
        movie_similarities = movie_similarities + get_movie_similarity(movie_id)
    similarities_df = pd.DataFrame({
        'movie_title': movie_index,
        'sum_similarity': movie_similarities
        })
    similarities_df = similarities_df[~(similarities_df.movie_title.isin(user_movies))]
    similarities_df = similarities_df.sort_values(by=['sum_similarity'], ascending=False)
    return similarities_df

In [14]:
sample_user = 21
recomendacion_usuario = df_ratings[df_ratings.userId==sample_user].sort_values(by=['rating'], ascending=False)

In [15]:
recomendacion_usuario[:10]

Unnamed: 0,userId,title,movieId,rating
40048,21,Back to the Future Part II (1989),2011,5.0
52417,21,American Pie (1999),2706,5.0
60137,21,Tomorrow Never Dies (1997),1722,5.0
31377,21,Naked Gun 33 1/3: The Final Insult (1994),370,5.0
10657,21,Back to the Future Part III (1990),2012,5.0
63053,21,"World Is Not Enough, The (1999)",3082,5.0
40405,21,Die Another Day (2002),5872,5.0
8732,21,Back to the Future (1985),1270,5.0
67309,21,Idiocracy (2006),47997,5.0
66932,21,Never Say Never Again (1983),7573,5.0


In [16]:
sample_user = 100
recomendacion_usuario = df_ratings[df_ratings.userId==sample_user].sort_values(by=['rating'], ascending=False)

In [17]:
recomendacion_usuario[:10]

Unnamed: 0,userId,title,movieId,rating
77024,100,"Officer and a Gentleman, An (1982)",4041,5.0
40994,100,Sweet Home Alabama (2002),5620,5.0
61947,100,Christmas Vacation (National Lampoon's Christm...,2423,5.0
87787,100,Terms of Endearment (1983),1958,5.0
33984,100,Top Gun (1986),1101,5.0
37235,100,Catch Me If You Can (2002),5989,4.5
40758,100,When Harry Met Sally... (1989),1307,4.5
40566,100,Mary Poppins (1964),1028,4.5
38429,100,Spider-Man 2 (2004),8636,4.5
34631,100,Notting Hill (1999),2671,4.5


## cosine_similarity

In [18]:
rating = ratings_mtx_df.head()

In [19]:
magnitude = np.sqrt(np.square(rating).sum(axis=1))

In [20]:
rating = rating.divide(magnitude, axis='index')

rating

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.059164,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:

def calculate_similarity(data_items):
    """Calculate the column-wise cosine similarity for a sparse
    matrix. Return a new dataframe matrix with similarities.
    """
    data_sparse = sparse.csr_matrix(data_items)
    similarities = cosine_similarity(data_sparse.transpose())
    sim = pd.DataFrame(data=similarities, index= data_items.columns, columns= data_items.columns)
    return sim

# Build the similarity matrix
data_matrix = calculate_similarity(rating)

# Lets get the top 11 similar artists for Beyonce
data = data_matrix.loc['Batman Forever (1995)'].nlargest(15)

In [22]:
data

title
Ace Ventura: Pet Detective (1994)                            1.0
Addams Family Values (1993)                                  1.0
Apollo 13 (1995)                                             1.0
Babe (1995)                                                  1.0
Batman Forever (1995)                                        1.0
Clueless (1995)                                              1.0
Dead Man Walking (1995)                                      1.0
In the Line of Fire (1993)                                   1.0
Interview with the Vampire: The Vampire Chronicles (1994)    1.0
Legends of the Fall (1994)                                   1.0
Lion King, The (1994)                                        1.0
Little Women (1994)                                          1.0
Once Were Warriors (1994)                                    1.0
Pretty Woman (1990)                                          1.0
Quiz Show (1994)                                             1.0
Name: Batman Foreve

In [23]:
def mean_reciprocal_rank(rs):    
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])

def precision_at_k(r, k):    
    assert k >= 1
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)

In [25]:
batman_semejantes = get_movie_similarity('Batman Forever (1995)')

In [26]:
scores = list(map(lambda x: 1 if x > 0.45 else 0, sorted(batman_semejantes,reverse=True)))[1:20]
mean_reciprocal_rank(scores)

0.5263157894736842

In [28]:
precision_at_k(sorted(batman_semejantes, reverse=True)[1:],30)

1.0