In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
from io import StringIO
import io
from urllib.request import urlopen
import requests

### Loading data

In [18]:
# Download the MovieLens 1M archive into memory so its contents can be
# inspected before anything is unzipped.
zip_url ='http://files.grouplens.org/datasets/movielens/ml-1m.zip'
# The timeout keeps a stalled connection from hanging the notebook forever.
r = requests.get(zip_url, timeout=60)
# Stop here on an HTTP error instead of passing an error page on to ZipFile.
r.raise_for_status()

In [42]:
# Open the archive straight from the downloaded bytes; nothing is written to disk.
zip_buffer = io.BytesIO(r.content)
mlz = ZipFile(zip_buffer)

In [43]:
# Names of every member stored in the archive.
[member.filename for member in mlz.infolist()]

['ml-1m/',
 'ml-1m/movies.dat',
 'ml-1m/ratings.dat',
 'ml-1m/README',
 'ml-1m/users.dat']

In [36]:
# The genre-based recommender built below needs the movies.dat member of the archive.
# NOTE(review): the handle returned here is only displayed; it is never read from or closed.
mlz.open('ml-1m/movies.dat')

<zipfile.ZipExtFile name='ml-1m/movies.dat' mode='r' compress_type=deflate>

In [44]:
# Load the movie catalogue. The file has no header row and separates its fields
# with '::'; inside movie_genre, a movie's genres are separated by '|'.
# NOTE(review): this path assumes the archive members were extracted to data/ --
# no cell above performs that extraction.
# Titles contain non-ASCII characters (e.g. "Kjærlighetens") and the file is not
# UTF-8, so the encoding is stated explicitly instead of relying on the
# platform default.
movies_df = pd.read_table(
    'data/movies.dat',
    header=None,
    sep='::',
    engine='python',
    names=['movie_id', 'movie_title', 'movie_genre'],
    encoding='latin-1',
)
movies_df.head()

Unnamed: 0,movie_id,movie_title,movie_genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## Content-based Recommendation Model 🎬📽

This group of algorithms relies on using the description of each product to recommend, without using information from other users to generate the recommendation to the target user.

The `get_dummies` function converts a categorical variable into multiple columns, one per category.  
For each movie, a dummy column has the value 1 when the movie belongs to that genre and 0 otherwise.

In [45]:
# One-hot encode the genres: movie_genre holds '|'-separated genre names, and
# str.get_dummies turns them into one 0/1 column per genre (1 = the movie has
# that genre, 0 = it does not).
genre_dummies = movies_df.movie_genre.str.get_dummies(sep='|')
# Drop genre columns left over from an earlier run of this cell, so running it
# again does not append a second copy of every genre column.
movies_df = pd.concat(
    [movies_df.drop(columns=genre_dummies.columns, errors='ignore'), genre_dummies],
    axis=1,
)
movies_df.head()

Unnamed: 0,movie_id,movie_title,movie_genre,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Genre column names, in column order. The non-genre columns are removed by
# name rather than by position (columns[3:]) so that a 'score' column added
# later does not end up in the genre list if this cell is run again.
movie_categories = movies_df.columns.drop(
    ['movie_id', 'movie_title', 'movie_genre', 'score'], errors='ignore'
)

In [47]:
# Display the genre column names collected in the previous cell.
movie_categories

Index(['Action', 'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'],
      dtype='object')

In [48]:
# Profile of a sample user: how much they like each genre, rated from 1 to 5.
from collections import OrderedDict

genre_ratings = {
    'Action': 5,
    'Adventure': 5,
    'Animation': 1,
    "Children's": 1,
    'Comedy': 3,
    'Crime': 2,
    'Documentary': 1,
    'Drama': 1,
    'Fantasy': 5,
    'Film-Noir': 1,
    'Horror': 2,
    'Musical': 1,
    'Mystery': 3,
    'Romance': 1,
    'Sci-Fi': 5,
    'Thriller': 3,
    'War': 2,
    'Western': 1,
}

# Build the profile in movie_categories order: dot_product pairs preference
# values with genre columns by position, so the two orders must agree. A genre
# column with no rating raises KeyError here instead of silently shifting
# every later preference onto the wrong genre.
user_preferences = OrderedDict(
    (genre, genre_ratings[genre]) for genre in movie_categories
)

In [49]:
# Display the sample user's genre preferences.
user_preferences

OrderedDict([('Action', 5),
             ('Adventure', 5),
             ('Animation', 1),
             ("Children's", 1),
             ('Comedy', 3),
             ('Crime', 2),
             ('Documentary', 1),
             ('Drama', 1),
             ('Fantasy', 5),
             ('Film-Noir', 1),
             ('Horror', 2),
             ('Musical', 1),
             ('Mystery', 3),
             ('Romance', 1),
             ('Sci-Fi', 5),
             ('Thriller', 3),
             ('War', 2),
             ('Western', 1)])

In [52]:
# Dot product of two sequences of numbers
def dot_product(vector_1, vector_2):
    """Return the sum of the element-wise products of two sequences.

    Elements are paired by position with ``zip``; when the inputs differ in
    length, the extra elements of the longer one are ignored.
    """
    total = 0
    for left, right in zip(vector_1, vector_2):
        total = total + left * right
    return total

# Score of a movie for a user: dot product of the movie's features and the user's preferences
def get_movie_score(movie_features, user_preferences):
    """Return the dot product of ``movie_features`` and ``user_preferences``."""
    score = dot_product(movie_features, user_preferences)
    return score

In [50]:
# Genre flags of the first movie in the table (row label 0), Toy Story
first_movie_row = movies_df.loc[0]
toy_story_features = first_movie_row[movie_categories]
toy_story_features

Action         0
Adventure      0
Animation      1
Children's     1
Comedy         1
Crime          0
Documentary    0
Drama          0
Fantasy        0
Film-Noir      0
Horror         0
Musical        0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
War            0
Western        0
Name: 0, dtype: object

In [53]:
# Score of Toy Story for the sample user: dot product of the movie's genre
# flags with the user's genre preferences
toy_story_user_predicted_score = get_movie_score(
    toy_story_features, user_preferences.values()
)
toy_story_user_predicted_score

5

In [54]:
# For comparison with Toy Story, look up an action movie and score it next:
# Die Hard, movie_id 1036
die_hard_id = 1036
movies_df.loc[movies_df['movie_id'] == die_hard_id]
 

Unnamed: 0,movie_id,movie_title,movie_genre,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1023,1036,Die Hard (1988),Action|Thriller,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [58]:
# Genre flags of Die Hard as a one-row frame
is_die_hard = movies_df['movie_id'] == die_hard_id
die_hard_features = movies_df.loc[is_die_hard, movie_categories]
# Shown transposed so the genres read top to bottom
die_hard_features.T

Unnamed: 0,1023
Action,1
Adventure,0
Animation,0
Children's,0
Comedy,0
Crime,0
Documentary,0
Drama,0
Fantasy,0
Film-Noir,0


In [59]:
# Score of Die Hard for the sample user: dot product of the movie's genre
# flags (first and only row of die_hard_features) with the user's preferences
die_hard_genre_flags = die_hard_features.values[0]
die_hard_user_predicted_score = get_movie_score(
    die_hard_genre_flags, user_preferences.values()
)
die_hard_user_predicted_score

8

In [70]:
# Score every movie for the sample user and list the scores from best to worst
genre_flags = movies_df[movie_categories]
all_movie_scores = genre_flags.apply(
    get_movie_score, axis=1, args=(user_preferences.values(),)
)
all_movie_scores.sort_values(ascending=False)

2559    20
2036    20
1197    20
2253    20
257     20
        ..
3613     1
777      1
2363     1
2364     1
1474     1
Length: 3883, dtype: int64

In [66]:
# Scores every movie in the dataset for one user and returns the best-scoring titles
def get_movie_recommendations(user_preferences, n_recommendations):
    """Rank every movie for one user and return the titles of the best matches.

    A movie's score is the sum of the user's preference values over the genres
    the movie belongs to.

    Parameters
    ----------
    user_preferences : mapping of genre name -> number
        Matched to the genre columns by name, so the order of the keys does
        not matter. Genres missing from the mapping count as 0; keys that are
        not in ``movie_categories`` are ignored.
    n_recommendations : int
        Number of titles to return.

    Returns
    -------
    pandas.Series
        ``movie_title`` of the highest-scoring movies, best first, indexed by
        their row label in ``movies_df``.

    Notes
    -----
    As a side effect the scores are stored in ``movies_df['score']``.
    """
    # One weight per genre column, looked up by name: pairing by position
    # would silently mis-score a reordered or incomplete mapping.
    weights = pd.Series(
        {genre: user_preferences.get(genre, 0) for genre in movie_categories}
    )
    # A single matrix-vector product replaces the row-by-row Python loop.
    movies_df['score'] = movies_df[movie_categories].dot(weights)
    ranked = movies_df.sort_values(by=['score'], ascending=False)
    return ranked['movie_title'].head(n_recommendations)

In [67]:
# Top 10 recommendations for the sample user's genre preferences
get_movie_recommendations(user_preferences, n_recommendations=10)

2253                                       Soldier (1998)
257             Star Wars: Episode IV - A New Hope (1977)
2036                                          Tron (1982)
1197                              Army of Darkness (1993)
2559     Star Wars: Episode I - The Phantom Menace (1999)
1985                      Honey, I Shrunk the Kids (1989)
1192    Star Wars: Episode VI - Return of the Jedi (1983)
1111                                    Abyss, The (1989)
1848                                    Armageddon (1998)
2847                                  Total Recall (1990)
Name: movie_title, dtype: object

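## Collaborative Filtering-based Recommendation Model 🎬👨‍👩‍👧‍👦
This group of algorithms relies on the ratings given by other users rather than on a description of each product: movies are related to one another through how users scored them, and the target user is recommended the movies most correlated with the ones they have already seen.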

- We load the file with movie scores, **ratings.dat**  
- We replace the id of the movie with its title for greater clarity

In [2]:
# Load the movie catalogue. The file has no header row and separates its fields
# with '::'; inside movie_genre, a movie's genres are separated by '|'.
# Titles contain non-ASCII characters (e.g. "Kjærlighetens") and the file is not
# UTF-8, so the encoding is stated explicitly instead of relying on the
# platform default.
movies_df = pd.read_table(
    'data/movies.dat',
    header=None,
    sep='::',
    engine='python',
    names=['movie_id', 'movie_title', 'movie_genre'],
    encoding='latin-1',
)
movies_df.head()

Unnamed: 0,movie_id,movie_title,movie_genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Load the ratings file: one row per (user, movie) pair with the rating that user gave
ratings_raw = pd.read_table(
    'data/ratings.dat',
    header=None,
    sep='::',
    engine='python',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)

# The date the rating was created is not used, so that column is dropped
ratings_no_timestamp = ratings_raw.drop(columns='timestamp')

# Attach each movie's title through movie_id and keep only the columns used below
ratings_with_titles = ratings_no_timestamp.merge(movies_df, on='movie_id')
ratings = ratings_with_titles[['user_id', 'movie_title', 'movie_id', 'rating']]

ratings.head()

Unnamed: 0,user_id,movie_title,movie_id,rating
0,1,One Flew Over the Cuckoo's Nest (1975),1193,5
1,2,One Flew Over the Cuckoo's Nest (1975),1193,5
2,12,One Flew Over the Cuckoo's Nest (1975),1193,4
3,15,One Flew Over the Cuckoo's Nest (1975),1193,4
4,17,One Flew Over the Cuckoo's Nest (1975),1193,5


In [4]:
# User x movie matrix: one row per user, one column per movie title, and each
# cell holds that user's rating of that movie.
# Movies a user has not rated come out of the pivot as NaN and are replaced by 0.
ratings_matriz = (
    ratings
    .pivot_table(index='user_id', columns='movie_title', values='rating')
    .fillna(0)
)

movie_index = ratings_matriz.columns

ratings_matriz.head()

movie_title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kjærlighetens kjøtere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Column labels of the ratings matrix: one movie title per column.
movie_index

Index(['$1,000,000 Duck (1971)', ''Night Mother (1986)',
       ''Til There Was You (1997)', ''burbs, The (1989)',
       '...And Justice for All (1979)', '1-900 (1994)',
       '10 Things I Hate About You (1999)', '101 Dalmatians (1961)',
       '101 Dalmatians (1996)', '12 Angry Men (1957)',
       ...
       'Young Poisoner's Handbook, The (1995)', 'Young Sherlock Holmes (1985)',
       'Young and Innocent (1937)', 'Your Friends and Neighbors (1998)',
       'Zachariah (1971)', 'Zed & Two Noughts, A (1985)', 'Zero Effect (1998)',
       'Zero Kelvin (Kjærlighetens kjøtere) (1995)', 'Zeus and Roxanne (1997)',
       'eXistenZ (1999)'],
      dtype='object', name='movie_title', length=3706)

In [95]:
# Position of Toy Story among the columns of the ratings matrix
movie_index.tolist().index('Toy Story (1995)')

3411

We calculate the correlation (Pearson product-moment correlation coefficient, PMCC) between movies based on the scores users gave them.  

The PMCC takes a value between -1 and 1 that measures how strongly a pair of quantitative variables are linearly related.

In [88]:
# Pearson correlation between every pair of movies. np.corrcoef treats each row
# of its input as one variable, so the user x movie matrix is transposed to put
# the movies on the rows; entry [i, j] is the correlation between movies i and j
# of movie_index, taken over all users.
# Unrated movies were filled with 0 above, so they enter the computation as
# ratings of 0.
corr_matrix = np.corrcoef(ratings_matriz.T)
corr_matrix.shape

(3706, 3706)

Let's see which movies would be most related to Toy Story, the first movie on the list.

In [89]:
# Title of the movie in the first row (row label 0) of the catalogue
movies_df.loc[0, 'movie_title']

'Toy Story (1995)'

We show the movies most related to Toy Story, based on their correlation value.

In [97]:
favorit_movie = 'Toy Story (1995)'

favorita_movie_index = list(movie_index).index(favorit_movie)

# Row of the correlation matrix: correlation of the chosen movie with every movie
P = corr_matrix[favorita_movie_index]

# Keep the movies whose correlation with the chosen one exceeds 0.4. The chosen
# movie itself is removed by position: its self-correlation equals 1 only up to
# floating-point rounding, so an upper bound such as P < 1.0 is not a
# dependable way to exclude it.
is_similar = P > 0.4
is_similar[favorita_movie_index] = False

list(movie_index[is_similar])

['Aladdin (1992)',
 "Bug's Life, A (1998)",
 'Groundhog Day (1993)',
 'Lion King, The (1994)',
 'Toy Story 2 (1999)']

In [98]:
#The Toy Story film's correlation list with each user
P

array([0.05174442, 0.04210762, 0.04271477, ..., 0.01268849, 0.04293014,
       0.08088685])

Let's factor this process into a couple of functions to recommend movies to a certain user by the correlation matrix of all movies.

In [13]:
# Returns the correlation vector of one movie against all movies
def get_similar_movie(movie, corr_matrix=None):
    """Return the correlations between one movie and every movie in ``movie_index``.

    Parameters
    ----------
    movie : str
        Title exactly as it appears in ``movie_index``.
    corr_matrix : numpy.ndarray, optional
        Movie-by-movie correlation matrix whose rows follow ``movie_index``.
        When omitted it is computed from ``ratings_matriz``; pass it in when
        looking up several movies so it is computed only once.

    Returns
    -------
    numpy.ndarray
        The row of the correlation matrix that belongs to ``movie``.

    Raises
    ------
    ValueError
        If ``movie`` is not in ``movie_index``.
    """
    if corr_matrix is None:
        corr_matrix = np.corrcoef(ratings_matriz.T)
    movie_idx = list(movie_index).index(movie)
    return corr_matrix[movie_idx]

# Recommends movies from the titles a user has already watched: the correlation
# vectors of the watched movies are added up, and the unwatched movies are
# returned ordered by that total, highest first.
def get_movie_recomendations(user):
    """Rank the movies a user has not watched by similarity to the ones they have.

    Parameters
    ----------
    user : list of str
        Titles the user has watched, as they appear in ``movie_index``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Titulo`` (movie title) and ``sum_similar`` (sum of the
        correlations with the watched movies), without the watched titles,
        sorted by ``sum_similar`` in descending order. An empty ``user``
        yields every movie with a total of 0.
    """
    # Computed once and shared with every lookup below; recomputing it for
    # each watched movie repeats the same expensive calculation.
    corr_matrix = np.corrcoef(ratings_matriz.T)
    similar_movies = np.zeros(corr_matrix.shape[0])
    for movie_id in user:
        similar_movies = similar_movies + get_similar_movie(movie_id, corr_matrix)
    # Built after the loop so it exists even when ``user`` is empty.
    similars_df = pd.DataFrame({
        'Titulo': movie_index,
        'sum_similar': similar_movies
    })
    similars_df = similars_df[~(similars_df.Titulo.isin(user))]
    similars_df = similars_df.sort_values(by=['sum_similar'], ascending=False)
    return similars_df

In [9]:
# User 21 is the user we will recommend movies to; these are the movies they
# have rated, highest rating first
user_21 = 21
user_21_ratings = ratings.loc[ratings['user_id'] == user_21]
user_21_ratings.sort_values(by=['rating'], ascending=False)

Unnamed: 0,user_id,movie_title,movie_id,rating
583304,21,Titan A.E. (2000),3745,5
707307,21,"Princess Mononoke, The (Mononoke Hime) (1997)",3000,5
70742,21,Star Wars: Episode VI - Return of the Jedi (1983),1210,5
239644,21,"South Park: Bigger, Longer and Uncut (1999)",2700,5
487530,21,Mad Max Beyond Thunderdome (1985),3704,4
707652,21,Little Nemo: Adventures in Slumberland (1992),2800,4
708015,21,Stop! Or My Mom Will Shoot (1992),3268,3
706889,21,"Brady Bunch Movie, The (1995)",585,3
623947,21,"Iron Giant, The (1999)",2761,3
619784,21,Wild Wild West (1999),2701,3


In [10]:
# Build the list of movie titles rated by user 21 so it can be passed to the
# recommendation functions: the movie_title column of that user's rows in ratings.
user_film_21_list = ratings.loc[ratings['user_id'] == user_21, 'movie_title'].tolist()
user_film_21_list

["Bug's Life, A (1998)",
 'Bambi (1942)',
 'Antz (1998)',
 'Aladdin (1992)',
 'Toy Story (1995)',
 'Star Wars: Episode VI - Return of the Jedi (1983)',
 'Who Framed Roger Rabbit? (1988)',
 'South Park: Bigger, Longer and Uncut (1999)',
 'Akira (1988)',
 'Pinocchio (1940)',
 'Mad Max Beyond Thunderdome (1985)',
 'Titan A.E. (2000)',
 "Devil's Advocate, The (1997)",
 'Prince of Egypt, The (1998)',
 'Wild Wild West (1999)',
 'Iron Giant, The (1999)',
 'Brady Bunch Movie, The (1995)',
 'Princess Mononoke, The (Mononoke Hime) (1997)',
 'Little Nemo: Adventures in Slumberland (1992)',
 'Messenger: The Story of Joan of Arc, The (1999)',
 'Stop! Or My Mom Will Shoot (1992)',
 'House Party 2 (1991)']

We pass the list of movies that user 21 has seen (`user_film_21_list`) to the function **get_movie_recomendations**, which returns the list of recommended movies.  

We take the top 20: the movies with the highest total correlation to the movies that user 21 has already seen.

In [14]:
# Rank the movies user 21 has not seen and show the 20 best-matching titles
recomendations_21_user = get_movie_recomendations(user=user_film_21_list)
recomendations_21_user['Titulo'].head(20)

1939                     Lion King, The (1994)
324                Beauty and the Beast (1991)
1948                Little Mermaid, The (1989)
3055    Snow White and the Seven Dwarfs (1937)
647                     Charlotte's Web (1973)
679                          Cinderella (1950)
1002                              Dumbo (1941)
301                              Batman (1989)
3250            Sword in the Stone, The (1963)
303                      Batman Returns (1992)
2252                              Mulan (1998)
2924                Secret of NIMH, The (1982)
2808                         Robin Hood (1973)
3026                    Sleeping Beauty (1959)
1781                   Jungle Book, The (1967)
260         Back to the Future Part III (1990)
259          Back to the Future Part II (1989)
2558                          Peter Pan (1953)
2347             NeverEnding Story, The (1984)
97                  Alice in Wonderland (1951)
Name: Titulo, dtype: object