## Item-based collaborative filtering

In [50]:
import os

import pandas as pd
import numpy as np

# from tqdm import tqdm_notebook

In [51]:
# Folder that holds the MovieLens CSV files. Set the MOVIELENS_DIR environment
# variable to point the notebook at another location; the fallback is the path
# that was previously hard-coded in this cell.
DATA_DIR = os.environ.get(
    "MOVIELENS_DIR",
    "/Users/velo1/SynologyDrive/GIT_syno/data/MovieLens _ml-latest-small",
)

# The working directory is still switched to the data folder, so the relative
# file names below (and any relative path used later) resolve against it.
os.chdir(DATA_DIR)

# One DataFrame per CSV file, named after the file.
links = pd.read_csv("links.csv")
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
tags = pd.read_csv("tags.csv")

In [52]:
# How many distinct titles the movie table contains.
movies["title"].nunique()

9123

In [53]:
# Peek at the first five rating rows.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [54]:
# Attach every rating row to its movie with a left join on movieId: a movie
# appears once per rating it received, and a movie without ratings is kept as
# a single row whose rating columns are NaN.
ratings_by_movie = ratings.set_index("movieId")
movies_with_ratings = movies.join(ratings_by_movie, on="movieId", how="left")

# The join repeats the index labels of `movies`; replace them with 0..n-1.
movies_with_ratings = movies_with_ratings.reset_index(drop=True)

In [55]:
# Rows whose rating is missing, i.e. movies that nobody rated.
rating_is_missing = movies_with_ratings["rating"].isna()
films_with_no_ratings = movies_with_ratings.loc[rating_is_missing]

# Show the unrated titles together with the (rows, columns) of that subset.
(films_with_no_ratings["title"], films_with_no_ratings.shape)


(71460           Wild Child, The (Enfant sauvage, L') (1970)
 71671                  Iron Ladies, The (Satree lek) (2000)
 73986                                 Scarlet Street (1945)
 74292                                  Body and Soul (1947)
 77028                 Story of O, The (Histoire d'O) (1975)
 77706                              Two-Lane Blacktop (1971)
 78727                             You Only Live Once (1937)
 79262                                           Fire (1996)
 79692                                    Sudden Fear (1952)
 81324     Intolerance: Love's Struggle Throughout the Ag...
 81578                         Pickup on South Street (1953)
 82648                          While the City Sleeps (1956)
 83104                            Cincinnati Kid, The (1965)
 83747                                    Black Angel (1946)
 83748                                 Big Clock, The (1948)
 84831                                  Stranger, The (1946)
 86877                  

In [56]:
# Remove every row that contains a missing value (the unrated movies found
# above) and renumber the remaining rows 0..n-1.
#
# DataFrame.reset_index returns a new frame, so its result has to be assigned;
# the previous bare `movies_with_ratings.reset_index(drop=True)` call was
# discarded and had no effect. The frame is rebound rather than mutated with
# `inplace=True`, so nothing is changed behind the back of earlier cells.
movies_with_ratings = movies_with_ratings.dropna().reset_index(drop=True)
movies_with_ratings.shape

(100004, 6)

In [57]:
# Distinct titles in the movie table minus distinct titles that still have
# rating rows = number of titles without a single rating.
n_titles_total = movies["title"].nunique()
n_titles_rated = len(movies_with_ratings.groupby("title").size())
films_no_rates = n_titles_total - n_titles_rated
films_no_rates

59

In [46]:
# Peek at the first five rows of the joined movie/rating table.
movies_with_ratings.head(n=5)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,3.0,851866700.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,9.0,4.0,938629200.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,13.0,5.0,1331380000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.0,997938300.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19.0,3.0,855190100.0


In [58]:
# Number of distinct users that appear in the rating rows; this is the length
# of every per-movie rating vector built below.
num_users = movies_with_ratings["userId"].nunique()
num_users

671

In [59]:
# Give every user id a fixed slot in the rating vectors. The previous version
# used `userId - 1` as the slot, which is only correct while the ids are
# exactly 1..num_users; any gap would raise IndexError or put a rating in the
# wrong slot. Ranking the ids that are actually present yields the very same
# slots for contiguous ids and stays in bounds otherwise.
user_slot = {
    user_id: slot
    for slot, user_id in enumerate(
        sorted(movies_with_ratings["userId"].dropna().unique())
    )
}

# title -> vector of length num_users, where each entry is the rating the
# user in that slot gave the title and 0.0 means "no rating".
movie_vector = {}

for movie, group in movies_with_ratings.groupby("title"):
    ratings_for_movie = np.zeros(num_users)

    # Rows are written in order, so if the same user shows up twice for one
    # title the later row wins, exactly as in the index-based loop.
    for user_id, rating in zip(group["userId"].values, group["rating"].values):
        ratings_for_movie[user_slot[user_id]] = rating

    movie_vector[movie] = ratings_for_movie

movie_vector["Toy Story (1995)"].shape, len(movie_vector)

((671,), 9064)

In [30]:
from scipy.spatial.distance import (
    cityblock,
    cosine,
    euclidean,
    hamming,
    jaccard,
    correlation,
)

In [31]:
# Show only a handful of titles: displaying every key would print thousands
# of lines into the notebook output.
list(movie_vector)[:10]



In [14]:
my_fav_film = "Fight Club (1999)"

# Every other title, and at the same position its correlation distance to the
# favourite film's rating vector.
titles = [name for name in movie_vector if name != my_fav_film]
distances = [
    correlation(movie_vector[my_fav_film], movie_vector[name]) for name in titles
]

# Positions of the ten smallest distances, closest first.
best_indexes = np.argsort(distances)[:10]
best_movies = [(titles[pos], distances[pos]) for pos in best_indexes]

for entry in best_movies:
    print(entry)

('Memento (2000)', 0.44821852421045094)
('Snatch (2000)', 0.4688401152068321)
('Matrix, The (1999)', 0.5096430196723449)
('American History X (1998)', 0.5140028381513735)
('Kill Bill: Vol. 2 (2004)', 0.5279823271731231)
('Gladiator (2000)', 0.5395281806512061)
('Lord of the Rings: The Fellowship of the Ring, The (2001)', 0.5453568664049451)
('Sin City (2005)', 0.5467379177836235)
('Kill Bill: Vol. 1 (2003)', 0.5472630201338977)
('American Beauty (1999)', 0.5662768086371339)


In [36]:
my_fav_film = "Fight Club (1999)"

# Rating vector of the reference film, looked up once.
reference_vector = movie_vector[my_fav_film]

# All remaining titles, each aligned with its jaccard distance to the
# reference vector.
titles = [name for name in movie_vector.keys() if name != my_fav_film]
distances = [jaccard(reference_vector, movie_vector[name]) for name in titles]

# Ten closest titles, smallest distance first.
best_indexes = np.argsort(distances)[:10]
best_movies = [(titles[pos], distances[pos]) for pos in best_indexes]

for m in best_movies:
    print(f"{m[0]:<60} jaccard: {m[1]:.3f}")

Matrix, The (1999)                                           jaccard: 0.816
American Beauty (1999)                                       jaccard: 0.832
Memento (2000)                                               jaccard: 0.850
Dark Knight, The (2008)                                      jaccard: 0.852
American History X (1998)                                    jaccard: 0.860
Donnie Darko (2001)                                          jaccard: 0.867
Lord of the Rings: The Fellowship of the Ring, The (2001)    jaccard: 0.869
Pulp Fiction (1994)                                          jaccard: 0.869
Gladiator (2000)                                             jaccard: 0.873
Reservoir Dogs (1992)                                        jaccard: 0.878


In [39]:
my_fav_film = "Toy Story (1995)"

# Candidate titles are all keys except the reference film; each one is paired
# by position with its cosine distance to the reference rating vector.
titles = [name for name in movie_vector if name != my_fav_film]
distances = []
for name in titles:
    distances.append(cosine(movie_vector[my_fav_film], movie_vector[name]))

# Keep the ten candidates with the smallest distance, closest first.
best_indexes = np.argsort(distances)[:10]
best_movies = [(titles[pos], distances[pos]) for pos in best_indexes]

for m in best_movies:
    print(f"{m[0]:<60} Cosine distance: {m[1]:.3f}")

Toy Story 2 (1999)                                           Cosine distance: 0.405
Star Wars: Episode IV - A New Hope (1977)                    Cosine distance: 0.424
Forrest Gump (1994)                                          Cosine distance: 0.435
Independence Day (a.k.a. ID4) (1996)                         Cosine distance: 0.437
Groundhog Day (1993)                                         Cosine distance: 0.452
Back to the Future (1985)                                    Cosine distance: 0.463
Jurassic Park (1993)                                         Cosine distance: 0.465
Shrek (2001)                                                 Cosine distance: 0.467
Star Wars: Episode VI - Return of the Jedi (1983)            Cosine distance: 0.471
Pulp Fiction (1994)                                          Cosine distance: 0.473
