# load movie ratings data and filtering

In [1]:
import os
import pandas as pd

rating_file_path = os.getenv("HOME")+"/github/aiffel_practice/EXPLORATION08/data/ml-1m/ratings.dat"
ratings_cols = ["user_id", "movie_id", "ratings", "timestamp"]
ratings = pd.read_csv(rating_file_path,sep="::", names=ratings_cols)
original_data_size = len(ratings)
ratings.head()

  


Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   user_id    1000209 non-null  int64
 1   movie_id   1000209 non-null  int64
 2   ratings    1000209 non-null  int64
 3   timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [3]:
ratings[ratings["ratings"] >= 3]

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [4]:
ratings = ratings[ratings["ratings"] >= 3]
filtered_data_size = len(ratings)
print(f"original_data_size : {original_data_size}, filtered_data_size : {filtered_data_size}")
print(f"ratio of remaining Data is {filtered_data_size/original_data_size:.2%}")

original_data_size : 1000209, filtered_data_size : 836478
ratio of remaining Data is 83.63%


In [5]:
ratings.rename(columns={"ratings":"count"}, inplace=True)

In [6]:
ratings

Unnamed: 0,user_id,movie_id,count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


# load movie title data and find most popular movies

In [7]:
movie_file_path = rating_file_path = os.getenv("HOME")+"/github/aiffel_practice/EXPLORATION08/data/ml-1m/movies.dat"
cols = ["movie_id", "title", "genre"]
movies = pd.read_csv(movie_file_path, sep="::", names=cols)
movies.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
num_movies = ratings["movie_id"].nunique()
num_users = ratings["user_id"].nunique()
print("num of movies : ", num_movies)
print("num of users : ", num_users)

movie_ratings = ratings.groupby("movie_id")["count"].mean()

print("sorted movie mean ratings")
sorted_movie = movie_ratings.sort_values(ascending=False)
print(sorted_movie)

num of movies :  3628
num of users :  6039
sorted movie mean ratings
movie_id
1830    5.0
3233    5.0
3800    5.0
1553    5.0
787     5.0
       ... 
827     3.0
1548    3.0
607     3.0
821     3.0
1070    3.0
Name: count, Length: 3628, dtype: float64


# change movie_id to movies'name in sorted_movies

In [9]:
movie_title = movies.iloc[:,0:2]
movie_title.info()
movie_title.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  3883 non-null   int64 
 1   title     3883 non-null   object
dtypes: int64(1), object(1)
memory usage: 60.8+ KB


Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


# make dictionary for id2title and title2id

In [10]:
id_title = {}
for index, row in movie_title.iterrows():
    id_title[row[0]] = row[1].lower()
print(id_title)

title_id = {item:key for key, item in id_title.items()}
title_id

{1: 'toy story (1995)', 2: 'jumanji (1995)', 3: 'grumpier old men (1995)', 4: 'waiting to exhale (1995)', 5: 'father of the bride part ii (1995)', 6: 'heat (1995)', 7: 'sabrina (1995)', 8: 'tom and huck (1995)', 9: 'sudden death (1995)', 10: 'goldeneye (1995)', 11: 'american president, the (1995)', 12: 'dracula: dead and loving it (1995)', 13: 'balto (1995)', 14: 'nixon (1995)', 15: 'cutthroat island (1995)', 16: 'casino (1995)', 17: 'sense and sensibility (1995)', 18: 'four rooms (1995)', 19: 'ace ventura: when nature calls (1995)', 20: 'money train (1995)', 21: 'get shorty (1995)', 22: 'copycat (1995)', 23: 'assassins (1995)', 24: 'powder (1995)', 25: 'leaving las vegas (1995)', 26: 'othello (1995)', 27: 'now and then (1995)', 28: 'persuasion (1995)', 29: 'city of lost children, the (1995)', 30: 'shanghai triad (yao a yao yao dao waipo qiao) (1995)', 31: 'dangerous minds (1995)', 32: 'twelve monkeys (1995)', 33: 'wings of courage (1995)', 34: 'babe (1995)', 35: 'carrington (1995)', 3

{'toy story (1995)': 1,
 'jumanji (1995)': 2,
 'grumpier old men (1995)': 3,
 'waiting to exhale (1995)': 4,
 'father of the bride part ii (1995)': 5,
 'heat (1995)': 6,
 'sabrina (1995)': 7,
 'tom and huck (1995)': 8,
 'sudden death (1995)': 9,
 'goldeneye (1995)': 10,
 'american president, the (1995)': 11,
 'dracula: dead and loving it (1995)': 12,
 'balto (1995)': 13,
 'nixon (1995)': 14,
 'cutthroat island (1995)': 15,
 'casino (1995)': 16,
 'sense and sensibility (1995)': 17,
 'four rooms (1995)': 18,
 'ace ventura: when nature calls (1995)': 19,
 'money train (1995)': 20,
 'get shorty (1995)': 21,
 'copycat (1995)': 22,
 'assassins (1995)': 23,
 'powder (1995)': 24,
 'leaving las vegas (1995)': 25,
 'othello (1995)': 26,
 'now and then (1995)': 27,
 'persuasion (1995)': 28,
 'city of lost children, the (1995)': 29,
 'shanghai triad (yao a yao yao dao waipo qiao) (1995)': 30,
 'dangerous minds (1995)': 31,
 'twelve monkeys (1995)': 32,
 'wings of courage (1995)': 33,
 'babe (1995)

In [11]:
title_ratings = sorted_movie.rename(index=id_title).rename_axis("title")
title_ratings

title
follow the bitch (1998)                          5.0
smashing time (1967)                             5.0
criminal lovers (les amants criminels) (1999)    5.0
late bloomers (1996)                             5.0
gate of heavenly peace, the (1995)               5.0
                                                ... 
convent, the (convento, o) (1995)                3.0
war at home, the (1996)                          3.0
century (1993)                                   3.0
crude oasis, the (1995)                          3.0
macao (1952)                                     3.0
Name: count, Length: 3628, dtype: float64

# search my favorite movies and rating for it!

In [12]:
search_words = ["story", "star", "die hard", "mask", "apes"]

for key, item in id_title.items():
    for word in search_words:
        if word in item:
            print(key, item)

1 toy story (1995)
124 star maker, the (uomo delle stelle, l') (1995)
126 neverending story iii, the (1994)
131 frankie starlight (1995)
165 die hard: with a vengeance (1995)
197 stars fell on henrietta, the (1995)
260 star wars: episode iv - a new hope (1977)
295 pyromaniac's love story, a (1995)
316 stargate (1994)
329 star trek: generations (1994)
367 mask, the (1994)
800 lone star (1996)
844 story of xinghua, the (1993)
876 police story 4: project s (chao ji ji hua) (1993)
898 philadelphia story, the (1940)
1036 die hard (1988)
1038 unhook the stars (1996)
1140 entertaining angels: the dorothy day story (1996)
1196 star wars: episode v - the empire strikes back (1980)
1210 star wars: episode vi - return of the jedi (1983)
1356 star trek: first contact (1996)
1370 die hard 2 (1990)
1371 star trek: the motion picture (1979)
1372 star trek vi: the undiscovered country (1991)
1373 star trek v: the final frontier (1989)
1374 star trek: the wrath of khan (1982)
1375 star trek iii: the se

In [13]:
my_id = 6041
favorite_title_ratings = {
        'Toy Story (1995)' : 3, 
        'Jumanji (1995)': 3, 
        'Star Wars: Episode IV - A New Hope (1977)' : 4,
        "star wars: episode v - the empire strikes back (1980)" : 5,
        "star wars: episode vi - return of the jedi (1983)" : 4,
        'apollo 13 (1995)' : 4,
        "die hard: with a vengeance (1995)" : 5,
        "die hard (1988)" : 4,
        "die hard 2 (1990)" : 4,
        "planet of the apes (1968)" : 5,
        "beneath the planet of the apes (1970)" : 5,
        "star wars: episode i - the phantom menace (1999)" : 4
        }

In [14]:
favorite_id_ratings = {}

for key, item in favorite_title_ratings.items():
    favorite_id_ratings[title_id[key.lower()]] = item
print(favorite_id_ratings)

{1: 3, 2: 3, 260: 4, 1196: 5, 1210: 4, 150: 4, 165: 5, 1036: 4, 1370: 4, 2529: 5, 2530: 5, 2628: 4}


# append my favorite movies to ratings dataframe

In [15]:
import time
my_id = 6041
for key,item in favorite_id_ratings.items():
    row = dict()
    row["user_id"] = my_id
    row["movie_id"] = key
    row["count"] = item
    row["timestamp"] = int(time.time())
    print(row)
    ratings = ratings.append(row, ignore_index=True)
ratings.tail(20)

{'user_id': 6041, 'movie_id': 1, 'count': 3, 'timestamp': 1611817192}
{'user_id': 6041, 'movie_id': 2, 'count': 3, 'timestamp': 1611817192}
{'user_id': 6041, 'movie_id': 260, 'count': 4, 'timestamp': 1611817192}
{'user_id': 6041, 'movie_id': 1196, 'count': 5, 'timestamp': 1611817192}
{'user_id': 6041, 'movie_id': 1210, 'count': 4, 'timestamp': 1611817192}
{'user_id': 6041, 'movie_id': 150, 'count': 4, 'timestamp': 1611817192}
{'user_id': 6041, 'movie_id': 165, 'count': 5, 'timestamp': 1611817192}
{'user_id': 6041, 'movie_id': 1036, 'count': 4, 'timestamp': 1611817192}
{'user_id': 6041, 'movie_id': 1370, 'count': 4, 'timestamp': 1611817192}
{'user_id': 6041, 'movie_id': 2529, 'count': 5, 'timestamp': 1611817192}
{'user_id': 6041, 'movie_id': 2530, 'count': 5, 'timestamp': 1611817192}
{'user_id': 6041, 'movie_id': 2628, 'count': 4, 'timestamp': 1611817192}


Unnamed: 0,user_id,movie_id,count,timestamp
836470,6040,2028,5,956704519
836471,6040,1080,4,957717322
836472,6040,1089,4,956704996
836473,6040,1090,3,956715518
836474,6040,1094,5,956704887
836475,6040,562,5,956704746
836476,6040,1096,4,956715648
836477,6040,1097,4,956715569
836478,6041,1,3,1611817192
836479,6041,2,3,1611817192


# let's prepare compressed sparse matrix and fit it!

In [16]:
n_users = ratings["user_id"].nunique()
n_movies = ratings["movie_id"].nunique()
print("num of users : ", n_users)
print("num of movies : ", n_movies)
print(ratings["user_id"])
ratings.info()

num of users :  6040
num of movies :  3628
0            1
1            1
2            1
3            1
4            1
          ... 
836485    6041
836486    6041
836487    6041
836488    6041
836489    6041
Name: user_id, Length: 836490, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 836490 entries, 0 to 836489
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    836490 non-null  int64
 1   movie_id   836490 non-null  int64
 2   count      836490 non-null  int64
 3   timestamp  836490 non-null  int64
dtypes: int64(4)
memory usage: 25.5 MB


In [17]:
from scipy.sparse import csc_matrix

csr_ratings = csc_matrix((ratings["count"], 
                           (ratings["user_id"], ratings["movie_id"])))
print(csr_ratings.shape)
print(csr_ratings)


(6042, 3953)
  (1, 1)	5
  (6, 1)	4
  (8, 1)	4
  (9, 1)	5
  (10, 1)	5
  (18, 1)	4
  (19, 1)	5
  (21, 1)	3
  (23, 1)	4
  (26, 1)	3
  (28, 1)	3
  (34, 1)	5
  (36, 1)	5
  (38, 1)	5
  (44, 1)	5
  (45, 1)	4
  (48, 1)	4
  (49, 1)	5
  (51, 1)	5
  (56, 1)	5
  (60, 1)	4
  (65, 1)	5
  (68, 1)	3
  (73, 1)	3
  (75, 1)	5
  :	:
  (4751, 3952)	4
  (4790, 3952)	3
  (4802, 3952)	5
  (4816, 3952)	4
  (4823, 3952)	3
  (4831, 3952)	4
  (4834, 3952)	4
  (4858, 3952)	4
  (4939, 3952)	3
  (5049, 3952)	4
  (5074, 3952)	4
  (5087, 3952)	4
  (5100, 3952)	4
  (5205, 3952)	4
  (5304, 3952)	4
  (5333, 3952)	4
  (5359, 3952)	5
  (5405, 3952)	4
  (5475, 3952)	5
  (5602, 3952)	3
  (5682, 3952)	3
  (5812, 3952)	4
  (5831, 3952)	3
  (5837, 3952)	4
  (5998, 3952)	4


In [18]:
from implicit.als import AlternatingLeastSquares
import numpy as np
os.environ["OPENBLAS_NUM_THREADS"]="1"
os.environ["KMP_DUPLICATE_LIB_OK"]="True"
os.environ["MKL_NUM_THREADS"]="1"

In [19]:
als_model = AlternatingLeastSquares(factors=100, regularization=0.01,
                                   use_gpu=False, iterations=15,
                                   dtype=np.float32)
csr_ratings_transpose = csr_ratings.T
csr_ratings_transpose

<3953x6042 sparse matrix of type '<class 'numpy.int64'>'
	with 836490 stored elements in Compressed Sparse Row format>

In [20]:
als_model.fit(csr_ratings_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

# how much model can predict my preference of the movies
- in case of "toy story", i got a 0.37 points. but it doesn't seems looks good before comparing other movies.
- when i check my preference with love story, the model gave very poor points -0.00255
- this quite make sense because i don't like any romantic movies.

In [21]:
my_id, toy_story = 6041, title_id['Toy Story (1995)'.lower()]
my_vector, toy_story_vector = als_model.user_factors[my_id], als_model.item_factors[toy_story]
print(my_vector[:20])
print(toy_story_vector[:20])

[-1.4990188  -1.0219709  -0.02236764  0.2354831  -0.09320052 -0.9870749
 -0.14818364 -0.49465892  0.66265804 -0.87360334 -0.66472244  0.1171287
  0.26774102  0.40803337  0.07580981 -1.2760525  -0.8689597  -0.7640748
  1.4476918   0.8876018 ]
[-0.05072601 -0.01986537  0.00423385  0.01853906  0.04747562 -0.02147201
  0.00166034 -0.01409743 -0.02143396 -0.00285013  0.01907512  0.00469338
 -0.00049429 -0.00686765 -0.02648191  0.02330741 -0.00927056 -0.0067282
  0.009334    0.03106967]


In [22]:
np.dot(my_vector, toy_story_vector)

0.3604326

In [23]:
planet_apes = title_id["escape from the planet of the apes (1971)".lower()]
planet_apes_vector = als_model.item_factors[planet_apes]
np.dot(my_vector, planet_apes_vector)

0.26923147

In [24]:
shopping = title_id['shopping (1994)'.lower()]
shopping_vector = als_model.item_factors[shopping]
np.dot(my_vector, shopping_vector)

0.0023703463

In [25]:
love = title_id["pyromaniac's love story, a (1995)".lower()]
love_vector = als_model.item_factors[love]
np.dot(my_vector, love_vector)

-0.006573396

# Let's find simliar movies with my favorite one
- I love Jumanji, so i wanna find some similar movies like this.
- with my models i got a some of that, but i'm not sure these are really similar ones.
- because i nerver watched most of these movies.
- but when i set the movie "die hard", the model finds similar action movies like "terminator", "indiana jones".
- these kind of movies i aleady wathced, so i can be sure this model is good.

In [26]:
def find_similar_with_movie(favorite_movie):
    favorite_movie_id = title_id[favorite_movie.lower()]
    similar_movies = als_model.similar_items(favorite_movie_id, N=10)


    print("my favorit movie is ", favorite_movie)
    print("-------------------------")
    for idx, item in enumerate(similar_movies):
        if idx == 0:
            continue
        if idx == 1:
            print("the most similar movie is ")
        elif idx == 2:
            print("\nanother similar movie is")
        movie_title = id_title[item[0]]
        similarity = item[1]
        print("\t", movie_title, ", similarity : {0:.3f}".format(similarity))
    
find_similar_with_movie('Jumanji (1995)')

my favorit movie is  Jumanji (1995)
-------------------------
the most similar movie is 
	 hook (1991) , similarity : 0.820

another similar movie is
	 indian in the cupboard, the (1995) , similarity : 0.766
	 dragonheart (1996) , similarity : 0.742
	 flubber (1997) , similarity : 0.689
	 santa clause, the (1994) , similarity : 0.680
	 space jam (1996) , similarity : 0.641
	 neverending story ii: the next chapter, the (1990) , similarity : 0.639
	 small soldiers (1998) , similarity : 0.636
	 borrowers, the (1997) , similarity : 0.616


In [27]:
find_similar_with_movie("die hard (1988)")

my favorit movie is  die hard (1988)
-------------------------
the most similar movie is 
	 terminator, the (1984) , similarity : 0.695

another similar movie is
	 indiana jones and the last crusade (1989) , similarity : 0.685
	 untouchables, the (1987) , similarity : 0.587
	 fugitive, the (1993) , similarity : 0.578
	 raiders of the lost ark (1981) , similarity : 0.526
	 aliens (1986) , similarity : 0.498
	 hunt for red october, the (1990) , similarity : 0.497
	 rocky (1976) , similarity : 0.488
	 lethal weapon (1987) , similarity : 0.468


# find recommended movies for me!

- the most recommended movie for me is "back to the future"
- this is the one i love too!
- another movies likes "monty python", "alien" are also i wanna watch.
- i think this recommendatation model works pretty well!

In [28]:
def recommend_movie(my_id=my_id, csr_matrix=csr_ratings, N=10, filter_already_linked_itmes=True):
    
    movie_recommended = als_model.recommend(my_id, csr_matrix, N=N,
                                filter_already_liked_items=filter_already_linked_itmes)


    print("recommend movies for me!")
    print("-------------------------")
    for idx, item in enumerate(movie_recommended):
        if idx == 0:
            continue
        if idx == 1:
            print("the most recommended movie is ")
        elif idx == 2:
            print("\nanother recommended movie is")
        movie_title = id_title[item[0]]
        similarity = item[1]
        print("\t", movie_title, ", recommendation : {0:.3f}".format(similarity))
    

recommend_movie(my_id=10, csr_matrix=csr_ratings,N=10, filter_already_linked_itmes=True)

recommend movies for me!
-------------------------
the most recommended movie is 
	 mary poppins (1964) , recommendation : 1.289

another recommended movie is
	 absent minded professor, the (1961) , recommendation : 1.283
	 monty python and the holy grail (1974) , recommendation : 1.246
	 groundhog day (1993) , recommendation : 1.237
	 men in black (1997) , recommendation : 1.228
	 lady and the tramp (1955) , recommendation : 1.227
	 amadeus (1984) , recommendation : 1.221
	 chicken run (2000) , recommendation : 1.219
	 alien (1979) , recommendation : 1.205
