<a href="https://colab.research.google.com/github/JMeghana/Data_Science-Bootcamp-ML/blob/master/RecSysUsingHighSchoolMath.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Code partly inspired from Google's opensource codebase.
# Source: https://colab.research.google.com/github/google/eng-edu/blob/main/ml/recommendation-systems/recommendation-systems.ipynb?utm_source=ss-recommendation-systems&utm_campaign=colab-external&utm_medium=referral&utm_content=recommendation-systems#scrollTo=O3bcgduFo4s6
# This code is written for ease of understanding and not optimally from an engineering standpoint

In [None]:
import numpy as np
import pandas as pd
from urllib.request import urlretrieve
import zipfile


#Downloading movielens data...
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "movielens.zip")
zip_ref = zipfile.ZipFile('movielens.zip', "r")
zip_ref.extractall()

In [None]:
!ls -lrt

total 4820
drwxr-xr-x 1 root root    4096 Sep 26 13:45 sample_data
-rw-r--r-- 1 root root 4924029 Sep 29 11:42 movielens.zip
drwxr-xr-x 2 root root    4096 Sep 29 11:42 ml-100k


In [None]:
# Load each data set (users, movies, and ratings).
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'ml-100k/u.user', sep='|', names=users_cols, encoding='latin-1')

ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

# The movies file contains a binary feature for each genre.
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]
movies_cols = [
    'movie_id', 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols
movies = pd.read_csv(
    'ml-100k/u.item', sep='|', names=movies_cols, encoding='latin-1')


In [None]:
# ratings matrix
print(ratings.shape)

(100000, 4)


In [None]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [None]:
movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
# For each movie, get a set of users who rated it highly (4 or 5)
filtered_ratings = ratings[ratings['rating']>=4]
print(filtered_ratings.shape)

gp = filtered_ratings.groupby(['movie_id']).agg({'user_id': lambda x: set(x)}).reset_index() # reset_index used to make movie_id as a column instead of a Pandas index
print(gp)

(55375, 4)
      movie_id                                            user_id
0            1  {1, 2, 5, 6, 10, 16, 17, 18, 21, 23, 25, 38, 4...
1            2  {256, 640, 130, 387, 642, 648, 393, 276, 532, ...
2            3  {1, 130, 267, 523, 534, 663, 793, 923, 417, 55...
3            4  {514, 7, 10, 12, 13, 524, 16, 19, 532, 22, 543...
4            5  {256, 130, 388, 648, 776, 907, 270, 405, 406, ...
...        ...                                                ...
1442      1656                                              {883}
1443      1658                                              {894}
1444      1662                                              {782}
1445      1664                                    {880, 870, 782}
1446      1674                                              {840}

[1447 rows x 2 columns]


In [None]:
# Use only those movies that are liked by atleast 20 users: NON-Optimized code
for i in range(gp.shape[0]):
  if len((gp.user_id)[i]) < 20:
    gp.drop(i, axis=0, inplace=True)


In [None]:
gp.shape

(621, 2)

In [None]:
print(gp)

      movie_id                                            user_id
0            1  {1, 2, 5, 6, 10, 16, 17, 18, 21, 23, 25, 38, 4...
1            2  {256, 640, 130, 387, 642, 648, 393, 276, 532, ...
2            3  {1, 130, 267, 523, 534, 663, 793, 923, 417, 55...
3            4  {514, 7, 10, 12, 13, 524, 16, 19, 532, 22, 543...
4            5  {256, 130, 388, 648, 776, 907, 270, 405, 406, ...
...        ...                                                ...
1084      1119  {907, 398, 270, 659, 532, 790, 796, 416, 299, ...
1102      1137  {257, 399, 918, 919, 568, 184, 63, 192, 201, 7...
1107      1142  {130, 903, 136, 392, 144, 533, 793, 160, 292, ...
1152      1194  {655, 916, 406, 409, 286, 543, 553, 440, 321, ...
1155      1197  {144, 277, 151, 792, 919, 411, 160, 168, 178, ...

[621 rows x 2 columns]


In [None]:
type(gp.iloc[0]['user_id'])



set

In [None]:
# For each movie, get similar movieS using Intersection over union(IoU)
# Code written for understandability rather than speed.
# This takes a while to run

movie_ids = list(gp.movie_id) 
similar_movies= {} # empty dictionary

cnt =0;

for movie_id in movie_ids:
  print(movie_id)
  # compute the IoU for this movie_id and others
  l = [] # list of tuples of each movie and IoU
  for i in range(gp.shape[0]):
    x = gp[ gp['movie_id'] == movie_id ]['user_id'].to_list()[0]
    y = gp.iloc[i]['user_id']
    
    t = (gp.iloc[i]['movie_id'], len(x & y)/len(x | y)) # (movie_id_i,IoU)
    l.append(t)

  similar_movies[movie_id] = l





1
2
3
4
5
7
8
9
10
11
12
13
14
15
17
19
20
22
23
24
25
26
27
28
29
30
31
32
33
38
39
42
44
45
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
76
77
79
81
82
83
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
106
107
108
109
111
114
116
117
118
121
123
124
125
126
127
128
129
131
132
133
134
135
136
137
140
141
142
143
144
147
148
150
151
152
153
154
155
156
157
159
160
161
162
163
164
165
166
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
245
246
248
249
250
251
252
255
257
258
259
260
262
264
265
268
269
270
271
272
273
274
275
276
277
278
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
297
298
299
300
301
302
303
304
305
306
307
310
311
312
313
315
316
317
318
319
321
322
323
3

In [None]:
# similar movies
idx = 1

l = similar_movies[idx]

# sort by similarities
l.sort(key = lambda x: x[1], reverse=True)

print("Given movie:")
print(movies[movies['movie_id']==idx]["title"])
print("*****************************")
print("Top-5 similar movies:")
print(movies[movies['movie_id']==l[0][0]]["title"])
print(movies[movies['movie_id']==l[1][0]]["title"])
print(movies[movies['movie_id']==l[2][0]]["title"])
print(movies[movies['movie_id']==l[3][0]]["title"])
print(movies[movies['movie_id']==l[4][0]]["title"])

Given movie:
0    Toy Story (1995)
Name: title, dtype: object
*****************************
Top-5 similar movies:
0    Toy Story (1995)
Name: title, dtype: object
49    Star Wars (1977)
Name: title, dtype: object
180    Return of the Jedi (1983)
Name: title, dtype: object
173    Raiders of the Lost Ark (1981)
Name: title, dtype: object
171    Empire Strikes Back, The (1980)
Name: title, dtype: object


In [None]:
print(l)

[(5, 1.0), (218, 0.21568627450980393), (159, 0.2), (219, 0.1891891891891892), (54, 0.1791044776119403), (566, 0.16101694915254236), (356, 0.16), (581, 0.15789473684210525), (402, 0.1574074074074074), (200, 0.152317880794702), (164, 0.14814814814814814), (466, 0.14516129032258066), (447, 0.14285714285714285), (559, 0.14285714285714285), (928, 0.140625), (53, 0.13924050632911392), (67, 0.1388888888888889), (418, 0.13829787234042554), (77, 0.13636363636363635), (794, 0.13559322033898305), (217, 0.13513513513513514), (393, 0.13333333333333333), (939, 0.1323529411764706), (38, 0.13114754098360656), (550, 0.12903225806451613), (385, 0.12857142857142856), (66, 0.12727272727272726), (946, 0.12727272727272726), (17, 0.1267605633802817), (123, 0.12658227848101267), (578, 0.125), (815, 0.125), (825, 0.125), (125, 0.1232876712328767), (147, 0.12195121951219512), (684, 0.12121212121212122), (781, 0.12121212121212122), (234, 0.12060301507537688), (729, 0.12), (31, 0.11864406779661017), (161, 0.11851