This is a basic recommender system implementation that uses content-based filtering to suggest movies.

In [40]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Load the movie catalogue (one row per movie) and preview the first rows.
df = pd.read_csv("movies.csv")
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
df.shape

(9742, 3)

In [13]:
df.info()     # title and genres are strings (object dtype), while movieId is an integer (int64)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [15]:
df.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [17]:
df.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [28]:
# Combine movieId, title and genres into one text string per movie; the
# similarity between movies is later computed from these strings.
# The strings are built with vectorized concatenation, so the result follows the
# row order of df and does not rely on the index labels being 0..n-1.
# NOTE(review): movieId is an arbitrary identifier; as a text token it can
# collide with other numeric tokens such as release years in titles.
# Consider dropping it from the features.
features = (df["movieId"].astype(str) + " " + df["title"] + " " + df["genres"]).tolist()

In [30]:
features[:5]

['1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy',
 '2 Jumanji (1995) Adventure|Children|Fantasy',
 '3 Grumpier Old Men (1995) Comedy|Romance',
 '4 Waiting to Exhale (1995) Comedy|Drama|Romance',
 '5 Father of the Bride Part II (1995) Comedy']

In [34]:
# Store the combined text as a new column so it can be vectorized straight from the frame.
df["features"] = features

In [36]:
df.head()

Unnamed: 0,movieId,title,genres,features
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1 Toy Story (1995) Adventure|Animation|Childre...
1,2,Jumanji (1995),Adventure|Children|Fantasy,2 Jumanji (1995) Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,3 Grumpier Old Men (1995) Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,4 Waiting to Exhale (1995) Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy,5 Father of the Bride Part II (1995) Comedy


In [46]:
# Vectorize the data: every feature string is turned into a vector of token counts.
# The result is a numerical matrix with one row per movie.
vectorized_data = CountVectorizer().fit_transform(df["features"])

In [54]:
# Cosine similarity measures how similar every pair of movie vectors is.
# For these count vectors it ranges from 0 (no similarity) to 1 (perfect similarity).
# rows    = movies [i]
# columns = movies [j]
# entry [i, j] is how similar movie i and movie j are to each other :)
recommender_core = cosine_similarity(vectorized_data)

In [72]:
print(recommender_core)

[[1.         0.63245553 0.28867513 ... 0.         0.11785113 0.1118034 ]
 [0.63245553 1.         0.18257419 ... 0.         0.         0.        ]
 [0.28867513 0.18257419 1.         ... 0.         0.         0.12909944]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.11785113 0.         0.         ... 0.         1.         0.        ]
 [0.1118034  0.         0.12909944 ... 0.         0.         1.        ]]


In [80]:
recommender_core.shape  # square matrix: every movie appears once as a row and once as a column
# recommender_core[i] holds the similarity of movie i to every movie (itself included)

(9742, 9742)

In [86]:
recommender_core[0]

array([1.        , 0.63245553, 0.28867513, ..., 0.        , 0.11785113,
       0.1118034 ])

In [106]:
df["movieId"][3819]  # movieId of the row whose index label is 3819

5349

In [108]:
df["title"][3819]  # title of the row whose index label is 3819

'Spider-Man (2002)'

In [142]:
# Look up a title by movieId; the label printed next to the title is that movie's row index in df
df.loc[df["movieId"] == 5349 , "title"] 

3819    Spider-Man (2002)
Name: title, dtype: object

In [136]:
def get_similarity(item):
    """Sort key: pick the similarity score out of an ``(index, similarity)`` pair."""
    similarity = item[1]
    return similarity

# Rank every movie by its similarity to row 3819 (Spider-Man), best match first.
# Row 3819 itself comes out on top with a score of 1: a movie is perfectly similar to itself.

sorted_list = list(enumerate(recommender_core[3819]))
sorted_list.sort(key=get_similarity, reverse=True)
sorted_list[:10]

[(3819, 1.0000000000000002),
 (6470, 0.7378647873726218),
 (3768, 0.7071067811865475),
 (5260, 0.6666666666666667),
 (7324, 0.6324555320336759),
 (4118, 0.6299407883487119),
 (7927, 0.6030226891555273),
 (385, 0.5892556509887895),
 (1194, 0.5892556509887895),
 (2712, 0.5892556509887895)]

In [168]:
def find_movie_info(index):
    """Return the id, title and genre list of the movie at row position ``index``.

    ``index`` is a 0-based row position in ``df``, i.e. the same position the
    movie has in the rows and columns of ``recommender_core``. The row is read
    with ``.iloc`` so the lookup does not depend on the index labels of ``df``
    and does not need to search the frame by ``movieId``.

    Parameters
    ----------
    index : int
        Row position of the movie in ``df``.

    Returns
    -------
    tuple
        ``(movie_id, movie_title, movie_genres)``, where ``movie_genres`` is
        the ``genres`` string split on ``"|"`` into a list.

    Raises
    ------
    IndexError
        If ``index`` is outside the bounds of ``df``.
    """
    movie_id = df["movieId"].iloc[index]
    movie_title = df["title"].iloc[index]
    # genres are stored as a single "A|B|C" string; return them as a list
    movie_genres = df["genres"].iloc[index].split("|")

    return movie_id, movie_title, movie_genres

In [178]:
def suggest_movie(index, top_n=10):
    """Return the movies most similar to the movie at row position ``index``.

    The similarity row ``recommender_core[index]`` is ranked from most to least
    similar and the details of the best ``top_n`` entries are looked up with
    ``find_movie_info``. The query movie itself is part of the ranking (it has
    similarity 1 with itself), so it normally appears among the results.

    Parameters
    ----------
    index : int
        Row position of the query movie in ``recommender_core``.
    top_n : int, optional
        Maximum number of ranked entries to return (default 10).

    Returns
    -------
    list
        The values returned by ``find_movie_info`` for the best matches, most
        similar first. Entries whose lookup fails are left out, so the list
        can be shorter than ``top_n``.
    """
    ranked = sorted(enumerate(recommender_core[index]), key=get_similarity, reverse=True)

    similar_movies = []
    for position, _similarity in ranked[:top_n]:
        try:
            similar_movies.append(find_movie_info(position))
        except (KeyError, IndexError):
            # Best effort: skip a row whose details cannot be looked up, but let
            # every other error (bugs, interrupts, ...) surface instead of hiding it.
            continue

    return similar_movies

In [182]:
# Now the system can suggest movies for any row index you want; index 1 is Jumanji, so here it suggests movies based on Jumanji! :)
suggest_movie(1)

[(2, 'Jumanji (1995)', ['Adventure', 'Children', 'Fantasy']),
 (1,
  'Toy Story (1995)',
  ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']),
 (2103, 'Tall Tale (1995)', ['Adventure', 'Children', 'Fantasy', 'Western']),
 (158, 'Casper (1995)', ['Adventure', 'Children']),
 (146, 'Amazing Panda Adventure, The (1995)', ['Adventure', 'Children']),
 (8, 'Tom and Huck (1995)', ['Adventure', 'Children']),
 (13, 'Balto (1995)', ['Adventure', 'Animation', 'Children']),
 (243, 'Gordy (1995)', ['Children', 'Comedy', 'Fantasy']),
 (130450, 'Pan (2015)', ['Adventure', 'Children', 'Fantasy']),
 (60,
  'Indian in the Cupboard, The (1995)',
  ['Adventure', 'Children', 'Fantasy'])]