# Importing Libraries and reading the dataset

In [1]:
import pandas as pd

## 1. MovieLens dataset

In [2]:
df = pd.read_csv('merged_data.csv')
df.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,16,4.0,Casino (1995),Crime|Drama
1,1,4993,4.5,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
2,1,4963,3.5,Ocean's Eleven (2001),Crime|Thriller
3,1,4306,4.0,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...
4,1,4262,5.0,Scarface (1983),Action|Crime|Drama


## 2. Movies dataset

In [3]:
df2 = pd.read_csv('movies.csv')
df2.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


# Displaying the details about the dataframe

## 1. MovieLens dataset

In [4]:
df.shape

(105339, 5)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   105339 non-null  int64  
 1   movieId  105339 non-null  int64  
 2   rating   105339 non-null  float64
 3   title    105339 non-null  object 
 4   genres   105339 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB


## 2. Movies dataset

In [6]:
df2.shape

(10329, 3)

In [7]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB


## Total unique users

In [8]:
print("Total unique users: ",len(df['userId'].unique()))

Total unique users:  668


## Total unique movies

In [9]:
print("Total unique users: ",len(df['movieId'].unique()))

Total unique users:  10325


## Total unique genres

In [10]:
genres = []
for genre in df.genres:
    x = genre.split('|')
    for i in x:
        if(i not in genres):
            genres.append(str(i))

In [11]:
print("All unique genres: \n")
print(genres)

All unique genres: 

['Crime', 'Drama', 'Adventure', 'Fantasy', 'Thriller', 'Animation', 'Children', 'Comedy', 'Romance', 'Action', 'War', 'Horror', 'Mystery', 'Sci-Fi', 'IMAX', 'Documentary', 'Film-Noir', 'Western', 'Musical', '(no genres listed)']


In [12]:
print("Total unique genres: ",len(genres))

Total unique genres:  20


# Content based filering

## Choosing random user for recommendation

In [13]:
import random
user = random.randint(1,668)
print("User(User ID) chosen for recommendation: ",user)

User(User ID) chosen for recommendation:  85


## Getting details about user- his seen movies and highest rated movie genres

In [14]:
user1_movies = list(df['movieId'][df['userId'] == user])
print("Total movies seen by user: {}".format(len(user1_movies)))

Total movies seen by user: 51


In [15]:
max_rating = df['rating'][df['userId'] == user].max()
print("Maximum rating given by user: ", max_rating)

Maximum rating given by user:  5.0


## Getting top genres of the user's highest rated movies

In [16]:
l = list(df['genres'][(df['userId'] == user) & (df['rating'] == max_rating)])
genres = {}
for i in l:
    split_list = i.split('|')
    for j in split_list:
        if j not in genres:
            genres[j] = 1
        else:
            genres[j] += 1
genres = dict(sorted(genres.items(), key=lambda item: item[1]))
genres_list = list(genres.keys())[::-1]
print("Genres of all highest rated movies by the user:\n",genres_list)

Genres of all highest rated movies by the user:
 ['Drama', 'Fantasy', 'Adventure', 'Crime', 'Action', 'Thriller', 'Comedy', 'War', 'Romance', 'Sci-Fi', 'Western']


In [17]:
print("Genres with their count:")
print(genres)

Genres with their count:
{'Western': 1, 'Sci-Fi': 1, 'Romance': 1, 'War': 1, 'Comedy': 2, 'Thriller': 3, 'Action': 4, 'Crime': 4, 'Adventure': 4, 'Fantasy': 4, 'Drama': 8}


In [18]:
top_genres = genres_list[:min(4,len(genres_list))]
print("Top {} Genres: {}".format(len(top_genres),top_genres))

Top 4 Genres: ['Drama', 'Fantasy', 'Adventure', 'Crime']


# Calculating similarity of all the movies with respect to the top genres obtained 

## Movies dataframe

In [19]:
df2.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## Initializing the similarity column with 0 for all the movies

In [20]:
df2['Similarity'] = 0.0

In [21]:
df2.head()

Unnamed: 0,movieId,title,genres,Similarity
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0


## Defining cosine similarity function for obtaining the most similar movies to recommend to the user

In [22]:
def cosine_similarity(a,b):
    a = a.split('|')
    li = list(set(a + b))
    a1 = []
    b1 = []
    for i in li:
        a1.append(a.count(i))
        b1.append(b.count(i))
    
    num = 0
    d1 = 0
    d2 = 0
    for j in range(len(a1)):
        num += a1[j] * b1[j]
        d1 += a1[j] ** 2
        d2 += b1[j] ** 2
    d1 = d1 ** 0.5
    d2 = d2 ** 0.5
    return num/(d1*d2)

In [23]:
for z in range(df2.shape[0]):
    if df2['movieId'][z] not in user1_movies:
        df2.at[z,'Similarity'] = cosine_similarity(df2['genres'][z],top_genres)

## Sorting the dataframe by Similarity in descending order and recommending top 10 movies which are most similar

In [24]:
sorteddf = df2.sort_values(by = 'Similarity',ascending=False)
sorteddf[:10]

Unnamed: 0,movieId,title,genres,Similarity
1703,2147,"Clan of the Cave Bear, The (1986)",Adventure|Drama|Fantasy,0.866025
7719,59387,"Fall, The (2006)",Adventure|Drama|Fantasy,0.866025
8839,83480,Season of the Witch (2011),Adventure|Drama|Fantasy,0.866025
2710,3418,Thelma & Louise (1991),Adventure|Crime|Drama,0.866025
5616,8341,Oliver Twist (1948),Adventure|Crime|Drama,0.866025
7980,65359,Earthsea (Legend of Earthsea) (2004),Adventure|Drama|Fantasy,0.866025
9660,102194,Mud (2012),Adventure|Crime|Drama,0.866025
6862,41863,"Three Burials of Melquiades Estrada, The (2006)",Adventure|Crime|Drama,0.866025
2342,2931,Time of the Gypsies (Dom za vesanje) (1989),Comedy|Crime|Drama|Fantasy,0.75
1818,2297,What Dreams May Come (1998),Adventure|Drama|Fantasy|Romance,0.75


## Resetting the index

In [25]:
sorteddf.reset_index(drop=True, inplace=True)

# Results: Final Recommendation

In [26]:
print('\033[1m' + 'Recommended movies for user id: ' + str(user) + '\033[0m' + "\n")

for i in range(10):
    print('\033[1m' + f'{i+1}.  ' + sorteddf['title'][i]  + '\033[0m')

[1mRecommended movies for user id: 85[0m

[1m1.  Clan of the Cave Bear, The (1986)[0m
[1m2.  Fall, The (2006)[0m
[1m3.  Season of the Witch (2011)[0m
[1m4.  Thelma & Louise (1991)[0m
[1m5.  Oliver Twist (1948)[0m
[1m6.  Earthsea (Legend of Earthsea) (2004)[0m
[1m7.  Mud (2012)[0m
[1m8.  Three Burials of Melquiades Estrada, The (2006)[0m
[1m9.  Time of the Gypsies (Dom za vesanje) (1989)[0m
[1m10.  What Dreams May Come (1998)[0m


## Comparing genres of users top rated seen movies and recommend movies

In [27]:
print("User's favourite genres: ")
top_genres

User's favourite genres: 


['Drama', 'Fantasy', 'Adventure', 'Crime']

In [28]:
rec_genres = []
for genre in range(10):
    x = sorteddf['genres'][genre].split('|')
    for i in x:
        if(i not in rec_genres):
            rec_genres.append(str(i))
            
print("Genres of recommend movies: ")
rec_genres

Genres of recommend movies: 


['Adventure', 'Drama', 'Fantasy', 'Crime', 'Comedy', 'Romance']