# Movie Recommendation System Assignment

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install surprise


Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163000 sha256=0449ae6e8ba8b1a89e0ca7d0c5b3920e30298363697dafe2ec674f16539abc46
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [3]:
# Importing some required and common libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
movies = pd.read_csv("/content/drive/MyDrive/Recomm_data/movies.csv")   # data form the csv file
ratings = pd.read_csv("/content/drive/MyDrive/Recomm_data/ratings.csv")  # data from the csv file

In [5]:
ratings.head() # what's inside the data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [6]:
movies # what's inside the movies data

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


# The large dataset is causing the ram to overflow , that's why some measures have been taken to reduce the data size with proper reasoning. Here I am removing those user who has only rated 5 or  less number of movies.

In [7]:
# counting the the number of movies voted by each person
no_user_voted = ratings.groupby('movieId')['rating'].agg('count')


In [8]:
# printing it
no_user_voted

movieId
1         57309
2         24228
3         11804
4          2523
5         11714
          ...  
209157        1
209159        1
209163        1
209169        1
209171        1
Name: rating, Length: 59047, dtype: int64

In [9]:
# selecting only those user_id who have rated more than 5 movies
final_dataset = ratings.loc[no_user_voted[no_user_voted > 5].index,:]

In [10]:
final_dataset # modified dataset

Unnamed: 0_level_0,userId,movieId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
5,1,1088,4.0,1147868495
...,...,...,...,...
208385,1477,79287,2.0,1558559981
208715,1478,1193,5.0,1369508466
208737,1478,1259,4.5,1369508707
208747,1478,1307,4.5,1369508660


In [11]:
# the size of the modified dataset
final_dataset['movieId'].unique().shape

(5929,)

In [12]:
# this are the movies which are included in the final_dataset
movies_used = movies[movies["movieId"].isin(final_dataset['movieId'].unique())].reset_index(drop = True)

In [13]:
import pandas as pd # importing pandas
from sklearn.feature_extraction.text import TfidfVectorizer # importing TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity  # importing the cosine similarity


movies_used['genres'] = movies_used['genres'].str.lower() # lower all the test so genres written in different capital letter are not different


tfidf = TfidfVectorizer(stop_words='english')  # considering only the english words and omitting the symbols
tfidf_matrix = tfidf.fit_transform(movies_used['genres'])  # fitting the genres data in the tfidfvectorizer and transforming it into a dataset


cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)  # consine similarity calculates the similarity of the given input and datapoints similar to those inputs

def get_recommendations(title, cosine_sim=cosine_sim): # functions to get top 10 similar movies by genre
    idx = movies_used[movies_used['title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)  ## sorting all the movies and recommending top 10
    sim_scores = sim_scores[1:11]  # Top 10 similar movies
    movie_indices = [i[0] for i in sim_scores]
    return movies_used['title'].iloc[movie_indices]


## printing the top 10 movies similar to toy story(1995)
movie_title = "Toy Story (1995)"
recommendations = get_recommendations(movie_title)
print("Recommendations for", movie_title, ":")
print(recommendations)


Recommendations for Toy Story (1995) :
1425                                       Antz (1998)
1929                                Toy Story 2 (1999)
2239    Adventures of Rocky and Bullwinkle, The (2000)
2365                  Emperor's New Groove, The (2000)
2674                             Monsters, Inc. (2001)
4163                            Shrek the Third (2007)
4419                    Tale of Despereaux, The (2008)
5275                             Boxtrolls, The (2014)
5709                                      Moana (2016)
5449                                 Inside Out (2015)
Name: title, dtype: object


In [14]:
## ## printing the top 10 movies similar to Jumanji (1995)
movie_title = "Jumanji (1995)"
recommendations = get_recommendations(movie_title)
print("Recommendations for", movie_title, ":")
recommendations

Recommendations for Jumanji (1995) :


50                     Indian in the Cupboard, The (1995)
100                     NeverEnding Story III, The (1994)
647                       Escape to Witch Mountain (1975)
1279            Darby O'Gill and the Little People (1959)
1316                                  Return to Oz (1985)
1371                        NeverEnding Story, The (1984)
1372    NeverEnding Story II: The Next Chapter, The (1...
2678    Harry Potter and the Sorcerer's Stone (a.k.a. ...
3917    Chronicles of Narnia: The Lion, the Witch and ...
4099                          Bridge to Terabithia (2007)
Name: title, dtype: object

## Most of the movies predicted by the cosine similarity are similary in genre , It is performing nicely according to the given task for a content based recommendation system.

In [15]:
final_dataset # modified dataset of ratings

Unnamed: 0_level_0,userId,movieId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
5,1,1088,4.0,1147868495
...,...,...,...,...
208385,1477,79287,2.0,1558559981
208715,1478,1193,5.0,1369508466
208737,1478,1259,4.5,1369508707
208747,1478,1307,4.5,1369508660


In [16]:
#importing the required libraries from suprise
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import CoClustering


reader = Reader(rating_scale=(0.5, 5.0))  # this code so that surprise can understand our rating properly
data = Dataset.load_from_df(final_dataset[['userId', 'movieId', 'rating']], reader) # Surprise dataset

# #  Spliting the Data in train and test
trainset, testset = train_test_split(data , test_size=0.2, random_state=42)


from surprise import SVD, KNNBasic, NMF, SlopeOne, CoClustering # importing various models to find which performs the best
from surprise import accuracy # importing accuracies to calculate the accuracy on the test model

# Choose Algorithms
algos = {
    "SVD": SVD(),
    "KNNBasic": KNNBasic(),
    "NMF": NMF(),
    "SlopeOne": SlopeOne(),
    "CoClustering": CoClustering()
}

# Training and Evaluating Models
for name, algo in algos.items():
    print("Training model:", name)
    algo.fit(trainset)
    predictions = algo.test(testset)
    print("RMSE for", name, ":", accuracy.rmse(predictions))

# Making Predictions
user_id = 1
item_id = 10
for name, algo in algos.items():
    pred = algo.predict(user_id, item_id)
    print('Prediction for user', user_id, 'on item', item_id, 'using', name, ':', pred.est)



Training model: SVD
RMSE: 0.9156
RMSE for SVD : 0.9156273551795081
Training model: KNNBasic
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0331
RMSE for KNNBasic : 1.0331037724284464
Training model: NMF
RMSE: 1.0281
RMSE for NMF : 1.0281246447829744
Training model: SlopeOne
RMSE: 1.0303
RMSE for SlopeOne : 1.0302816463729605
Training model: CoClustering
RMSE: 1.0229
RMSE for CoClustering : 1.0228675820546043
Prediction for user 1 on item 10 using SVD : 3.4331409321913733
Prediction for user 1 on item 10 using KNNBasic : 3.5696477044701056
Prediction for user 1 on item 10 using NMF : 3.5068017781133403
Prediction for user 1 on item 10 using SlopeOne : 3.418103448275862
Prediction for user 1 on item 10 using CoClustering : 3.548465474820397


In our case the SVD is giving the best performance

In [17]:

# this our trained SVD model from the algos dictionary
svd = algos["SVD"]


# Generating the predictions for all movies for the target user
user_id = 9
all_movie_ids = final_dataset['movieId'].unique()
user_movie_ids = final_dataset[final_dataset['userId'] == user_id]['movieId'].unique()

# Excluding movies already rated by the user
movies_to_predict = [movie_id for movie_id in all_movie_ids if movie_id not in user_movie_ids]

# Generating the  predictions
predictions = [svd.predict(user_id, movie_id) for movie_id in movies_to_predict]

# Sorting predictions by estimated rating (est)
predictions.sort(key=lambda x: x.est, reverse=True)

# Displaying the top 10 recommended movies
top_10_recommendations = predictions[:10]
for i, pred in enumerate(top_10_recommendations, 1):
    print(f"Rank {i}:  {movies[movies['movieId'] == pred.iid]['title'].values[0]}, Estimated Rating: {pred.est}")


Rank 1:  Dark Knight, The (2008), Estimated Rating: 4.917687576156508
Rank 2:  Good Will Hunting (1997), Estimated Rating: 4.6560864405980125
Rank 3:  Shawshank Redemption, The (1994), Estimated Rating: 4.650126098487555
Rank 4:  Gone Girl (2014), Estimated Rating: 4.645274822070393
Rank 5:  Player, The (1992), Estimated Rating: 4.589354774456076
Rank 6:  Big Lebowski, The (1998), Estimated Rating: 4.588862273275251
Rank 7:  Monty Python and the Holy Grail (1975), Estimated Rating: 4.587167184881679
Rank 8:  Lord of the Rings: The Two Towers, The (2002), Estimated Rating: 4.5815265370619915
Rank 9:  Children of Men (2006), Estimated Rating: 4.580352100180579
Rank 10:  King's Speech, The (2010), Estimated Rating: 4.569636983633563


# Comparing the two content and collaborative filtering method for user 1

### Below are the movies suggested to user1 on the basis of collaborative filtering methods means It recommends items that other users with similar tastes have liked or interacted with.


In [18]:
user_id = 1
all_movie_ids = final_dataset['movieId'].unique()
user_movie_ids = final_dataset[final_dataset['userId'] == user_id]['movieId'].unique()

# Excluding movies already rated by the user
movies_to_predict = [movie_id for movie_id in all_movie_ids if movie_id not in user_movie_ids]

# Generating the  predictions
predictions = [svd.predict(user_id, movie_id) for movie_id in movies_to_predict]

# Sorting predictions by estimated rating (est)
predictions.sort(key=lambda x: x.est, reverse=True)

collaborative_based_user1 = []
# Displaying the top 10 recommended movies
top_10_recommendations = predictions[:10]
for i, pred in enumerate(top_10_recommendations, 1):
    print(f"Rank {i}:  {movies[movies['movieId'] == pred.iid]['title'].values[0]}, Estimated Rating: {pred.est}")
    collaborative_based_user1.append(movies[movies['movieId'] == pred.iid]['title'].values[0])

Rank 1:  Monty Python and the Holy Grail (1975), Estimated Rating: 4.695369092198495
Rank 2:  Inception (2010), Estimated Rating: 4.464522760834631
Rank 3:  Godfather: Part II, The (1974), Estimated Rating: 4.441352078049603
Rank 4:  Citizen Kane (1941), Estimated Rating: 4.439355541875993
Rank 5:  Sixth Sense, The (1999), Estimated Rating: 4.437123909782233
Rank 6:  Django Unchained (2012), Estimated Rating: 4.40767735848711
Rank 7:  Shawshank Redemption, The (1994), Estimated Rating: 4.404689559442479
Rank 8:  Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966), Estimated Rating: 4.402264124592288
Rank 9:  Princess Bride, The (1987), Estimated Rating: 4.400774132473106
Rank 10:  Casablanca (1942), Estimated Rating: 4.400295717256189


### Suggesting movies based on the content based filtering means the user will be suggested those movies in which It recommends items that are similar to those previously liked by the user.


In [21]:
#  [movies_used['userId'] == 1]
user_1_genres = ""
for mov_ids in final_dataset[final_dataset['userId'] == 1]['movieId']:
  user_1_genres = user_1_genres+ " " + movies[movies['movieId'] == mov_ids]['genres'].values[0]

In [22]:
print("User 1 Genres :" )
print(user_1_genres)



User 1 Genres :
 Drama Drama Comedy|Drama|War Comedy|Musical|Romance Drama|Musical|Romance Comedy|Drama|Romance Drama|War Drama Adventure|Drama|War Crime|Film-Noir|Thriller Drama|Sci-Fi|Thriller Adventure|Comedy|Sci-Fi Adventure|Comedy|Sci-Fi|Western Drama|Fantasy|Mystery Adventure|Children|Fantasy Drama Drama|Musical Adventure|Drama|Mystery Action|Crime Comedy|Romance Comedy|Drama|War Comedy|Drama Drama Drama|Romance Drama|Musical|Romance Drama Drama Drama Comedy|Romance Drama Drama Documentary Comedy|Crime Drama|Romance Comedy|Crime Adventure|Fantasy Action|Adventure|Crime|Drama|Thriller Comedy|Drama|Romance Adventure|Animation|Children|Comedy Action|Adventure|Comedy|Fantasy Comedy|Drama|Romance Comedy|Crime|Drama|Mystery|Romance Comedy Drama Drama Comedy|Drama Drama Drama|Romance|Sci-Fi Drama Crime|Drama Drama Drama Drama Drama Drama Drama Drama|Romance Adventure|Animation|Children|Comedy|Musical|Romance Drama|Horror Comedy|Drama Drama Drama Adventure|Drama Drama|Thriller Action|Com

In [23]:
tfidf_matrix_user1 = tfidf.transform(pd.Series(user_1_genres))  #  transforming the genres data into a tf-idf dataset


cosine_similarity(tfidf_matrix, tfidf_matrix_user1)  # consine similarity calculates the similarity of the given input and datapoints similar to those inputs


sim_scores = list(enumerate(cosine_similarity(tfidf_matrix, tfidf_matrix_user1) ))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[1:11]  # Top 10 similar movies
movie_indices = [i[0] for i in sim_scores]
movies_used['title'].iloc[movie_indices]
print("Movies recommended to User_1 based on content he/she watched : ")
print("\n")
content_based_user1 = movies_used['title'].iloc[movie_indices]
print(content_based_user1)

Movies recommended to User_1 based on content he/she watched : 


9                   American President, The (1995)
46                         Mighty Aphrodite (1995)
49               Postman, The (Postino, Il) (1994)
77                          Beautiful Girls (1996)
146                 Something to Talk About (1995)
166                        Don Juan DeMarco (1995)
172    Eat Drink Man Woman (Yin shi nan nu) (1994)
212                           Nobody's Fool (1994)
263                        Corrina, Corrina (1994)
271                  It Could Happen to You (1994)
Name: title, dtype: object


In [24]:
print( "Collaborative based Recommendations for USER_1 : " , "\n")

print( pd.Series(collaborative_based_user1)  )
print("\n")
print("\n")
print( "Content based Recommendations USER_1 : " , "\n")
print(content_based_user1)

Collaborative based Recommendations for USER_1 :  

0               Monty Python and the Holy Grail (1975)
1                                     Inception (2010)
2                       Godfather: Part II, The (1974)
3                                  Citizen Kane (1941)
4                              Sixth Sense, The (1999)
5                              Django Unchained (2012)
6                     Shawshank Redemption, The (1994)
7    Good, the Bad and the Ugly, The (Buono, il bru...
8                           Princess Bride, The (1987)
9                                    Casablanca (1942)
dtype: object




Content based Recommendations USER_1 :  

9                   American President, The (1995)
46                         Mighty Aphrodite (1995)
49               Postman, The (Postino, Il) (1994)
77                          Beautiful Girls (1996)
146                 Something to Talk About (1995)
166                        Don Juan DeMarco (1995)
172    Eat Drink Man Woman (Yin 

## As seen both recommended movies are completely different , and that is the reason we use different recommendation method so that user can get the best movie recommended