# Movie Recommendation system

In [None]:
import pandas as pd
import numpy as np

: 

In [2]:
#Importing the datasets

movies = pd.read_csv("movies.csv")
ratings = pd.read_csv('rating.csv')

In [3]:
movies.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [4]:
movies["title"]

0                                Toy Story (1995)
1                                  Jumanji (1995)
2                         Grumpier Old Men (1995)
3                        Waiting to Exhale (1995)
4              Father of the Bride Part II (1995)
                          ...                    
9737    Black Butler: Book of the Atlantic (2017)
9738                 No Game No Life: Zero (2017)
9739                                 Flint (2017)
9740          Bungo Stray Dogs: Dead Apple (2018)
9741          Andrew Dice Clay: Dice Rules (1991)
Name: title, Length: 9742, dtype: object

In [5]:
movies['genres']

0       Adventure|Animation|Children|Comedy|Fantasy
1                        Adventure|Children|Fantasy
2                                    Comedy|Romance
3                              Comedy|Drama|Romance
4                                            Comedy
                           ...                     
9737                Action|Animation|Comedy|Fantasy
9738                       Animation|Comedy|Fantasy
9739                                          Drama
9740                               Action|Animation
9741                                         Comedy
Name: genres, Length: 9742, dtype: object

In [6]:
movies.shape

(9742, 3)

In [7]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [8]:
#Merging the two datasets on "movieTd"

movie_ratings = pd.merge(movies,ratings,on='movieId')
movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


### Data pre-processing using NLP

In [9]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
#Tokenizing the "genres" column
import re
def preprocess(text):
  text = re.sub('[^a-zA-Z0-9]', ' ',text)
  tokens = word_tokenize(text.lower())
  return " ".join(tokens)

movie_ratings["genres"] = movie_ratings["genres"].apply(preprocess)

In [11]:
#Removing year from movie title
def remove_year_slice(text):
  words = str(text).split()
  if words:
     return ' '.join(words[:-1])
movie_ratings["title"] = movie_ratings["title"].apply(remove_year_slice)

In [12]:
movie_ratings["title"]

0                                  Toy Story
1                                  Toy Story
2                                  Toy Story
3                                  Toy Story
4                                  Toy Story
                         ...                
100831    Black Butler: Book of the Atlantic
100832                 No Game No Life: Zero
100833                                 Flint
100834          Bungo Stray Dogs: Dead Apple
100835          Andrew Dice Clay: Dice Rules
Name: title, Length: 100836, dtype: object

In [13]:
movie_ratings.columns

Index(['movieId', 'title', 'genres', 'userId', 'rating', 'timestamp'], dtype='object')

In [14]:
movie_ratings=movie_ratings[[ 'title','genres', 'userId', 'rating']]

In [15]:
movie_ratings.isnull().sum()

title     0
genres    0
userId    0
rating    0
dtype: int64

In [16]:
movie_ratings.shape

(100836, 4)

In [17]:
movie_ratings.head()

Unnamed: 0,title,genres,userId,rating
0,Toy Story,adventure animation children comedy fantasy,1,4.0
1,Toy Story,adventure animation children comedy fantasy,5,4.0
2,Toy Story,adventure animation children comedy fantasy,7,4.5
3,Toy Story,adventure animation children comedy fantasy,15,2.5
4,Toy Story,adventure animation children comedy fantasy,17,4.5


### Filtering movies with more popularity

In [18]:
num_of_ratings = movie_ratings.groupby('title')["rating"].count().reset_index()

In [19]:
num_of_ratings.rename(columns={'rating':'num_of_ratings'},inplace=True)

In [20]:
num_of_ratings.shape

(9444, 2)

In [21]:
num_of_ratings.head()

Unnamed: 0,title,num_of_ratings
0,,4
1,'71,1
2,'Hellboy': The Seeds of Creation,1
3,'Round Midnight,2
4,'Salem's Lot,1


In [22]:
final = pd.merge(movie_ratings,num_of_ratings,on='title',how='left')
final.shape

(100836, 5)

In [23]:
final= final[final['num_of_ratings']>=10]
final.shape

(81761, 5)

In [24]:

final.drop_duplicates(['title','userId'],inplace=True)
final.shape

(81422, 5)

In [25]:
final.head()

Unnamed: 0,title,genres,userId,rating,num_of_ratings
0,Toy Story,adventure animation children comedy fantasy,1,4.0,215
1,Toy Story,adventure animation children comedy fantasy,5,4.0,215
2,Toy Story,adventure animation children comedy fantasy,7,4.5,215
3,Toy Story,adventure animation children comedy fantasy,15,2.5,215
4,Toy Story,adventure animation children comedy fantasy,17,4.5,215


### Collaborative Filtering

In [26]:
#Creating a pivot table
pivot_table = final.pivot_table(index='title',columns='userId',values='rating')
pivot_table = pivot_table.fillna(0)
pivot_table.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
10 Cloverfield Lane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
"10,000 BC",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Applying KNN to find similar movies

In [27]:
# Creating csr matrix to address data sparsity
from scipy.sparse import csr_matrix
csr_movie_ratings = csr_matrix(pivot_table.values)

In [28]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(metric='cosine',algorithm="brute")
model.fit(csr_movie_ratings)

In [29]:
movie_names = pivot_table.index

In [30]:
pivot_table.reset_index(drop=False)["title"].apply(preprocess)

0                        burbs the
1               500 days of summer
2              10 cloverfield lane
3       10 things i hate about you
4                        10 000 bc
                   ...            
2264                     zoolander
2265                      zootopia
2266                      existenz
2267                           xxx
2268                  three amigos
Name: title, Length: 2269, dtype: object

In [31]:
pivot_table

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
10 Cloverfield Lane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
"10,000 BC",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zoolander,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0
Zootopia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
eXistenZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
xXx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,2.0


In [79]:
def collaborative_recommend(movie_name):
    movie_name = str(movie_name).lower()  # Converting to lowercase for case-insensitive matching
    try:
        movie_id = np.where(pivot_table.reset_index(drop=False)["title"].apply(preprocess) == movie_name)[0][0]
        distance, suggestion = model.kneighbors(pivot_table.iloc[movie_id,:].values.reshape(1,-1), n_neighbors=6 )

        for i in range(len(suggestion)):
            movies = pivot_table.index[suggestion[i]]
            for j in movies:
                if j == movie_name:
                    print(f"You searched '{movie_name}'\n")
                    print("The suggestion movies are: \n")
                else:
                    print(j)
    except IndexError:
        return f"No close matches found for '{movie_name}'"

In [80]:
collaborative_recommend('mission impossible')

Mission: Impossible
Independence Day (a.k.a. ID4)
Jurassic Park
Twister
Rock, The
GoldenEye


### Content based filtering

In [35]:
content = final[['title','genres']].drop_duplicates()
content.head()

Unnamed: 0,title,genres
0,Toy Story,adventure animation children comedy fantasy
215,Jumanji,adventure children fantasy
325,Grumpier Old Men,comedy romance
384,Father of the Bride Part II,comedy
433,Heat,action crime thriller


In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
genre_matrix = vectorizer.fit_transform(content["genres"])

In [37]:
genre_matrix.astype("float32")

<2389x24 sparse matrix of type '<class 'numpy.float32'>'
	with 6812 stored elements in Compressed Sparse Row format>

### Using cosine similarity to find movies with similar genre

In [42]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(genre_matrix,genre_matrix)

In [58]:
# Reverse mapping of indices and movie titles
indices = pd.Series(content.index, index=content['title'].apply(preprocess)).drop_duplicates()

In [75]:
def content_recommend(movie):
    movies = []
    movie = str(movie).lower()
    idx = indices[movie]
    if idx is None:
        return "No close matches found for '{movie}'"
    sim_scores = list(enumerate(similarity[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]
    print("The suggestions are:\n ")
    for j in content['title'].iloc[movie_indices]:
            print(j)

In [94]:
content_recommend('mission impossible')

IndexError: index 17397 is out of bounds for axis 0 with size 2389

### Hybrid Recommender

In [97]:
def hybrid_recommendation(movie_name):

  movie_name = str(movie_name).lower()  # Convert to lowercase for case-insensitive matching
  try:
    movie_id = np.where(pivot_table.reset_index(drop=False)["title"].apply(preprocess) == movie_name)[0][0]
    distance, suggestion = model.kneighbors(pivot_table.iloc[movie_id, :].values.reshape(1, -1), n_neighbors=6)
    for i in range(len(suggestion)):
            movies = pivot_table.index[suggestion[i]].tolist()
    try:
      idx = indices[movie_name]
      if idx is not None:
        sim_scores = list(enumerate(similarity[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:6]
        movie_indices = [i[0] for i in sim_scores]
        content_movies = content['title'].iloc[movie_indices].tolist()

    except IndexError:
      content_movies = []
    # Combine recommendations
    combined_movies = list(set(movies) | set(content_movies))

    print(f"You searched '{movie_name}'\n")

    if not combined_movies:
      print(f"No close matches found for '{movie_name}'")
    else:
      print("The suggestions are:\n")
      for movie in combined_movies:
        print(movie)

  except IndexError:
    print(f"No close matches found for '{movie_name}'")

In [99]:
movie_name = input("Enter a movie: ")
hybrid_recommendation(movie_name)

Enter a movie: toy story
You searched 'toy story'

The suggestions are:

Antz
Independence Day (a.k.a. ID4)
Forrest Gump
Monsters, Inc.
Emperor's New Groove, The
Star Wars: Episode IV - A New Hope
Shrek the Third
Jurassic Park
Toy Story
Toy Story 2
