# Recommender System on Movies
* types
1. content based: recommends items whose content is similar to the user's current activity; tags are created to describe each item.
2. collaborative based: based on the user's interests; content is recommended according to the similarity between users (posts, feeds, etc.)
3. hybrid: both approaches are used.


# Flow

Data -> Preprocessing -> model -> website -> deploy

In [None]:
import numpy as np
import pandas as pd

In [None]:
movies = pd.read_csv('/content/drive/MyDrive/movies_dataset/tmdb_5000_movies.csv')
credits = pd.read_csv('/content/drive/MyDrive/movies_dataset/tmdb_5000_credits.csv')

In [None]:
movies.head(1)

In [None]:
credits.head(1)
# credits.head(1)['cast'].values

In [None]:
# we need to merge the two dataframes to get one on the basis of one column
# NOTE(review): an inner merge on 'title' pairs every row of `movies` with every
# row of `credits` carrying the same title, so any repeated title multiplies rows.
# Compare movies.shape before/after; TODO confirm whether a unique id column
# exists in both files that could be used as the join key instead.
movies = movies.merge(credits, on = 'title')

In [None]:
movies.shape

In [None]:
credits.shape

In [None]:
movies.head()

In [None]:
movies.info()

In [None]:
# crucial columns for content based recommending system as we create tags
# genre
# keywords
# title
# overview
# cast
# crew

movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [None]:
movies.head()

In [None]:
# we have to generate a new dataframe of 3 columns- id, title, tags
# tags = overview + genre + keywords + cast(top three cast) + crew(dir)
# for each movie we'll get a paragraph consisting of this info
# perform data preprocessing to fill in missing data, correct the format

In [None]:
# missing data
movies.isnull().sum()

In [None]:
# we got 3 such movies whose overview does not exist = drop them
# Rebind instead of using inplace=True: `movies` was produced by a column
# selection, and mutating such a derived frame in place can raise pandas'
# SettingWithCopyWarning.
# reset_index keeps the row labels equal to the row positions, which matters
# later because the similarity matrix is addressed by position.
movies = movies.dropna().reset_index(drop = True)

In [None]:
# checking duplicate data
movies.duplicated().sum()


In [None]:
movies.iloc[0].genres

In [None]:
# [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]
# to
# ['Action', 'Adventure', 'Fantasy', 'Science Fiction']

def convert(obj):
  """Return the 'name' value of every entry in a stringified list of dicts.

  `obj` is the text form of a list of dictionaries (as stored in the CSV
  columns); it is parsed with ast.literal_eval and the names are returned
  in their original order.
  """
  return [entry['name'] for entry in ast.literal_eval(obj)]


In [None]:
# first convert the string of list into list then it will be in correct format for next conversion
import ast
ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

In [None]:
convert('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

In [None]:
movies['genres'] = movies['genres'].apply(convert)

In [None]:
movies.head()

In [None]:
movies['keywords'] = movies['keywords'].apply(convert)

In [None]:
movies.head()

In [None]:
movies['cast'][0]

In [None]:
# for getting first 3 cast members

def convert3(obj):
  """Return the 'name' of at most the first three entries of a stringified list of dicts.

  `obj` is parsed with ast.literal_eval; entries beyond the third are ignored.
  """
  leading_entries = ast.literal_eval(obj)[:3]
  return [entry['name'] for entry in leading_entries]

In [None]:
movies['cast'] = movies['cast'].apply(convert3)

In [None]:
movies.head()

In [None]:
movies['crew'][0]

In [None]:
def fetch_dir(obj):
  """Return a one-element list with the name of the first crew entry whose job is 'Director'.

  `obj` is a stringified list of dicts parsed with ast.literal_eval.
  An empty list is returned when no entry has the job 'Director'.
  """
  for member in ast.literal_eval(obj):
    if member['job'] == 'Director':
      # only the first director is kept
      return [member['name']]
  return []

In [None]:
movies['crew'] = movies['crew'].apply(fetch_dir)

In [None]:
movies.head()

In [None]:
# overview is a string; it must be converted to a list to concatenate it with the others
movies['overview'][0]

In [None]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [None]:
movies.head()

* at this point every column is a list; these lists will be merged into a single list, which is then converted to a string -> paragraph -> this will serve as our tags column

In [None]:
# remove the spaces inside each entry so that a multi-word genre, keyword or
# person name is treated as one single token
for column in ['genres', 'keywords', 'cast', 'crew']:
  movies[column] = movies[column].apply(lambda entries: [entry.replace(" ", "") for entry in entries])

In [None]:
movies.head()

In [None]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [None]:
movies.head()

In [None]:
movies['tags'][0]

In [None]:
# Take an explicit copy: the 'tags' column of new_df is reassigned several
# times below, and doing that on a slice of `movies` triggers pandas'
# SettingWithCopyWarning.
new_df = movies[['movie_id','title','tags']].copy()

In [None]:
new_df

In [None]:
# tags list to string
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

In [None]:
new_df.head()

In [None]:
new_df['tags'][0]

In [None]:
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

In [None]:
new_df.head()

In [None]:
new_df['tags'][0]

# Text Vectorization
* calculate similarity between the tags or data
* each text converts to vector
* closest vectors movies will get recommended upon the input of the user
* technique: Bag of Words - combine all the tags -> one large text -> pick the n most frequent words -> for each movie, count how many times each of those words occurs in its tags -> this gives a table of counts in which each row is the 'vector' for that movie in n-dimensional space -> size (5000, n) (movies, words)
* stop words (words that are only useful for English sentence formation) must be ignored during vectorization.


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000, stop_words='english')

In [None]:
# by default there will be many zero values
# fit_transform returns a sparse matrix, which we explicitly convert to a numpy array

vectors = cv.fit_transform(new_df['tags']).toarray()

In [None]:
vectors

In [None]:
# for movie 1 the vector is
vectors[0]

In [None]:
cv.get_feature_names_out()

In [None]:
# apply stemming for similar words that occur - actor-actors, love-loving-loved etc
import nltk

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
def stem(text):
  """Stem every whitespace-separated word of `text` with `ps` and re-join them with single spaces."""
  return " ".join(ps.stem(word) for word in text.split())

In [None]:
stem('in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron')

In [None]:
new_df['tags'] = new_df['tags'].apply(stem)
# `vectors` was built from the unstemmed tags; rebuild it here so that the
# stemming actually reaches the feature names and the similarity computation.
vectors = cv.fit_transform(new_df['tags']).toarray()

In [None]:
cv.get_feature_names_out()

* now calculate the distance between each movie's vector and every other movie's vector
* do not use Euclidean distance
* use cosine distance, which measures the angle between two vectors
 * the smaller the angle, the more similar the movies are
 * Euclidean distance becomes unreliable in high-dimensional problems


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# gives a similarity score
similarity = cosine_similarity(vectors)

In [None]:
similarity

In [None]:
# to maintain the order of the movies and their similarity with each other
# distance of 0th movie with itself, then with 1st, then 2nd
# sorting we are performing on the basis of second number that is the similarity
sorted(list(enumerate(similarity[0])), reverse = True, key = lambda x:x[1])[1:6]

In [None]:
sorted(similarity[0], reverse = True)

In [None]:
# to fetch the index of the movie from the dataframe
new_df[new_df['title'] == 'Batman Begins'].index[0]

In [None]:
def recommend(movie):
  """Print the titles of the five movies most similar to `movie`.

  Looks `movie` up by exact title in `new_df`, reads the matching row of the
  `similarity` matrix and prints the five highest-scoring other titles.

  Raises:
    IndexError: if no row of `new_df` has the given title.
  """
  # `similarity` is addressed by row position, so locate the position of the
  # title rather than its index label (the two differ once rows have been
  # dropped without resetting the index).
  positions = (new_df['title'] == movie).to_numpy().nonzero()[0]
  if len(positions) == 0:
    raise IndexError(f"movie {movie!r} not found in new_df['title']")
  movie_index = positions[0]
  distances = similarity[movie_index]

  # exclude the query movie explicitly instead of assuming it sorts first
  candidates = [(i, score) for i, score in enumerate(distances) if i != movie_index]
  movies_list = sorted(candidates, reverse = True, key = lambda x:x[1])[:5]

  for i, _ in movies_list:
    print(new_df.iloc[i].title)

In [None]:
recommend('Pirates of the Caribbean: At World\'s End')

In [None]:
# to send this list into our code for the website
import pickle

In [None]:
# use a context manager so the file is flushed and closed once the dump finishes
with open('movies.pkl', 'wb') as movies_file:
  pickle.dump(new_df, movies_file)

In [None]:
# use a context manager so the file is flushed and closed once the dump finishes
with open('similarity.pkl', 'wb') as similarity_file:
  pickle.dump(similarity, similarity_file)