# RECOMMENDATION ENGINE FOR TOP IMDB MOVIES

In [1]:
## Importing the necessary Libraries 

import pandas as pd
import re
import time
import json
import ast
import random

## The Dataset<br>
**This dataset is created by me using the TMDB api:<br>
It has the following columns**<br>

1. **movie_id** :A unique movie id assigned by TMDB
2. **title** : Movie title in english
3. **overview** : A small description of the movie
4. **original_language** : Language spoken in movie
5. **release_date** : Release date of the movie
6. **vote_average** : Average voting of the movie as per TMDB
7. **cast** : A list of details of actors playing in movie
8. **crew** : A list of the details of crew members of the movie
9. **Genre** : List of genre of the movie
10. **Keywords** : Keywords (highlighted words) of the movie

In [2]:
movies = pd.read_csv('./Full_Movies_with_keywords_final.csv')

In [3]:
movies

Unnamed: 0.1,Unnamed: 0,movie_id,title,overview,original_language,release_date,vote_average,cast,crew,Genre,keywords
0,0,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",en,1972-03-14,8.7,"[{'adult': False, 'gender': 2, 'id': 3084, 'kn...","[{'adult': False, 'gender': 2, 'id': 154, 'kno...","['Drama', 'Crime']","[{'id': 131, 'name': 'italy'}, {'id': 697, 'na..."
1,1,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,en,1994-09-23,8.7,"[{'adult': False, 'gender': 2, 'id': 504, 'kno...","[{'adult': False, 'gender': 2, 'id': 153, 'kno...","['Drama', 'Crime']","[{'id': 378, 'name': 'prison'}, {'id': 417, 'n..."
2,2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,en,1974-12-20,8.6,"[{'adult': False, 'gender': 2, 'id': 1158, 'kn...","[{'adult': False, 'gender': 2, 'id': 154, 'kno...","['Drama', 'Crime']","[{'id': 131, 'name': 'italy'}, {'id': 700, 'na..."
3,3,424,Schindler's List,The true story of how businessman Oskar Schind...,en,1993-12-15,8.6,"[{'adult': False, 'gender': 2, 'id': 3896, 'kn...","[{'adult': False, 'gender': 2, 'id': 491, 'kno...","['Drama', 'History', 'War']","[{'id': 818, 'name': 'based on novel or book'}..."
4,4,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",hi,1995-10-19,8.6,"[{'adult': False, 'gender': 2, 'id': 35742, 'k...","[{'adult': False, 'gender': 1, 'id': 8311, 'kn...","['Comedy', 'Drama', 'Romance']",[]
...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,9037,Grease 2,"It's 1961, two years after the original Grease...",en,1982-06-11,5.1,"[{'adult': False, 'gender': 2, 'id': 101908, '...","[{'adult': False, 'gender': 0, 'id': 10632, 'k...","['Comedy', 'Music', 'Romance']","[{'id': 4344, 'name': 'musical'}, {'id': 8290,..."
9996,9996,123103,Aftershock,Mayhem and death follow when an earthquake tra...,es,2012-09-12,5.1,"[{'adult': False, 'gender': 2, 'id': 16847, 'k...","[{'adult': False, 'gender': 2, 'id': 1307, 'kn...","['Horror', 'Thriller']","[{'id': 570, 'name': 'rape'}, {'id': 3521, 'na..."
9997,9997,99770,Asterix & Obelix: God Save Britannia,Asterix crosses the channel to help second-cou...,fr,2012-10-17,5.1,"[{'adult': False, 'gender': 2, 'id': 41035, 'k...","[{'adult': False, 'gender': 2, 'id': 122, 'kno...","['Family', 'Adventure', 'Comedy']","[{'id': 3035, 'name': 'roman'}, {'id': 9717, '..."
9998,9998,15045,Fat Albert,Animated character Fat Albert emerges from his...,en,2004-12-25,5.1,"[{'adult': False, 'gender': 2, 'id': 77330, 'k...","[{'adult': False, 'gender': 2, 'id': 2043, 'kn...","['Comedy', 'Family', 'Fantasy']","[{'id': 6054, 'name': 'friendship'}, {'id': 62..."


In [4]:
movies.drop('Unnamed: 0',axis=1,inplace=True)

In [5]:
movies.isnull().sum()

movie_id             0
title                0
overview             5
original_language    0
release_date         0
vote_average         0
cast                 0
crew                 0
Genre                0
keywords             0
dtype: int64

**There are 5 nan values in overview column, we will drop these movies later on**

In [6]:
movies.shape

(10000, 10)

## Strategy:<br>
**Now to make the recommendation engine for the movies we need to create tags (token) for each movie so that each movie can be represented as a vector.For that we will make use of the** <br>
- overview
- cast
- crew
- original_language
- genre
- keywords<br>

**columns of the dataset as these are important things that we require to make vectors for the movies.**

In [7]:
movies.head()

Unnamed: 0,movie_id,title,overview,original_language,release_date,vote_average,cast,crew,Genre,keywords
0,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",en,1972-03-14,8.7,"[{'adult': False, 'gender': 2, 'id': 3084, 'kn...","[{'adult': False, 'gender': 2, 'id': 154, 'kno...","['Drama', 'Crime']","[{'id': 131, 'name': 'italy'}, {'id': 697, 'na..."
1,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,en,1994-09-23,8.7,"[{'adult': False, 'gender': 2, 'id': 504, 'kno...","[{'adult': False, 'gender': 2, 'id': 153, 'kno...","['Drama', 'Crime']","[{'id': 378, 'name': 'prison'}, {'id': 417, 'n..."
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,en,1974-12-20,8.6,"[{'adult': False, 'gender': 2, 'id': 1158, 'kn...","[{'adult': False, 'gender': 2, 'id': 154, 'kno...","['Drama', 'Crime']","[{'id': 131, 'name': 'italy'}, {'id': 700, 'na..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...,en,1993-12-15,8.6,"[{'adult': False, 'gender': 2, 'id': 3896, 'kn...","[{'adult': False, 'gender': 2, 'id': 491, 'kno...","['Drama', 'History', 'War']","[{'id': 818, 'name': 'based on novel or book'}..."
4,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",hi,1995-10-19,8.6,"[{'adult': False, 'gender': 2, 'id': 35742, 'k...","[{'adult': False, 'gender': 1, 'id': 8311, 'kn...","['Comedy', 'Drama', 'Romance']",[]


In [8]:
movies.columns

Index(['movie_id', 'title', 'overview', 'original_language', 'release_date',
       'vote_average', 'cast', 'crew', 'Genre', 'keywords'],
      dtype='object')

In [9]:
## Taking the  overview, cast, crew, original_language, genre, keywords columns of the dataset.

columns  = [ 'movie_id','title','overview','original_language','cast','crew','Genre','keywords']
df = movies[columns]
df.head()

Unnamed: 0,movie_id,title,overview,original_language,cast,crew,Genre,keywords
0,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",en,"[{'adult': False, 'gender': 2, 'id': 3084, 'kn...","[{'adult': False, 'gender': 2, 'id': 154, 'kno...","['Drama', 'Crime']","[{'id': 131, 'name': 'italy'}, {'id': 697, 'na..."
1,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,en,"[{'adult': False, 'gender': 2, 'id': 504, 'kno...","[{'adult': False, 'gender': 2, 'id': 153, 'kno...","['Drama', 'Crime']","[{'id': 378, 'name': 'prison'}, {'id': 417, 'n..."
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,en,"[{'adult': False, 'gender': 2, 'id': 1158, 'kn...","[{'adult': False, 'gender': 2, 'id': 154, 'kno...","['Drama', 'Crime']","[{'id': 131, 'name': 'italy'}, {'id': 700, 'na..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...,en,"[{'adult': False, 'gender': 2, 'id': 3896, 'kn...","[{'adult': False, 'gender': 2, 'id': 491, 'kno...","['Drama', 'History', 'War']","[{'id': 818, 'name': 'based on novel or book'}..."
4,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",hi,"[{'adult': False, 'gender': 2, 'id': 35742, 'k...","[{'adult': False, 'gender': 1, 'id': 8311, 'kn...","['Comedy', 'Drama', 'Romance']",[]


## Some dataset preprocessing: <br>
Cast column
- we will extract the first for character names from the cast column as these are main characters of the movie.

Crew column
- People generally care more about the directors of the movie so we will extract the director name from crew column.

Keyword column
- converting the list entries into normal strings

Genre column
- converting the list entries in genre to normal strings


In [10]:
## Getiing the first four characters real names from cast column

def fetch(text):
    name = []
    if len(ast.literal_eval(text))>=4:
        for i in range(4):
            name.append(ast.literal_eval(text)[i]['original_name'].replace(' ',''))
        return ' '.join(name)
    else:
        for i in range(len(ast.literal_eval(text))): ## for movies having cast less than 4
            name.append(ast.literal_eval(text)[i]['original_name'].replace(' ',''))
        return ' '.join(name)

In [11]:
df['cast4'] = df['cast'].apply(fetch)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cast4'] = df['cast'].apply(fetch)


In [13]:
## Getting the director name from the crew:

def fetch_director(text):
    for dic in ast.literal_eval(text):
        if dic['job']== 'Director':
            name  = dic['original_name']
            break
    return name.replace(' ','')

In [14]:
df['director'] = df['crew'].apply(fetch_director)

In [20]:
## Working on keywords column

def key(text):
    keys = []
    for i in ast.literal_eval(text):
        keys.append(i['name'].replace(' ',''))
    return ' '.join(keys)

In [18]:
df['keyword'] = df['keywords'].apply(key)

In [23]:
df.head()

Unnamed: 0,movie_id,title,overview,original_language,cast,crew,Genre,keywords,cast4,director,keyword
0,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",en,"[{'adult': False, 'gender': 2, 'id': 3084, 'kn...","[{'adult': False, 'gender': 2, 'id': 154, 'kno...","['Drama', 'Crime']","[{'id': 131, 'name': 'italy'}, {'id': 697, 'na...",MarlonBrando AlPacino JamesCaan RobertDuvall,FrancisFordCoppola,italy lossoflovedone loveatfirstsight basedonn...
1,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,en,"[{'adult': False, 'gender': 2, 'id': 504, 'kno...","[{'adult': False, 'gender': 2, 'id': 153, 'kno...","['Drama', 'Crime']","[{'id': 378, 'name': 'prison'}, {'id': 417, 'n...",TimRobbins MorganFreeman BobGunton WilliamSadler,FrankDarabont,prison corruption policebrutality basedonnovel...
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,en,"[{'adult': False, 'gender': 2, 'id': 1158, 'kn...","[{'adult': False, 'gender': 2, 'id': 154, 'kno...","['Drama', 'Crime']","[{'id': 131, 'name': 'italy'}, {'id': 700, 'na...",AlPacino RobertDuvall DianeKeaton RobertDeNiro,FrancisFordCoppola,italy italianamerican cuba symbolism gangster ...
3,424,Schindler's List,The true story of how businessman Oskar Schind...,en,"[{'adult': False, 'gender': 2, 'id': 3896, 'kn...","[{'adult': False, 'gender': 2, 'id': 491, 'kno...","['Drama', 'History', 'War']","[{'id': 818, 'name': 'based on novel or book'}...",LiamNeeson BenKingsley RalphFiennes CarolineGo...,StevenSpielberg,basedonnovelorbook factory concentrationcamp h...
4,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",hi,"[{'adult': False, 'gender': 2, 'id': 35742, 'k...","[{'adult': False, 'gender': 1, 'id': 8311, 'kn...","['Comedy', 'Drama', 'Romance']",[],ShahRukhKhan Kajol AmrishPuri AnupamKher,AdityaChopra,


In [24]:
## Working on Genre
df_new['Genre'] = df_new['Genre'].str.replace(' ','')
df_new['Genre'] = df_new['Genre'].apply(lambda x:' '.join(ast.literal_eval(x)))

Index(['movie_id', 'title', 'overview', 'original_language', 'cast', 'crew',
       'Genre', 'keywords', 'cast4', 'director', 'keyword'],
      dtype='object')

**Dropiing the old cast crew and keywrods column**

In [25]:
## Dropiing the cast crew and keywrods column
df_new = df.drop(['cast','crew','keywords'], axis=1)

In [26]:
df_new.head()

Unnamed: 0,movie_id,title,overview,original_language,Genre,cast4,director,keyword
0,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",en,"['Drama', 'Crime']",MarlonBrando AlPacino JamesCaan RobertDuvall,FrancisFordCoppola,italy lossoflovedone loveatfirstsight basedonn...
1,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,en,"['Drama', 'Crime']",TimRobbins MorganFreeman BobGunton WilliamSadler,FrankDarabont,prison corruption policebrutality basedonnovel...
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,en,"['Drama', 'Crime']",AlPacino RobertDuvall DianeKeaton RobertDeNiro,FrancisFordCoppola,italy italianamerican cuba symbolism gangster ...
3,424,Schindler's List,The true story of how businessman Oskar Schind...,en,"['Drama', 'History', 'War']",LiamNeeson BenKingsley RalphFiennes CarolineGo...,StevenSpielberg,basedonnovelorbook factory concentrationcamp h...
4,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",hi,"['Comedy', 'Drama', 'Romance']",ShahRukhKhan Kajol AmrishPuri AnupamKher,AdityaChopra,


**Now to create tags we will concatenate the *overview , crew, cast, keywords and genre* column into a single column called _full_info_**

In [51]:
## Concatenating all text columns together 
df_new['full_info'] = df_new['overview'] + ' ' +df_new['original_language']+ ' '+ df_new['Genre'] + ' '+ df_new['cast4'] + ' ' + df_new['director']+ ' ' + df_new['keyword']

In [34]:
## Dropping the 5 nan values
df_new.dropna(inplace=True)

In [35]:
df_new.head()

Unnamed: 0,movie_id,title,overview,original_language,Genre,cast4,director,keyword,full_info
0,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",en,Drama Crime,MarlonBrando AlPacino JamesCaan RobertDuvall,FrancisFordCoppola,italy lossoflovedone loveatfirstsight basedonn...,"Spanning the years 1945 to 1955, a chronicle o..."
1,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,en,Drama Crime,TimRobbins MorganFreeman BobGunton WilliamSadler,FrankDarabont,prison corruption policebrutality basedonnovel...,Framed in the 1940s for the double murder of h...
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,en,Drama Crime,AlPacino RobertDuvall DianeKeaton RobertDeNiro,FrancisFordCoppola,italy italianamerican cuba symbolism gangster ...,In the continuing saga of the Corleone crime f...
3,424,Schindler's List,The true story of how businessman Oskar Schind...,en,Drama History War,LiamNeeson BenKingsley RalphFiennes CarolineGo...,StevenSpielberg,basedonnovelorbook factory concentrationcamp h...,The true story of how businessman Oskar Schind...
5,129,Spirited Away,"A young girl, Chihiro, becomes trapped in a st...",ja,Animation Family Fantasy,RumiHiiragi MiyuIrino MariNatsuki TakashiNaito,HayaoMiyazaki,witch parentchildrelationship magic darkness b...,"A young girl, Chihiro, becomes trapped in a st..."


**Exporting the cleaned dataset for later use**

In [1]:
df_new.to_csv('cleaned_movie5_csv.csv',index=False)

NameError: name 'df_new' is not defined

## Text Preprocessing<br>
Here we will do the basic text preprocessing on the full_info column which have following steps:<br>
- Lowercasing.
- Removing html tags if any.
- Tokenizing using the word tokenizer from nltk.corpus
- Removing the stopwords and punctuations from the strings
- stemming using the Porter Stemmer

In [2]:
df_new = pd.read_csv('./cleaned_movie_csv.csv')

In [3]:
df_new.head()

Unnamed: 0,movie_id,title,overview,original_language,Genre,cast4,director,keyword,full_info
0,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",en,Drama Crime,MarlonBrando AlPacino JamesCaan RobertDuvall,FrancisFordCoppola,italy lossoflovedone loveatfirstsight basedonn...,"Spanning the years 1945 to 1955, a chronicle o..."
1,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,en,Drama Crime,TimRobbins MorganFreeman BobGunton WilliamSadler,FrankDarabont,prison corruption policebrutality basedonnovel...,Framed in the 1940s for the double murder of h...
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,en,Drama Crime,AlPacino RobertDuvall DianeKeaton RobertDeNiro,FrancisFordCoppola,italy italianamerican cuba symbolism gangster ...,In the continuing saga of the Corleone crime f...
3,424,Schindler's List,The true story of how businessman Oskar Schind...,en,Drama History War,LiamNeeson BenKingsley RalphFiennes CarolineGo...,StevenSpielberg,basedonnovelorbook factory concentrationcamp h...,The true story of how businessman Oskar Schind...
4,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",hi,Comedy Drama Romance,ShahRukhKhan Kajol AmrishPuri AnupamKher,AdityaChopra,,"Raj is a rich, carefree, happy-go-lucky second..."


In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\avina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\avina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
df_new['full_info']=df_new['full_info'].str.lower()

In [6]:
def preprocessor(text):
    ## removing html tags
    
    
    text = re.sub('<.*?>','',text)
    
    ## Tokenizing 
    
    
    text = word_tokenize(text)
    
    ## Punctuation removal and stopwords removal
    
 
    y=[]
    for token in text:
        if token not in string.punctuation and token not in stopwords.words('english'):
            y.append(token)
    text = y[:]
    y.clear()
    
    ## Stemming the words to thier base form
    
    
    for token in text:
        y.append(ps.stem(token))
    
    return ' '.join(y) 

In [7]:
## Applying the preprocessor function which removes html tags, tokenizes , removes stopwords and punctuaions and stemming


df_new['full_info_processed'] = df_new['full_info'].apply(preprocessor)

## Final Dataset for modelling<br>

**We will be taking following columns for model building**
- movie_id
- title
- genre
- full_info_processed

In [8]:
df_new.columns

Index(['movie_id', 'title', 'overview', 'original_language', 'Genre', 'cast4',
       'director', 'keyword', 'full_info', 'full_info_processed'],
      dtype='object')

In [9]:
df_final = df_new[['movie_id','title','Genre','full_info_processed']]

###  Vectorization of the full_info_processed column<br>
**We will be using two vectorization techniques, BoW (bag of words) and TF-IDF**

In [10]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer(max_features=10000)

In [11]:
## Applying the Bag of Words with max features(or words) = 10000

X = cv.fit_transform(df_final['full_info_processed']).toarray()

In [12]:
print('Memory usage by array X:', X.nbytes/(1024*1024), 'MB')

Memory usage by array X: 762.5579833984375 MB


**The vocablury has 10000 words in it**

In [13]:
len(cv.get_feature_names())

10000

In [14]:
X.shape

(9995, 10000)

**Each movie has now become a vector of lenghth 10000. Hence as we have 9995 movies, so we have 9995 vectors of length 10000.
Such movies can be represented as vector in 10000 dimensional space.**

#  Recommendation Strategy:

Now the description of each movie has now become a vector in 48516 dimensional space. So the basic idea is that the movies which are similar to each other will have similar types of vectors associated with them and these vectors will be very close to each other. So for a given movie( or product in genneral) recommnedation just simply picks up these vectors close to this movie and recommend the corresponding movie.<br>
Now the tool to measures the closeness of these vectors can be:<br>
1. Calculate the Euclidean distance: $$\sqrt((x_0 - x)^2 + (y_0 - y)^2)$$ .The problem with this is that in high dimensions sparsity is created and hence eucleaden distance does not become a good measure for closeness.<br>
2. Calculating the angle between the two vectors **(cosine similarity) (0.0-1.0)** as it is not affected by sparsity of the high dimensions. We will use the cosine similarity in this case to find the close vectors.<br>

### Cosine Similarity:<br>
Cosine similarity is a metric used to measure the similarity of two vectors. Specifically, it measures the similarity in the direction or orientation of the vectors ignoring differences in their magnitude or scale.<br>
Mathematically it calculates the cosine of the smallest angle between two vectors.<br>
For two vectors x and y, the similarity between them is calculated as:
                        $$similarity(X,Y) = cosine(\theta) = \frac{X*Y}{||X||*||Y||}$$
 where $$\theta$$ is the smallest angle between X and Y
<img src = "cosine-similarity-vectors.jpg">
1. Two similar movies will have vectors pointing in almost same direction and hence the similarity between them is more and the cosine of angle between them is closer to 1.
2. Two movies which are not similar at all will have vectors almost orthogonal to each other and hence the similarity between them is zero and the cosine of 90 = 0.

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

**EXAMPLE for cosine similarity** <br>
Now two vectors a = [[1,2,3]] and b= [[10,20,30]] which are similar have cosine similarity of 1.0. As these vectors point in the same directions.

In [16]:
a = [[1,2,3]]
b = [[10,20,30]]
print('The cosine similarity between a and b is:',cosine_similarity(a,b)[0][0])

The cosine similarity between a and b is: 1.0


In [17]:
cosine_similarity_array = cosine_similarity(X)

In [32]:
print('Memory usage by array X:', cosine_similarity_array.nbytes/(1024*1024), 'MB')

Memory usage by array X: 762.1767044067383 MB


### Interpretation of cosine_similarity_array

Now we will calculate the cosine similarity of each movie with every other movie, i.e. in this case for a movie ,9994 cosine similarities will be calcuated, and as there are 9995 movies hence there will be **9995*9995 = 99900025** similarities will be calculated.<br>
<br>
So we will get an array of shape (9995,9995), in which the diagonal elements will be equal to 1 as the cosine similarity of each movie with itself will be 1. while the rest of the elements in the row will give the similarity between the movie with rest of the movies.

## Recommendation function:<br>
This function takes in a movie name, then finds the top 15 most similar movies by sorting the cosine distance array for the given movie, and then randoml picks up 6 movies from these 15 movies and recommends it.<br>
Function Breakdown<br>
1. Function takes in the movie name as input.
2. Next we retrieve the corresponding index for the movie.
3. Now this index correponds to the row index for this movie in the cosine similarity array.
4. we will sort the array ind ecending to get the top 10 most similar movies. We get the indexes for movies using enumerate.
5. We display the 6 movies randomly from 10 movies and thier genres as well.

In [18]:
df_final.columns

Index(['movie_id', 'title', 'Genre', 'full_info_processed'], dtype='object')

In [157]:
## The recommendation Function
def recommend(movie):
    ## retrieving the corresponding index for the movie.
    movie_idx = df_final[df_final['title'] == movie].index[0]
    
    ## Getting the top 10 most similar movies. We get the indexes for movies using enumerate.
    no_of_similar_movies = 10
    movies = sorted(list(enumerate(cosine_similarity_array[movie_idx])),reverse=True,key=lambda x:x[1])[1:no_of_similar_movies]
    print(movies)
    ## display the 6 movies randomly from 10 movies and thier genres as well.
    for i in random.sample(movies,6):
        print('Movie Name:',df_final.iloc[i[0],1])
        print('id:', df_final.iloc[i[0],0])
        print('Genre :',df_final.iloc[i[0],2])
        #print(df_final.iloc[i[0],2])
        print()

In [21]:
## The recommendation list Function
def recommend_list(movie):
    # retrieving the corresponding index for the movie.
    movie_idx = df_final[df_final['title'] == movie].index[0]

    # Getting the top 10 most similar movies. We get the indexes for movies using enumerate.
    no_of_similar_movies = 10
    movies = sorted(list(enumerate(cosine_similarity_array[movie_idx])), reverse=True, key=lambda x: x[1])[
             1:no_of_similar_movies]
    recommended_movies = []
    movie_trailers = []
    movie_posters = []
    # display the 6 movies randomly from 10 movies and thier genres as well.

    for movie_idx_sim_tuple in random.sample(movies, 6):
        movie_idx = movie_idx_sim_tuple[0]
        # Creating a list of recommended movies and  their genres
        recommended_movies.append((df_final.loc[movie_idx, 'title'], df_final.loc[movie_idx, 'Genre']))

        # Creating a list of movie trailers
        movie_trailers.append(trailer(df_final.loc[movie_idx, 'movie_id']))

        # Creating a list of movie posters
        movie_posters.append(poster(df_final.loc[movie_idx, 'movie_id']))
    return recommended_movies, movie_trailers, movie_posters

In [19]:
## Trailer function (imports the corresponding youtube trailers of the recommended movies)
def trailer(movie_id):
    import requests
    web = f'https://api.themoviedb.org/3/movie/{movie_id}/videos?api_key=d580ed432d7d56f0eb97549c4fcb2273&language=en-US'
    response = requests.get(web)
    try:
        response.json()['results']
        trailer_df = pd.DataFrame(response.json()['results'])
        trailer_df['type'] = trailer_df['type'].str.lower()
        trailer_id= trailer_df[trailer_df['type']== 'trailer'].sample()['key'].values[0]
        link = 'www.youtube.com/watch?v={}'.format(trailer_id)
        return link
    except:
        try:
            time.sleep(1)
            response.json()['results']
            trailer_df = pd.DataFrame(response.json()['results'])
            trailer_df['type'] = trailer_df['type'].str.lower()
            trailer_id= trailer_df[trailer_df['type']== 'trailer'].sample()['key'].values[0]
            link = 'www.youtube.com/watch?v={}'.format(trailer_id)
            return link
        except:
            False

In [20]:
## fetches the poster of the movie using the movie ID
def poster(movie_id):
    import requests
    web = f'https://api.themoviedb.org/3/movie/{movie_id}?api_key=d580ed432d7d56f0eb97549c4fcb2273&language=en-US'
    response = requests.get(web)
    try:
        return f"https://image.tmdb.org/t/p/original/{response.json()['poster_path']}"
    except:
        try:
            time.sleep(1)
            return f"https://image.tmdb.org/t/p/original/{response.json()['poster_path']}"
        except:
            return False

## Testing the recommender.

In [22]:
recommend_list("The Godfather")

([('Hangman', 'Mystery Crime Thriller'),
  ('Extremely Wicked, Shockingly Evil and Vile', 'Crime'),
  ('Shottas', 'Action Adventure Crime Drama'),
  ('Bomb City', 'Drama Crime'),
  ('I Am All Girls', 'Crime Drama Mystery Thriller'),
  ('Gotti', 'Crime Drama History Thriller')],
 ['www.youtube.com/watch?v=bPrd2eWCgjc',
  'www.youtube.com/watch?v=mdMtnvMJcDA',
  'www.youtube.com/watch?v=5tpClYOE-Wk',
  'www.youtube.com/watch?v=ir4IraOtads',
  'www.youtube.com/watch?v=zMPS9JO0p6w',
  'www.youtube.com/watch?v=m290GmN-Q7Q'],
 ['https://image.tmdb.org/t/p/original//iEpxKeX7SE3aLm9dTRIuk7o2kDA.jpg',
  'https://image.tmdb.org/t/p/original//zSuJ3r5zr5T26tTxyygHhgkUAIM.jpg',
  'https://image.tmdb.org/t/p/original//chh2xSTVtMCFyFePm1zRbBmtqaX.jpg',
  'https://image.tmdb.org/t/p/original//kho48mWJ5xuiLn5bpuIvO6ZP7ep.jpg',
  'https://image.tmdb.org/t/p/original//voQLcqp0ehF3md93hMpELCRxTDv.jpg',
  'https://image.tmdb.org/t/p/original//q869M8MTy0eynwfiE5vIlFgAUze.jpg'])

In [179]:
Trailers[0]

'www.youtube.com/watch?v=bPrd2eWCgjc'

In [31]:
recommend("The Lord of the Rings: The Two Towers")

Movie Name: The Hobbit: An Unexpected Journey
Genre : Adventure Fantasy Action

Movie Name: The Lord of the Rings: The Fellowship of the Ring
Genre : Adventure Fantasy Action

Movie Name: The Hobbit: The Battle of the Five Armies
Genre : Action Adventure Fantasy

Movie Name: The Hobbit: The Desolation of Smaug
Genre : Fantasy Adventure Action

Movie Name: Warcraft
Genre : Action Adventure Fantasy

Movie Name: The Hobbit
Genre : Family Fantasy Animation Adventure TVMovie



## using tf-idf

In [39]:
tf = TfidfVectorizer()

In [40]:
X_tf = tf.fit_transform(df_final['full_info_processed']).toarray()

In [41]:
len(tf.get_feature_names())

48516

In [42]:
cosine_distance_array_tf = cosine_similarity(X)

In [52]:
def recommend_tf(movie):
    movie_idx = df_final[df_final['title'] == movie].index[0]
    no_of_similar_movies = 10
    movies = sorted(list(enumerate(cosine_distance_array_tf[movie_idx])),reverse=True,key=lambda x:x[1])[1:no_of_similar_movies]
    
    
    for i in random.sample(movies,6):
        print('Movie Name:',df_final.iloc[i[0],1])
        print('Genre :',df_final.iloc[i[0],2])
        #print(df_final.iloc[i[0],2])
        print()

In [30]:
recommend_tf('Harry Potter and the Prisoner of Azkaban')

NameError: name 'recommend_tf' is not defined

In [78]:
import pickle

In [82]:
pickle.dump(cosine_similarity_array,open('cos_sim.pkl','wb'))
pickle.dump(df_final,open('df_final.pkl','wb'))

In [49]:
df_final.head()

Unnamed: 0,movie_id,title,Genre,full_info_processed
0,238,The Godfather,Drama Crime,span year 1945 1955 chronicl fiction italian-a...
1,278,The Shawshank Redemption,Drama Crime,frame 1940 doubl murder wife lover upstand ban...
2,240,The Godfather Part II,Drama Crime,continu saga corleon crime famili young vito c...
3,424,Schindler's List,Drama History War,true stori businessman oskar schindler save th...
4,19404,Dilwale Dulhania Le Jayenge,Comedy Drama Romance,raj rich carefre happy-go-lucki second gener n...


In [51]:
df_final.head(50)

Unnamed: 0,movie_id,title,Genre,full_info_processed
0,238,The Godfather,Drama Crime,span year 1945 1955 chronicl fiction italian-a...
1,278,The Shawshank Redemption,Drama Crime,frame 1940 doubl murder wife lover upstand ban...
2,240,The Godfather Part II,Drama Crime,continu saga corleon crime famili young vito c...
3,424,Schindler's List,Drama History War,true stori businessman oskar schindler save th...
4,19404,Dilwale Dulhania Le Jayenge,Comedy Drama Romance,raj rich carefre happy-go-lucki second gener n...
5,129,Spirited Away,Animation Family Fantasy,young girl chihiro becom trap strang new world...
6,667257,Impossible Things,Family Drama,matild woman death husband man constantli abus...
7,372058,Your Name.,Romance Animation Drama,high schooler mitsuha taki complet stranger li...
8,389,12 Angry Men,Drama,defens prosecut rest juri file juri room decid...
9,496243,Parasite,Comedy Thriller Drama,unemploy ki-taek 's famili take peculiar inter...
