# Content Based Recommendation System

### Importing necessary libraries

In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings('ignore')



### Importing the dataset

In [None]:
df=pd.read_csv('C:\\Users\\ADMIN\\Desktop\\Movie_data\\credits.csv') 
df1=pd.read_csv('C:\\Users\\ADMIN\\Desktop\\Movie_data\\keywords.csv') 
df2=pd.read_csv('C:\\Users\\ADMIN\\Desktop\\Movie_data\\links.csv') 
df3=pd.read_csv('C:\\Users\\ADMIN\\Desktop\\Movie_data\\links_small.csv') 
df4=pd.read_csv('C:\\Users\\ADMIN\\Desktop\\Movie_data\\movies_metadata.csv') 
df5=pd.read_csv('C:\\Users\\ADMIN\\Desktop\\Movie_data\\ratings_small.csv') 
keywords = df1
links = df2
links_small = df3
ratings_small = df5
movies_metadata = df4
movies_metadata = movies_metadata.head(30000)
movies_metadata['index'] = movies_metadata.index
credits = df
credits['id'] = (credits['id']).astype('str')
credits = credits.head(30000)



### Data Exploration

In [None]:
# Extracting the feautures needed for building a model. Others are not neccesary.
data = movies_metadata.loc[:,['title','id','genres','original_language','release_date','overview']]

# Converting both id columns into str for the merge operation. 
# Merge operation requires the 2 id columns to be of same datatype
data['id'] =data['id'].astype('str')

In [2]:
# df_final is a dataset made with id,genres,cast,crew,release date,language as these are feautures that
# help in content based recommendations
df_final =  ( pd.merge(left=data, right=credits, left_on='id', right_on='id') )
df_final

Unnamed: 0,title,id,genres,original_language,release_date,overview,cast,crew
0,Toy Story,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",en,1995-10-30,"Led by Woody, Andy's toys live happily in his ...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,Jumanji,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",en,1995-12-15,When siblings Judy and Peter discover an encha...,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,Grumpier Old Men,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",en,1995-12-22,A family wedding reignites the ancient feud be...,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,Waiting to Exhale,31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",en,1995-12-22,"Cheated on, mistreated and stepped on, the wom...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,Father of the Bride Part II,11862,"[{'id': 35, 'name': 'Comedy'}]",en,1995-02-10,Just when George Banks has recovered from his ...,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."
...,...,...,...,...,...,...,...,...
30037,Deep Web,321769,"[{'id': 99, 'name': 'Documentary'}]",en,2015-03-15,Deep Web gives the inside story of one of the ...,"[{'cast_id': 2, 'character': 'Himself', 'credi...","[{'credit_id': '54d16da8c3a3687600000692', 'de..."
30038,Fugly,281730,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",en,2014-06-13,"It is a story of 4 friends - Dev, Gaurav, Adit...","[{'cast_id': 0, 'character': 'R.S Chautala', '...","[{'credit_id': '53ec8ee9c3a3682abb000b1e', 'de..."
30039,Ginza Cosmetics,118575,"[{'id': 18, 'name': 'Drama'}]",ja,1951-04-14,Italy is usually cited as the anchor of the ne...,"[{'cast_id': 2, 'character': 'Yukiko Tsuji', '...","[{'credit_id': '52fe4bdec3a36847f8217305', 'de..."
30040,"On Any Sunday, The Next Chapter",294544,"[{'id': 99, 'name': 'Documentary'}]",en,2014-01-01,"On Any Sunday, The Next Chapter” is an explora...","[{'cast_id': 5, 'character': '', 'credit_id': ...","[{'credit_id': '545b6432c3a36853530012be', 'de..."


### Missing Value treatment

In [3]:
# Replacing the null values with null string

for feature in df_final:
  df_final[feature] =df_final[feature].fillna('') 
movies_data = df_final

### Feature cleaning

In [5]:
# Getting all the stopwords
# Using NLTK library
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer
stopwords = []
stopwords = nltk.corpus.stopwords.words('english')
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [6]:
# Cleaning the overview feature by removing all the stopwords.
use = movies_data['overview']
nlp_use = []
for i in range(0, len(use)):
    review = re.sub('[^a-zA-Z]', ' ', use[i])
    review = review.lower()
    review = review.split()
    tokens_without_sw = [word for word in review if not word in stopwords]
    review = ' '.join(review)
    review = ""
    for val in tokens_without_sw :
        review = review + val + " "
    nlp_use.append(review) 

In [7]:
# Cleaned overview feature after stopwords removal
movies_data['overview'] = nlp_use
nlp_use

['led woody andy toys live happily room andy birthday brings buzz lightyear onto scene afraid losing place andy heart woody plots buzz circumstances separate buzz woody owner duo eventually learns put aside differences ',
 'siblings judy peter discover enchanted board game opens door magical world unwittingly invite alan adult trapped inside game years living room alan hope freedom finish game proves risky three find running giant rhinoceroses evil monkeys terrifying creatures ',
 'family wedding reignites ancient feud next door neighbors fishing buddies john max meanwhile sultry italian divorc e opens restaurant local bait shop alarming locals worry scare fish away less interested seafood cooking hot time max ',
 'cheated mistreated stepped women holding breath waiting elusive good man break string less stellar lovers friends confidants vannah bernie glo robin talk determined find better way breathe ',
 'george banks recovered daughter wedding receives news pregnant george wife nina e

### Word Vectorization

**TfidVectorizer**:

Used TfidfVectorizer() to convert the raw word documents in to TF-IDF feature matrix.

> TF-IDF is an abbreviation for Term Frequency Inverse Document Frequency. This is very common algorithm to transform text into a meaningful representation of numbers which is used to fit machine algorithm for prediction.

It is a measure of originality of a word by comparing the number of times a word appears in a document with the number of documents the word appears in.

TF-IDF = (Number of times word appears in a doc) + $\log(\frac{n}{df})$

where, n -> no of documents and df -> doc freq of the word

****Cosine Similarity****:

Cosine similarity is a metric, that is used to determine the similarity between data objects(vectors) irespective of their size. 

The formula for Cosine similarity = Cos(x,y) = $\frac{x \times y}{||x|| \times ||y||}$

where, 
- $x \times y$ -> dot product of vectors x and y
- ||x|| and ||y|| -> length of vectors
- $||x|| \times ||y||$ -> cross product of vectors

In [8]:
# Converting the documents in overview feature to a matrix of TF-IDE features using TfidVectorizer()
movies_data['index'] = movies_data.index
combined_features = movies_data['overview']
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_features)

# getting the similarity scores using cosine similarity
similarity = cosine_similarity(feature_vectors)
print(similarity)

[[1.         0.01556924 0.         ... 0.         0.         0.        ]
 [0.01556924 1.         0.04841193 ... 0.00842387 0.00638193 0.        ]
 [0.         0.04841193 1.         ... 0.         0.03578085 0.        ]
 ...
 [0.         0.00842387 0.         ... 1.         0.         0.01538662]
 [0.         0.00638193 0.03578085 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.01538662 0.         1.        ]]


### Predicting Movie - top 30 movie recommendation

In [9]:
# Getting an input from the user
movie_name = input(' Enter your favourite movie name : ')

# Combining all the titles into a list
list_of_all_titles = movies_data['title'].tolist()

# Getting close matches of the entered movie name using the difflib library.
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

close_match = find_close_match[0]

# Getting the index of the close match movie
index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]

# listing the similarity scores of the movies
similarity_score = list(enumerate(similarity[index_of_the_movie]))

# sorting the similarity scores in descending order i.e highest to lowest
sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True) 

print('Movies suggested for you : \n')

i = 1

# Printing all the 30 suggested movies
for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies_data[movies_data.index==index]['title'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

 Enter your favourite movie name : The karate kid
Movies suggested for you : 

1 . The Karate Kid
2 . The Karate Kid, Part III
3 . The Karate Kid, Part II
4 . Klute
5 . Forever Young
6 . It All Starts Today
7 . Daniel
8 . Fake
9 . Mad Dog Morgan
10 . Android
11 . Mrs. Doubtfire
12 . The Prophecy II
13 . Five Fingers of Death
14 . In July
15 . Santa Claus Has Blue Eyes
16 . Tall Tale
17 . A Force of One
18 . Beautiful
19 . Master of the Flying Guillotine
20 . Taxi 3
21 . Absentia
22 . Daylight
23 . The Dolphin: Story of a Dreamer
24 . Sunday Bloody Sunday
25 . White Fang and the Hunter
26 . The Last Dragon
27 . Martial Club
28 . Dark Skies
29 . The Devil and Daniel Johnston


In [59]:
#to key in the movie you want this is shown
for val in movies_data['title'] :
    print(val)

Toy Story
Jumanji
Grumpier Old Men
Waiting to Exhale
Father of the Bride Part II
Heat
Sabrina
Tom and Huck
Sudden Death
GoldenEye
The American President
Dracula: Dead and Loving It
Balto
Nixon
Cutthroat Island
Casino
Sense and Sensibility
Four Rooms
Ace Ventura: When Nature Calls
Money Train
Get Shorty
Copycat
Assassins
Powder
Leaving Las Vegas
Othello
Now and Then
Persuasion
The City of Lost Children
Shanghai Triad
Dangerous Minds
Twelve Monkeys
Wings of Courage
Babe
Carrington
Dead Man Walking
Across the Sea of Time
It Takes Two
Clueless
Cry, the Beloved Country
Richard III
Dead Presidents
Restoration
Mortal Kombat
To Die For
How To Make An American Quilt
Se7en
Pocahontas
When Night Is Falling
The Usual Suspects
Guardian Angel
Mighty Aphrodite
Lamerica
The Big Green
Georgia
Kids of the Round Table
Home for the Holidays
The Postman
The Confessional
The Indian in the Cupboard
Eye for an Eye
Mr. Holland's Opus
Don't Be a Menace to South Central While Drinking Your Juice in the Hood
Two 