In [45]:
#pandas for datastructure
import pandas as pd

#for numerical computating
import numpy as np

#for statistics
from scipy import stats

#Plotting Libraries
import matplotlib.pyplot as okt
import seaborn as sns
%matplotlib inline

#If we have String that contains strings floats etc. we can use it to evaluate its
#raises Exception if input is not of valid python data-type
from ast import literal_eval

#Transform Text into meaningful representation of numbers, countVectorizer count word frequency
#Tfid - Term Frequency Inverse Document  
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#Compute cosine similarity between samples in X and Y
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

#steming algo - process of removing morphological barriers of use of words.(chocolate,chocolatey...etc... root word=chocolate)
from nltk.stem.snowball import SnowballStemmer

#Lemmetization - generate root form of the inflicted word.(running's root word = run), converts word to its dictonary form
from nltk.stem.wordnet import WordNetLemmatizer

#dictonary for English Language
from nltk.corpus import wordnet

#Issue warnings
import warnings; warnings.simplefilter('ignore')

In [46]:
mdata = pd.read_csv('C:\\Users\\Harsha\\OneDrive\\AI-ML Projects\\archive (1)\\movies_metadata.csv')
mdata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


## We used a IMDB's weighted rating formula to construct the chart it is represented as follows :

#### Weighted rating(WR)= **(v/v+m . R) + (m/v+m . C)**

##### where : 

##### - v is the number of votes for the movie
##### - m is the min. votes required to be listed in the chart
##### - R is the average rating of the movie
##### - C is the mean vote across the whole report



## Simple Recommender - Movie Recommendation Based on `vote_average` and `vote_count` columns

In [47]:
C = mdata['vote_average'].mean()
print(C)

5.618207215134185


In [48]:
#Caculate min. no. of votes required to be in chart
m=mdata['vote_count'].quantile(0.90)
print(m)

160.0


In [49]:
#Filter out all qualified movies into a new DataFrame
q_movies=mdata.copy().loc[mdata['vote_count']>=m]
q_movies.shape

(4555, 24)

In [50]:
def weighted_rating(X):
    V = X['vote_count']
    R= X['vote_average']
    return (V/(V+m)*R) + (m/(m+V)*C)

In [51]:
#Define new feature 'score' and calculate its value using 'weighted_rating()'
q_movies['score']=q_movies.apply(weighted_rating,axis=1)

In [52]:
#sort movies based on score calculated above
q_movies = q_movies.sort_values('score',ascending=False)

#print the top 15 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(15)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


## Content Based Recommender

#### Content based Recommenders based on:

 - Movie Overviews and Tagines
 - Movie Cast, Crew , Keywords and Genre

In [65]:
links_small = pd.read_csv('C:\\Users\\Harsha\\OneDrive\\AI-ML Projects\\archive (1)\\links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

0          862
1         8844
2        15602
3        31357
4        11862
         ...  
9120    402672
9121    315011
9122    391698
9123    137608
9124    410803
Name: tmdbId, Length: 9112, dtype: int32

In [54]:
#After EDA(Experimental Data-Analysis)
mdata=mdata.drop([19730, 29503, 35587],errors='ignore')

In [55]:
mdata['id']=mdata['id'].astype('int')

In [56]:
smd = mdata[mdata['id'].isin(links_small)]
smd.shape

(9099, 24)

In [57]:
smd['tagline']=smd['tagline'].fillna('')
smd['description']=smd['overview'] + smd['tagline']
smd['description']=smd['description'].fillna('')

In [58]:
tf=TfidfVectorizer(analyzer='word',ngram_range=(1,2),min_df=0,stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [59]:
tfidf_matrix.shape

(9099, 268124)

### Cosine Similarity

### **cosine(x,y)**= x.y / ||x|| . ||y||

In [60]:
cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix)

In [61]:
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [62]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])


In [63]:
def get_recommendations(title):
    idx=indices[title]
    sim_scores=list(enumerate(cosine_sim[idx]))
    sim_scores=sorted(sim_scores, key=lambda x: x[1],reverse=True)
    sim_scores=sim_scores[1:31]
    movie_indices=[i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [64]:
get_recommendations('Batman Forever').head(15)

7931                      The Dark Knight Rises
2579               Batman: Mask of the Phantasm
6900                            The Dark Knight
6144                              Batman Begins
8165    Batman: The Dark Knight Returns, Part 1
524                                      Batman
1240                             Batman & Robin
1113                             Batman Returns
7565                 Batman: Under the Red Hood
7901                           Batman: Year One
8227    Batman: The Dark Knight Returns, Part 2
681                         Eyes Without a Face
6206                                   Cry_Wolf
1135                   Night Falls on Manhattan
2075                             Open Your Eyes
Name: title, dtype: object