> Dataset [Here](https://www.kaggle.com/rounakbanik/the-movies-dataset/data)

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the movies metadata CSV into a DataFrame and preview its first two rows.
data = pd.read_csv('data/movies_metadata.csv', low_memory=False)
data.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


### First, we'll make recommendations based on the popularity of the movies

#### For this purpose, the metric used is the IMDB weighted rating formula:
    * Weighted_Rating = (v/(v + m)*R) + (m/(m+v)*C)
    where 
        - v is the number of votes for the movie
        - m is the minimum number of votes required
        - R is the average rating of the movie
        - C is the mean vote (over all movies)

In [3]:
# m: vote-count threshold, taken as the 90th percentile of `vote_count`.
# C: mean of `vote_average` over every movie in `data`.
m = data['vote_count'].quantile(q=0.9)
C = data['vote_average'].mean()
print(C, m)

5.618207215133889 160.0


In [4]:
# Keep only the movies that received at least `m` votes.
# The copy is taken *after* filtering: only the selected rows are duplicated,
# and the result is an independent frame, so adding columns to it later does
# not write into a slice of `data`.
generative_content = data.loc[data['vote_count'] >= m].copy()
generative_content.shape

(4555, 24)

* This leaves 4555 movies to be scored and ranked in the next step

In [5]:
def weighted_rating(x, m=m, C=C):
    """Compute the IMDB weighted rating for ``x``.

    ``x`` is anything that supports ``x['vote_count']`` and
    ``x['vote_average']`` lookups. ``m`` is the minimum number of votes
    required and ``C`` is the mean vote; both default to the values that were
    bound when this function was defined.

    Returns ``v/(v+m) * R + m/(v+m) * C`` where ``v`` is the vote count and
    ``R`` the average vote of ``x``.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m

    return (votes / total * rating) + (m / total * C)

In [6]:
# `weighted_rating` only does column lookups and arithmetic, so it can be
# evaluated on the whole frame at once instead of row by row with `apply`.
# `assign` + `sort_values` build a new frame rather than mutating in place,
# which keeps this cell safe to re-run.
generative_content = (
    generative_content
    .assign(score=weighted_rating(generative_content))
    .sort_values('score', ascending=False)
)
generative_content[['title', 'vote_count', 'vote_average', 'score']].head()

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385


#### Then we can recommend this sorted list of movies to all users

# Content based recommendation

### Given a movie, we'll recommend similar movies
* 1- based on the overview

In [7]:
# Peek at the first five plot overviews.
data['overview'].head(n=5)

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [8]:
# For the sake of memory, only the first part of the dataset is kept.
# `.loc` slices by label and includes the end label, so with a default integer
# index this keeps the rows labelled 0 through 15000.
# The copy is taken after slicing: only the kept rows are duplicated and
# `data` becomes an independent frame whose columns can be reassigned safely.

data = data.loc[:15000].copy()

In [9]:
# The overviews are text, so they must be turned into word vectors first.

from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings so that every row can be vectorized.
data['overview'] = data['overview'].fillna('')

# TF-IDF weighting with English stop words removed.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['overview'])
tfidf_matrix.shape

(15001, 40226)

In [10]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`; use the new method when it exists and
# fall back to the old one on older installations.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
# Convert to plain `str` so the preview is displayed as a simple list of words.
[str(name) for name in feature_names[1000:1010]]

['adversities',
 'adversity',
 'advertisement',
 'advertises',
 'advertising',
 'advice',
 'advised',
 'adviser',
 'advisers',
 'advises']

In [11]:
# Pairwise similarity between all overviews, computed as the dot product of
# their TF-IDF rows.
# NOTE(review): this equals the cosine similarity only if the TF-IDF rows are
# L2-normalised (the TfidfVectorizer default) - confirm.
from sklearn.metrics.pairwise import linear_kernel

cosine_similarity = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)
cosine_similarity.shape

(15001, 15001)

In [12]:
# Similarity of the movie at position 5 to every movie.
cosine_similarity[5, :]

array([0.        , 0.05243536, 0.        , ..., 0.        , 0.02958675,
       0.        ])

In [13]:
# Reverse lookup: movie title -> row label in `data`.
# `Series.drop_duplicates()` compares the *values* (here the row labels), not
# the index, so it does not remove repeated titles. Deduplicate on the index
# instead, keeping the first movie for each title, so that looking up a title
# always yields a single label.
movies_indices = pd.Series(data.index, index=data['title'])
movies_indices = movies_indices[~movies_indices.index.duplicated(keep='first')]
movies_indices[:5]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [14]:
def get_recommendations(title, cosine_similarity=cosine_similarity, top_n=50):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``movies_indices``.
    cosine_similarity : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the similarity of the
        movie at position ``i`` to every movie.
    top_n : int, default 50
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles taken from ``data``, most similar first. The queried movie
        itself is never part of the result.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``movies_indices``.
    """
    index = movies_indices[title]
    # A title shared by several movies makes the lookup return a Series;
    # fall back to the first match so that `index` is a single position.
    if isinstance(index, pd.Series):
        index = index.iloc[0]

    # Pair every movie position with its similarity to the queried movie.
    # The queried movie is excluded explicitly: it is not guaranteed to be
    # ranked first (another movie may tie with it, or its overview may be
    # empty so that every score is 0).
    similarity_scores = [
        (position, score)
        for position, score in enumerate(cosine_similarity[index])
        if position != index
    ]

    # Highest similarity first; the sort is stable, so ties keep their
    # original order.
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in similarity_scores[:top_n]]

    return data.title.iloc[movie_indices]
    

In [15]:
# Movies whose overview is closest to that of the chosen title; show the top five.
recommendations = get_recommendations(title='Father of the Bride Part II')
recommendations.head(n=5)

6793     Father of the Bride
6571                   Kuffs
6306         North to Alaska
5005                 Wendigo
13611    The Magic of Méliès
Name: title, dtype: object

### We can use these recommendations to pick out the popular movies to show to the user first

* 1- using the global popular movies scores

In [16]:
# Row labels that appear both among the vote-count-filtered movies and among
# the content-based recommendations.
populars_movies_recommended = list(
    set(generative_content.index) & set(recommendations.index)
)
populars_movies_recommended

[1280, 6793, 13420, 1516, 4112, 2900, 2008, 14489, 5819, 926]

In [17]:
# `populars_movies_recommended` holds index *labels*, so the rows are selected
# with the label-based `.loc` rather than the position-based `.iloc`; the two
# only coincide while `data` has a default 0..n-1 index.
data.loc[populars_movies_recommended, ['title', 'vote_count', 'vote_average']]

Unnamed: 0,title,vote_count,vote_average
1280,The Amityville Horror,192.0,6.2
6793,Father of the Bride,355.0,6.2
13420,Funny People,398.0,5.6
1516,George of the Jungle,508.0,5.4
4112,Blow,1352.0,7.4
2900,Creepshow,228.0,6.7
2008,Nineteen Eighty-Four,311.0,6.8
14489,A Single Man,475.0,7.3
5819,Two Weeks Notice,505.0,5.9
926,It's a Wonderful Life,1103.0,8.0


* 2- ranking the recommended movies themselves by their weighted-rating score

In [18]:
# `recommendations.index` holds row labels, so the rows are selected with the
# label-based `.loc` (not the position-based `.iloc`). Selecting with a list
# of labels returns a new frame, independent of `data`.
recommendations = data.loc[
    recommendations.index, ['title', 'vote_count', 'vote_average']
]
# Score all rows at once (`weighted_rating` only needs column lookups and
# arithmetic) and sort without mutating in place.
recommendations = (
    recommendations
    .assign(score=weighted_rating(recommendations))
    .sort_values('score', ascending=False)
)
recommendations.head()

Unnamed: 0,title,vote_count,vote_average,score
926,It's a Wonderful Life,1103.0,8.0,7.698269
4112,Blow,1352.0,7.4,7.21145
14489,A Single Man,475.0,7.3,6.876241
2008,Nineteen Eighty-Four,311.0,6.8,6.398542
2900,Creepshow,228.0,6.7,6.2539
