# Recommendation System Development

## Import

In [12]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import json
import ast
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer

## Load Base Datasets

In [13]:
filepath = './datasets/movies_dataset.csv'

df_movies = pd.read_csv(filepath)

filepath = './datasets/credits.csv'

df_credits = pd.read_csv(filepath)


  df_movies = pd.read_csv(filepath)


## Data Processing 

The following steps look to prepate the data for the Recommendation system

### Merging Datasets

In [3]:
df_movies = df_movies[df_movies['id'].apply(lambda x: x.isnumeric())]

In [4]:
# try to cast id column as int and drop rows with incorrect values
df_movies['id'] = df_movies['id'].astype(int)

df_movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45463 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45463 non-null  object 
 1   belongs_to_collection  4491 non-null   object 
 2   budget                 45463 non-null  object 
 3   genres                 45463 non-null  object 
 4   homepage               7779 non-null   object 
 5   id                     45463 non-null  int64  
 6   imdb_id                45446 non-null  object 
 7   original_language      45452 non-null  object 
 8   original_title         45463 non-null  object 
 9   overview               44509 non-null  object 
 10  popularity             45460 non-null  object 
 11  poster_path            45077 non-null  object 
 12  production_companies   45460 non-null  object 
 13  production_countries   45460 non-null  object 
 14  release_date           45376 non-null  object 
 15  re

In [5]:
movies = df_movies.merge(df_credits, on='id')

movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


### Dropping Columns that won't be used

Now, I will get rid of the columns that wont be used for the recommendation system.

Basically, most of the columns wont provide valuable information to estimate the similarity between two rows or movies.

The following columns will be used:

* id: to identify the movie
* genre: Provides information about the main genres that the movie presents.
* Collection: Because, it can be expected that movies from the same collection are going to be similar.
* Overview: Because provides information regarding the plot of the movie.
* Cast: Because provides information regarding the main actors that have a role in the movie.
* Crew/Director: Usually, most directors have a unique style that may be present in most of their movies.
* Production Company: Generally, the production companies have their own approach and unique style when producing movies.



In [6]:
# Dropping movies that are not released yet
movies = movies.dropna(subset=['release_date'])

movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45451 entries, 0 to 45537
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45451 non-null  object 
 1   belongs_to_collection  4497 non-null   object 
 2   budget                 45451 non-null  object 
 3   genres                 45451 non-null  object 
 4   homepage               7779 non-null   object 
 5   id                     45451 non-null  int64  
 6   imdb_id                45437 non-null  object 
 7   original_language      45440 non-null  object 
 8   original_title         45451 non-null  object 
 9   overview               44510 non-null  object 
 10  popularity             45451 non-null  object 
 11  poster_path            45112 non-null  object 
 12  production_companies   45451 non-null  object 
 13  production_countries   45451 non-null  object 
 14  release_date           45451 non-null  object 
 15  re

In [7]:
movies = movies[['id',
                'belongs_to_collection',
                'title',
                'overview',
                'genres',
                'production_companies',
                'cast',
                'crew']]

#### Getting genres values

In [8]:
# Get the Name of the genres in the 'Genres' columns
def convert_genre(text):
    List_genres = []
    for i in ast.literal_eval(text):
        List_genres.append(i['name'])
    return List_genres

movies['genres'] = movies['genres'].apply(convert_genre)
movies.head()

Unnamed: 0,id,belongs_to_collection,title,overview,genres,production_companies,cast,crew
0,862,"{'id': 10194, 'name': 'Toy Story Collection', ...",Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]","[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]","[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]",[{'name': 'Twentieth Century Fox Film Corporat...,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,"{'id': 96871, 'name': 'Father of the Bride Col...",Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


#### Getting Collection

In [9]:
# Get the Name of the Collection if exists
def convert_collection(text):
    if isinstance(text, str):
        collections = []
        column_value =  ast.literal_eval(text)
        collections.append(column_value['name'])
        return collections
    else:
        return []

movies['belongs_to_collection'] = movies['belongs_to_collection'].apply(convert_collection)
movies.head()

Unnamed: 0,id,belongs_to_collection,title,overview,genres,production_companies,cast,crew
0,862,[Toy Story Collection],Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,[],Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]","[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,[Grumpy Old Men Collection],Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]","[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,[],Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]",[{'name': 'Twentieth Century Fox Film Corporat...,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,[Father of the Bride Collection],Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


#### Getting Production company

In [10]:
# Get the most important production companie
def convert_company(text):
    production_company = []
    counter = 0
    for i in ast.literal_eval(text):
        if counter < 1:
            production_company.append(i['name'])
        counter+=1
    return production_company

movies['production_companies'] = movies['production_companies'].apply(convert_company)

movies.head()

Unnamed: 0,id,belongs_to_collection,title,overview,genres,production_companies,cast,crew
0,862,[Toy Story Collection],Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]",[Pixar Animation Studios],"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,[],Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]",[TriStar Pictures],"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,[Grumpy Old Men Collection],Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]",[Warner Bros.],"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,[],Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]",[Twentieth Century Fox Film Corporation],"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,[Father of the Bride Collection],Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],[Sandollar Productions],"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


#### Getting Top 3 Actors

In this case, only the top 3 more relevant actors of each movie are considered.

In [11]:
# GEt top 3 actors

def Top3_actor(text):
    Actors = []
    counter = 0
    for i in ast.literal_eval(text):
        if counter < 3:
            Actors.append(i['name'])
        counter+=1
    return Actors

movies['cast'] = movies['cast'].apply(Top3_actor)
movies.head()

Unnamed: 0,id,belongs_to_collection,title,overview,genres,production_companies,cast,crew
0,862,[Toy Story Collection],Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]",[Pixar Animation Studios],"[Tom Hanks, Tim Allen, Don Rickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,[],Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]",[TriStar Pictures],"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,[Grumpy Old Men Collection],Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]",[Warner Bros.],"[Walter Matthau, Jack Lemmon, Ann-Margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,[],Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]",[Twentieth Century Fox Film Corporation],"[Whitney Houston, Angela Bassett, Loretta Devine]","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,[Father of the Bride Collection],Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],[Sandollar Productions],"[Steve Martin, Diane Keaton, Martin Short]","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


#### Getting Director

Many members of the crew are relevant, but the Director is the more relevant crew member to define the approach and unique style of the movie.

In [12]:
# Get the director
def fetch_director(text):
    director = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            director.append(i['name'])
    return director

movies['crew'] = movies['crew'].apply(fetch_director)
movies.head()

Unnamed: 0,id,belongs_to_collection,title,overview,genres,production_companies,cast,crew
0,862,[Toy Story Collection],Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]",[Pixar Animation Studios],"[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter]
1,8844,[],Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]",[TriStar Pictures],"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",[Joe Johnston]
2,15602,[Grumpy Old Men Collection],Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]",[Warner Bros.],"[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch]
3,31357,[],Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]",[Twentieth Century Fox Film Corporation],"[Whitney Houston, Angela Bassett, Loretta Devine]",[Forest Whitaker]
4,11862,[Father of the Bride Collection],Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],[Sandollar Productions],"[Steve Martin, Diane Keaton, Martin Short]",[Charles Shyer]


### Creating a TAG column

The above defined columns are collapsed into a single string with all the relevant tags that define the characteristics of each movie

In [13]:
# Collapse everything in one column
def clean_spaces(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

movies = movies.dropna()
movies['cast'] = movies['cast'].apply(clean_spaces)
movies['production_companies'] = movies['production_companies'].apply(clean_spaces)
movies['crew'] = movies['crew'].apply(clean_spaces)
movies['genres'] = movies['genres'].apply(clean_spaces)
movies['belongs_to_collection'] = movies['belongs_to_collection'].apply(clean_spaces)
movies['overview'] = movies['overview'].apply(lambda x:x.split())

movies['tags'] = movies['overview'] + movies['genres'] + movies['belongs_to_collection'] \
                + movies['cast'] + movies['crew'] + movies['production_companies']
df = movies.drop(columns=['overview','genres','belongs_to_collection','cast','crew', 'production_companies'])
#new.head()
df['tags'] = df['tags'].apply(lambda x: " ".join(x))
df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['cast'] = movies['cast'].apply(clean_spaces)


Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [None]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['belongs_to_collection'] \
                + movies['cast'] + movies['crew'] + movies['production_companies']
df = movies.drop(columns=['overview','genres','belongs_to_collection','cast','crew', 'production_companies'])
#new.head()
df['tags'] = df['tags'].apply(lambda x: " ".join(x))
df.head()


Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...


## Vectorization

### Imports

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer


### Normalize Tags strings

In order to normalize the data in the 'Tag' column, firstly all string are normalized to lowercase and then use the PorterStemmer to take words to their root.
For example, Lover --> lover --> lov (root of loving, love, loved, etc.)

In [15]:
#Normalize to lowercase
df['tags'] = df['tags'].apply(lambda x : x.lower())

#Reduce words to its family
ps = PorterStemmer()

def stem(text):
  y = []

  for i in text.split():
    y.append(ps.stem(i))

  return " ".join(y)

df['tags'] = df['tags'].apply(stem)




### Vectorization

Now, each movie is represented in a n(5000) dimentional space as a vector. The aim of this representation is to then, use this numeric vectorization to calculate the distance between differnt movies. 
Knowing the distance among movies, it's possible to define the similarity among movies and, then, obtain the TOP 5 movies most similary movies to each one.

In [16]:
# Vectorize the tags column
cv = CountVectorizer(max_features = 5000,
                     stop_words = 'english')
vectors = cv.fit_transform(df['tags']).toarray()

In [17]:
cv.get_feature_names_out()

array(['000', '10', '100', ..., 'zombies', 'zone', 'zoo'], dtype=object)

In [None]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

Cosine Similarity: In this case, I'm going to be using Cosine similarity instead of euclidean distance because according to some studies, the latter is not an accurate distance representation in high dimentional spaces.

In [16]:
similarity = cosine_similarity(vectors)

In [26]:
similarity.shape

(44510, 44510)

## Recommend Function

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [18]:
filepath = '/content/drive/MyDrive/Notebooks/Henry/PI/MLOps/datasets/'

df.to_csv(filepath+'df_recommendation.csv', index=False)

In [14]:
def recommend(movie):
    index = df[df['title'] == movie].index[0]
    print(index)
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(df.iloc[i[0]].title)

In [15]:
recommend('Toy Story')

0
Dibu 3
Walk Like a Man
Awakenings
Operation 'Happy New Year'!
The Master


In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import sklearn
from sklearn.decomposition import PCA
import numpy as np

In [2]:
filepath = './datasets/'

In [3]:
df_recommendation = pd.read_csv(filepath+'df_recommendation.csv')

In [4]:
# Vectorize the tags column
cv = CountVectorizer(max_features = 5000,
                     stop_words = 'english')
vectors = cv.fit_transform(df_recommendation['tags']).toarray()

In [24]:
cv.get_feature_names_out()


array(['000', '10', '100', ..., 'zombies', 'zone', 'zoo'], dtype=object)

In [2]:
filepath = './datasets/'
# Load pickle file with similarity matrix
with open(filepath+"vectors.pkl","rb") as f:
    vectors = pickle.load(f)

vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [25]:
#del(similarity)
similarity = cosine_similarity(vectors)

In [26]:
similarity.shape

(44510, 44510)

In [27]:
similarity_argsort = np.argsort(similarity[0])
similarity_argsort[-6:]

array([30630, 33766,  2988,  7206, 42286,     0], dtype=int64)

In [28]:
number_of_movies = similarity.shape[0]
top_similar_indices = np.zeros((number_of_movies, 5), dtype=int)
for i in range(number_of_movies):
    similarity_argsort = np.argsort(similarity[i])
    top_similar_indices[i] = similarity_argsort[-6:-1]
    

In [29]:
top_similar_indices

array([[30630, 33766,  2988,  7206, 42286],
       [39237, 41082, 29957, 43476, 21527],
       [  454, 22527, 34111,   837,  3321],
       ...,
       [18944, 19074, 19650, 24009, 15991],
       [10110, 44303, 14972, 42521, 29849],
       [23690, 20322,  5938, 33195, 31985]])

In [30]:
top_similar_list = list(top_similar_indices)

top_similar_list


[array([30630, 33766,  2988,  7206, 42286]),
 array([39237, 41082, 29957, 43476, 21527]),
 array([  454, 22527, 34111,   837,  3321]),
 array([11721, 44151, 21369, 14123, 14644]),
 array([ 5569, 19729,  5808, 42133,  6793]),
 array([33384, 24802, 16470, 13360, 22126]),
 array([13533,  3468, 34913,   330, 10811]),
 array([14478, 39594, 33687,   643, 31295]),
 array([13076, 44309,   386, 43476, 14439]),
 array([ 3848,  2824, 18148,  7330,  2866]),
 array([ 8496,  2973, 20488, 35040, 41567]),
 array([ 8207, 33766, 13766,  1575, 14695]),
 array([44120, 25279, 44159, 19091, 43283]),
 array([ 5133, 15355, 22277, 40678, 27057]),
 array([11646, 38498,  9553, 31843, 24387]),
 array([ 7592, 17689, 13360,  5180, 27944]),
 array([33868, 23299, 24032, 27417, 32587]),
 array([25080, 44347, 31189, 41229, 36975]),
 array([24936, 42771,  5646,  9354, 43283]),
 array([30176, 11940,  5078, 29652, 25329]),
 array([24760,  5694, 36592,   316, 39020]),
 array([20434, 16712,  7671, 28193,   474]),
 array([ 2

In [31]:
pickle.dump(top_similar_list,open('./datasets/similar_movies_list.pkl','wb'))

In [34]:
filepath = './datasets/'
# Load pickle file with similarity matrix
with open(filepath+"similar_movies_list.pkl","rb") as f:
    recommendation_list = pickle.load(f)

df = pd.read_csv(filepath+'df_recommendation.csv')

def recommend(movie):
    index = df[df['title'] == movie].index[0]
    recommended_index = recommendation_list[index]

    recommendation = []
    for movie_index in recommended_index:
        recommendation.append(df.iloc[movie_index].title)

    return recommendation


print(recommend('Toy Story'))

['The Thirteenth Year', 'Sandesham', 'Toy Story 2', 'Walk Like a Man', 'John Apple Jack']


In [33]:
movie = 'Toy Story'
index = df[df['title'] == movie].index[0]
row_vector = np.array(vectors[index]).reshape(1, -1)
similarity = cosine_similarity(row_vector, vectors)[0]
distances = sorted(list(enumerate(similarity)),reverse=True,key = lambda x: x[1])
for i in distances[1:26]:
    print(i)
    print(df.iloc[i[0]].title)


(42286, 0.40768712416360564)
Dibu 3
(7206, 0.40406101782088427)
Walk Like a Man
(2988, 0.40050093945740706)
Awakenings
(33766, 0.3969420930187223)
Operation 'Happy New Year'!
(30630, 0.3903600291794132)
The Master
(44482, 0.3857583749052298)
Manitou's Shoe
(39953, 0.3822353935782191)
Skiptrace
(4759, 0.3779644730092272)
Heist
(17385, 0.3779644730092272)
Anton Chekhov's The Duel
(43283, 0.3779644730092272)
White Collar Blues
(619, 0.368925835063097)
Primal Fear
(1575, 0.3636964837266539)
Gattaca
(28230, 0.3626203338114211)
General Spanky
(25678, 0.35651204795369035)
Acapulco, prima spiaggia... a sinistra
(22635, 0.3546496828075953)
It Boy
(4402, 0.35355339059327373)
Permanent Record
(7709, 0.35355339059327373)
Make Way for Tomorrow
(25606, 0.35355339059327373)
Esther and the King
(41800, 0.3525551202038553)
¡A mí la legión!
(9572, 0.3502073841753286)
Little Miss Marker
(27484, 0.34992710611188255)
Santa's Pocket Watch
(37560, 0.34992710611188255)
Jack and the Beanstalk
(1917, 0.34914862

In [23]:
movie1 = 'Toy Story 2'
index = df[df['title'] == movie1].index[0]
row_vector1 = np.array(vectors[index]).reshape(1, -1)
movie2 = 'Toy Story'
index = df[df['title'] == movie2].index[0]
print(index)
row_vector2 = np.array(vectors[index]).reshape(1, -1)
similarity = cosine_similarity(row_vector1, row_vector2)[0]

similarity

0


array([0.14173668])