# Importing libraries and data

In [50]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")


# Use hstack to concat two sparse matrices
from scipy.sparse import hstack

# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel as kernel

# Load the movie metadata, keep only movies with more than 20 votes,
# cast the id to int and expose it as 'movie_id'
movies = (
    pd.read_csv('movies_metadata.csv', index_col=[0])
    .reset_index()
    .loc[lambda frame: frame['vote_count'] > 20]
    .reset_index(drop=True)
    .assign(id=lambda frame: frame['id'].astype(int))
    .rename(columns={'id': 'movie_id'})
)

# Load the keywords table, drop fully duplicated rows and
# rename its id column to movie_id so it can be used as the join key
keywords = (
    pd.read_csv('keywords_new.csv.zip')
    .drop_duplicates()
    .rename(columns={'id': 'movie_id'})
)

# Inner-join 'movies' and 'keywords' on 'movie_id'
movies = movies.merge(keywords, on='movie_id')
movies.shape

(15297, 25)

In [51]:
# Display the keywords table (pandas abbreviates long frames when rendering)
keywords

Unnamed: 0,movie_id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
...,...,...
46414,439050,"[{'id': 10703, 'name': 'tragic love'}]"
46415,111109,"[{'id': 2679, 'name': 'artist'}, {'id': 14531,..."
46416,67758,[]
46417,227506,[]


In [57]:
# Raw keywords string of the row labelled 0, to see the JSON-like format
keywords['keywords'].loc[0]

"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"

# Content-Based Recommendation

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings so the vectorizer only receives text
movies['overview'] = movies['overview'].fillna('')

# TF-IDF vectorizer that ignores common English stop words ('the', 'a', 'and', ...)
tfidf = TfidfVectorizer(stop_words='english')

# One sparse TF-IDF row per movie overview
tfidf_overview = tfidf.fit_transform(movies['overview'])
tfidf_overview.shape

(15297, 38911)

In [18]:
# cosine_sim = kernel(tfidf_overview, tfidf_overview)

## Applying KNN (experiment, currently commented out)

In [19]:
# from sklearn.neighbors import NearestNeighbors

# knn = NearestNeighbors(n_neighbors=10, algorithm='brute')

# knn.fit(tfidf_matrix)

# movies[movies.title=='The Godfather']

# knn.kneighbors(tfidf_matrix[544], 10, return_distance=False)

# movies.loc[15159]

In [20]:
# Construct a reverse map from movie title to the row index of that movie in `movies`.
# Only the first row of each title is kept, so `indices[title]` always yields a
# single integer. (Calling drop_duplicates() on the Series compared the row
# numbers, which are already unique, so it never removed repeated titles.)
# The stored numbers are later used as positions (`.iloc`, similarity-matrix rows),
# which relies on `movies` having the default 0..n-1 index.

indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim, top_n=10):
    """Return the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact movie title; looked up in the module-level ``indices`` Series.
    cosine_sim : 2-D indexable (e.g. a square array)
        ``cosine_sim[i]`` must yield the similarity scores between row ``i``
        of ``movies`` and every row of ``movies``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The 'title' and 'imdb_id' columns of the ``top_n`` highest-scoring
        movies, best first. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]

    # If there are multiple movies with the same title then take the first one.
    # .iloc is required here: the Series is labelled by title, so plain [0]
    # would depend on the deprecated positional fallback.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every other movie with its similarity score. The queried movie is
    # excluded explicitly instead of assuming it sorts first: when its
    # self-score is tied with another movie (or is zero) it is not at position 0.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Highest score first; the sort is stable, so ties keep row order
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the best matches
    movie_indices = [i for i, _ in sim_scores[:top_n]]

    # Return the most similar movies
    return movies[['title', 'imdb_id']].iloc[movie_indices]

In [21]:
import ast

# Converts json field to a comma separated string of its 'name' values
def convert_json_column(df, column_name):
    """Flatten a column of JSON-like record lists into comma separated names.

    Each cell is expected to hold the text of a Python literal such as
    ``"[{'id': 1, 'name': 'Drama'}, ...]"``. Cells that already contain a
    list/tuple of dicts are accepted as well. Missing cells (None/NaN) and
    blank strings produce an empty string instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column_name``.
    column_name : str
        Name of the column to convert.

    Returns
    -------
    pandas.Series
        One string per row: the 'name' values joined with ','.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank cell is not a valid Python literal.
    """
    def _joined_names(raw):
        # Join the 'name' of every record found in a single cell
        if isinstance(raw, str):
            text = raw.strip()
            # literal_eval rejects an empty string, so treat it as "no records"
            records = ast.literal_eval(text) if text else []
        elif isinstance(raw, (list, tuple)):
            records = raw
        elif pd.isna(raw):
            records = []
        else:
            # Unsupported cell types fail the same way they did before
            records = ast.literal_eval(raw)
        return ','.join(record['name'] for record in records)

    return df[column_name].apply(_joined_names)

In [27]:
# Preview the genres column flattened to comma separated genre names
convert_json_column(df=movies, column_name='genres')

0                Animation,Comedy,Family
1               Adventure,Fantasy,Family
2                         Romance,Comedy
3                   Comedy,Drama,Romance
4                                 Comedy
                      ...               
15292    Family,Animation,Romance,Comedy
15293                             Comedy
15294     Comedy,Fantasy,Science Fiction
15295            Fantasy,Action,Thriller
15296               Drama,Action,Romance
Name: genres, Length: 15297, dtype: object

## Using overviews with keywords

In [42]:
# Convert json field to comma separated keywords.
# The untouched JSON-like text is kept in 'keywords_raw' so this cell can be
# re-run: parsing always starts from the raw text rather than from the
# already converted strings, which ast.literal_eval cannot parse.
if 'keywords_raw' not in movies.columns:
    movies['keywords_raw'] = movies['keywords']
movies['keywords'] = convert_json_column(movies, 'keywords_raw')

In [8]:
# TF-IDF over the comma separated keywords column.
# NOTE: this rebinds `tfidf`, so the vectorizer that was fitted on the
# overviews is no longer reachable under that name after this cell runs.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_keywords = tfidf.fit_transform(movies['keywords'])

In [9]:
# Concat keywords and overview tfidf matrices column-wise: each row holds the
# keyword features followed by the overview features of the same movie.
# CSR is requested explicitly because hstack otherwise returns a COO matrix,
# which does not support row indexing such as tfidf_matrix[i].
tfidf_matrix = hstack([tfidf_keywords, tfidf_overview], format='csr')

In [10]:
# Calculate similarity between all movie pairs as the dot product of their
# stacked feature rows (linear kernel).
# NOTE(review): the displayed diagonal is 2.0 for some rows and 1.0 for others,
# so these values are not cosine similarities in [0, 1]. Presumably each TF-IDF
# block is unit-normalised and the score is the sum of a keyword similarity and
# an overview similarity — confirm before interpreting absolute values.
cosine_sim = kernel(tfidf_matrix, tfidf_matrix)

In [14]:
# Inspect the similarity matrix (NumPy abbreviates large arrays when printing)
cosine_sim

array([[2.        , 0.04558618, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.04558618, 2.        , 0.04794193, ..., 0.01503772, 0.        ,
        0.        ],
       [0.        , 0.04794193, 2.        , ..., 0.01570995, 0.        ,
        0.        ],
       ...,
       [0.        , 0.01503772, 0.01570995, ..., 2.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [45]:
# Movies most similar to 'Toy Story' using the combined keyword + overview scores
get_recommendations('Toy Story', cosine_sim)

Unnamed: 0,title,imdb_id
8613,Toy Story 3,tt0435761
2054,Toy Story 2,tt0120363
11798,Toy Story That Time Forgot,tt3473654
12506,Barbie and the Three Musketeers,tt1484922
1183,Small Soldiers,tt0122718
4852,Dolls,tt0092906
1247,Child's Play,tt0094862
1441,Toys,tt0105629
11472,Small Fry,tt2033372
11470,Hawaiian Vacation,tt1850374
