In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/the-movies-dataset/ratings.csv
/kaggle/input/the-movies-dataset/links_small.csv
/kaggle/input/the-movies-dataset/credits.csv
/kaggle/input/the-movies-dataset/keywords.csv
/kaggle/input/the-movies-dataset/movies_metadata.csv
/kaggle/input/the-movies-dataset/ratings_small.csv
/kaggle/input/the-movies-dataset/links.csv


In [2]:
# Importing CSV files into DataFrames
movies_df = pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv', usecols = [3,5,20])
credits_df = pd.read_csv('/kaggle/input/the-movies-dataset/credits.csv')

In [3]:
# Removing non-numeric values from movies_df['id']
movies_df.drop(movies_df[~movies_df['id'].astype(str).str.match(r'^\d{1,7}$')].index, inplace=True)

# Converting movies_df['id'] to int16
movies_df['id'] = movies_df['id'].astype('int16')

# Merging movies_df and credits_df over the common column 'id'
movies_and_credits = pd.merge(movies_df, credits_df, on='id')

In [4]:
# Droping duplicates from movies_and_credits['id']
movies_and_credits = movies_and_credits.drop_duplicates(subset='id')

# Remove null values
movies_and_credits.dropna(inplace=True)

# Resetting row indices
movies_and_credits = movies_and_credits.reset_index(drop=True)

In [5]:
# Extracting only the needed name substrings from genres, cast and crew columns

import re

genre_values = []

for movie_genres in movies_and_credits['genres']:
    genre_values.append(re.findall("'name': '([^']*)'", movie_genres))

movies_and_credits['genres'] = genre_values

credits_actors = []

for actors in movies_and_credits['cast']:
    credits_actors.append(re.findall("'name': '([^']*)'", actors))
    
movies_and_credits['actors'] = credits_actors

credits_director = []

for director in movies_and_credits['crew']:
    credits_director.append(re.findall("'Director', 'name': '([^']*)'", director))
    
movies_and_credits['director'] = credits_director

In [6]:
# Feature extraction
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(tokenizer=lambda x: x.split('|'))

genres_cv = cv.fit_transform(movies_and_credits['genres'].apply(lambda x: '|'.join(x)))

actors_cv = cv.fit_transform(movies_and_credits['actors'].apply(lambda x: '|'.join(x)))

directors_cv = cv.fit_transform(movies_and_credits['director'].apply(lambda x: '|'.join(x)))



In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Building similarity matrices and combining them
genres_sim = cosine_similarity(genres_cv)
actors_sim = cosine_similarity(actors_cv)
directors_sim = cosine_similarity(directors_cv)

sim = genres_sim + actors_sim + directors_sim

In [8]:
# Normalizing the similarity matrix
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
sim = scaler.fit_transform(sim)

# Creating a mapping between movie titles and indices
indices = pd.Series(movies_and_credits.index, index=movies_and_credits['title'])

# Definition for recommending movies
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies_and_credits['title'].iloc[movie_indices]

In [9]:
'''#Alternative definition that returns the title and similarity score as percentage
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(sim[idx]))
    sim_scores = [(movies_and_credits['title'][i], score) for i, score in sim_scores]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    title_and_sim_df = pd.DataFrame(sim_scores, columns=['title', 'similarity_score'])
    title_and_sim_df['similarity_score'] = ["{:.0%}".format(score) for score in title_and_sim_df['similarity_score']]
    return title_and_sim_df'''

'#Alternative definition that returns the title and similarity score as percentage\ndef get_recommendations(title):\n    idx = indices[title]\n    sim_scores = list(enumerate(sim[idx]))\n    sim_scores = [(movies_and_credits[\'title\'][i], score) for i, score in sim_scores]\n    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n    sim_scores = sim_scores[1:11]\n    title_and_sim_df = pd.DataFrame(sim_scores, columns=[\'title\', \'similarity_score\'])\n    title_and_sim_df[\'similarity_score\'] = ["{:.0%}".format(score) for score in title_and_sim_df[\'similarity_score\']]\n    return title_and_sim_df'

In [10]:
get_recommendations('Se7en')

7988                                 Zodiac
1136                               The Game
3807                             Panic Room
9108    The Curious Case of Benjamin Button
2126                             Fight Club
6661                        A Show of Force
934                                  Alien³
8306                       Eastern Promises
8425                                  Awake
626                      The Maltese Falcon
Name: title, dtype: object