
Here's an attempt to create a recommendation engine with this dataset. Our naive assumption is that a person's taste in film does not evolve with time.

In [None]:
# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
# NOTE(review): %pylab brings extra names into the namespace implicitly. The later
# `matplotlib.rc(...)` call has no explicit `import matplotlib` binding that name
# and presumably relies on this magic -- confirm before removing it.
%pylab inline
from subprocess import check_output
# Print the listing of the input directory.
print(check_output(["ls", "../input"]).decode("utf8"))

In [None]:
# NOTE: load_tmdb_credits / load_tmdb_movies are defined in the helper cell that
# follows this one, so that cell must have been executed before this one runs
# on a fresh kernel.
credits = load_tmdb_credits("../input/tmdb_5000_credits.csv")
movies = load_tmdb_movies("../input/tmdb_5000_movies.csv")
print('o')

Since cleaning the data is not the focus of this notebook, I'll just dump it all in one cell. That way we can skip over to the nice parts.

In [None]:
import json
import pandas as pd
#___________________________
def load_tmdb_movies(path):
    """Read the TMDB movies CSV and decode its packed columns.

    `release_date` is parsed and reduced to plain date objects, and each
    JSON-encoded text column is decoded with `json.loads`.

    Parameters
    ----------
    path : anything `pd.read_csv` accepts (file path or file-like object).

    Returns
    -------
    pandas.DataFrame with the decoded columns.
    """
    frame = pd.read_csv(path)
    release_stamps = pd.to_datetime(frame['release_date'])
    frame['release_date'] = release_stamps.apply(lambda stamp: stamp.date())
    for packed in ('genres', 'keywords', 'production_countries',
                   'production_companies', 'spoken_languages'):
        frame[packed] = frame[packed].apply(json.loads)
    return frame
#___________________________
def load_tmdb_credits(path):
    """Read the TMDB credits CSV and decode the JSON text in `cast` and `crew`.

    Parameters
    ----------
    path : anything `pd.read_csv` accepts (file path or file-like object).

    Returns
    -------
    pandas.DataFrame whose `cast` and `crew` cells hold the decoded objects.
    """
    frame = pd.read_csv(path)
    for packed in ('cast', 'crew'):
        frame[packed] = frame[packed].apply(json.loads)
    return frame
#___________________
# Column names with no counterpart in the TMDB -> IMDB mapping below.
# Presumably these are columns of the older IMDB-format dataset that cannot be
# rebuilt from the TMDB files -- TODO confirm. Nothing in the visible notebook
# reads this list.
LOST_COLUMNS = [
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'aspect_ratio',
    'cast_total_facebook_likes',
    'color',
    'content_rating',
    'director_facebook_likes',
    'facenumber_in_poster',
    'movie_facebook_likes',
    'movie_imdb_link',
    'num_critic_for_reviews',
    'num_user_for_reviews']
#____________________________________
# Rename map handed to DataFrame.rename in convert_to_original_format:
# key = column name in the TMDB frame, value = the name it is renamed to.
TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES = {
    'budget': 'budget',
    'genres': 'genres',
    'revenue': 'gross',
    'title': 'movie_title',
    'runtime': 'duration',
    'original_language': 'language',
    'keywords': 'plot_keywords',
    'vote_count': 'num_voted_users'}
#_____________________________________________________
# Not referenced by the visible notebook code.
IMDB_COLUMNS_TO_REMAP = {'imdb_score': 'vote_average'}
#_____________________________________________________
def safe_access(container, index_values):
    """Follow a chain of indexes/keys into a nested container.

    Parameters
    ----------
    container : nested lists/dicts (anything supporting ``[]``).
    index_values : iterable of indexes or keys, applied one after another.

    Returns
    -------
    The value reached after applying every index, or NaN when any step
    fails with an IndexError or a KeyError.
    """
    result = container
    try:
        for idx in index_values:
            result = result[idx]
    # A tuple is required here: `except IndexError or KeyError` evaluates the
    # `or` first and therefore only ever caught IndexError.
    except (IndexError, KeyError):
        # `pd.np` no longer exists in current pandas; use numpy directly.
        return np.nan
    return result
#_____________________________________________________
def get_director(crew_data):
    """Return the name of the first crew entry whose job is 'Director'.

    `crew_data` is an iterable of dict-like entries carrying 'name' and 'job'
    keys. When no entry matches, whatever `safe_access` yields for a failed
    lookup is returned.
    """
    director_names = []
    for member in crew_data:
        if member['job'] == 'Director':
            director_names.append(member['name'])
    return safe_access(director_names, [0])
#_____________________________________________________
def pipe_flatten_names(keywords):
    """Join the 'name' field of every entry with '|' (empty input gives '')."""
    return '|'.join(entry['name'] for entry in keywords)
#_____________________________________________________
def convert_to_original_format(movies, credits):
    """Reshape the TMDB movie/credit frames into the older IMDB-style layout.

    Parameters
    ----------
    movies : DataFrame as returned by `load_tmdb_movies`.
    credits : DataFrame as returned by `load_tmdb_credits`.
        NOTE(review): crew/cast values are assigned by index alignment, so
        the two frames are assumed to describe the same movie on the same
        index label -- confirm against the input files.

    Returns
    -------
    A new DataFrame (the inputs are left untouched) with columns renamed per
    TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES plus `title_year`, `country`,
    `language`, `director_name`, `actor_1_name`..`actor_3_name`, and with
    `genres` / `plot_keywords` flattened to '|'-separated strings.
    """
    # rename() already returns a fresh frame, so no explicit copy is needed.
    tmdb_movies = movies.rename(columns=TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES)
    tmdb_movies['title_year'] = pd.to_datetime(tmdb_movies['release_date']).dt.year
    # I'm assuming that the first production country is equivalent, but have not been able to validate this
    tmdb_movies['country'] = tmdb_movies['production_countries'].apply(
        lambda entries: safe_access(entries, [0, 'name']))
    # Overwrites the 'language' column produced by the rename above with the
    # name of the first spoken language.
    tmdb_movies['language'] = tmdb_movies['spoken_languages'].apply(
        lambda entries: safe_access(entries, [0, 'name']))
    tmdb_movies['director_name'] = credits['crew'].apply(get_director)
    # Cast lists are 0-indexed: actor_1 is entry 0, actor_2 entry 1, actor_3
    # entry 2. The previous code read entries 1..3 and so skipped the first
    # listed cast member entirely.
    for position in range(3):
        tmdb_movies['actor_%d_name' % (position + 1)] = credits['cast'].apply(
            lambda cast, pos=position: safe_access(cast, [pos, 'name']))
    tmdb_movies['genres'] = tmdb_movies['genres'].apply(pipe_flatten_names)
    tmdb_movies['plot_keywords'] = tmdb_movies['plot_keywords'].apply(pipe_flatten_names)
    return tmdb_movies
print('Step 1')

In [None]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import math, nltk, warnings
from nltk.corpus import wordnet
from sklearn import linear_model
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz
from wordcloud import WordCloud, STOPWORDS
# Plot styling: draw patch edges, use the 'fivethirtyeight' style sheet and
# give patches a dim-gray outline.
plt.rcParams["patch.force_edgecolor"] = True
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)
from IPython.core.interactiveshell import InteractiveShell
# Only the last expression of a cell is echoed as output.
InteractiveShell.ast_node_interactivity = "last_expr"
pd.options.display.max_columns = 50
%matplotlib inline
warnings.filterwarnings('ignore')
# NOTE(review): PS is not referenced anywhere else in the visible notebook.
PS = nltk.stem.PorterStemmer()
#__________________
# load the dataset

# `movies` and `credits` are the frames loaded in the earlier data-loading cell.
df = convert_to_original_format(movies, credits)
print('Shape:',df.shape)
#__________________________________________



In [None]:
# Discard the columns that the rest of the notebook does not use.
df = df.drop(
    labels=[
        'homepage',
        'tagline',
        'status',
        'spoken_languages',
        'release_date',
        'production_companies',
        'production_countries',
        'original_title',
        'overview',
        'vote_average',
    ],
    axis=1,
)


In [None]:
# Per-column overview (dtype and non-null count) of the frame after the drop.
df.info(verbose=True)

In [None]:
# How many names occur in exactly one of the three actor columns?
first_actors = set(df['actor_1_name'].unique())
second_actors = set(df['actor_2_name'].unique())
third_actors = set(df['actor_3_name'].unique())
print('Those only in first name',
      len(first_actors.difference(second_actors, third_actors)))
print('Those only in second name',
      len(second_actors.difference(first_actors, third_actors)))
print('Those only in third name',
      len(third_actors.difference(first_actors, second_actors)))
# One 0/1 indicator column per genre, named 'Genre=<label>'.
# str.get_dummies splits on '|' and compares whole labels, so
#   * a label is never matched as a substring / regular expression, and
#   * movies without any genre (empty string) no longer produce a 'Genre='
#     column that is 1 for every row.
genre_dummies = df['genres'].str.get_dummies(sep='|')
unique_genre_labels = set(genre_dummies.columns)
genre_dummies.columns = ['Genre=' + label for label in genre_dummies.columns]
# A single concat instead of one column insertion per genre.
df = pd.concat([df.drop('genres', axis=1), genre_dummies.astype(int)], axis=1)

# Titles are supposed to be unique right?
# A movie is identified by (title, release year); rows repeating such a pair
# are dropped, keeping the first occurrence.
title_key = ['movie_title', 'title_year']
if df.duplicated(subset=title_key).any():
    print('Duplicate Titles Exist')
    # Let's see these duplicates: every row whose title occurs more than once.
    duplicates = df[df.movie_title.map(df.movie_title.value_counts() > 1)]
    # DataFrame.sort no longer exists (sort_values replaces it), and a bare
    # expression inside an `if` is never displayed, hence the explicit print.
    print(duplicates.sort_values('movie_title')[title_key])
    # Looks like there are duplicates after all. Let's drop those.
    df = df.drop_duplicates(subset=title_key)
counts = df.language.value_counts()
df.language = df.language.map(counts)
#df.language
count = df.country.value_counts()
df.country = df.country.map(count)

print('1')


In [None]:
print('start')
# One indicator column per plot keyword, named 'plot_has_<keyword>' with
# spaces turned into hyphens. str.get_dummies splits on '|' and compares whole
# keywords, which fixes three problems of the former per-word str.contains loop:
#   * keywords were interpreted as regular expressions,
#   * a keyword also matched as a substring of longer keywords,
#   * movies without keywords (empty string) produced a 'plot_has_' column
#     that was 1 for every row.
keyword_dummies = df['plot_keywords'].str.get_dummies(sep='|')
unique_words = set(keyword_dummies.columns)
plot_wordbag = list(keyword_dummies.columns)
keyword_dummies.columns = ['plot_has_' + word.replace(' ', '-') for word in plot_wordbag]
if keyword_dummies.columns.duplicated().any():
    # Two keywords can collapse to the same column name once spaces become
    # hyphens; merge such columns (1 if any of them is set).
    keyword_dummies = keyword_dummies.T.groupby(level=0).max().T
# A single concat instead of thousands of single-column insertions.
df = pd.concat([df.drop('plot_keywords', axis=1), keyword_dummies.astype(float)], axis=1)
# Is anything left to be done other than imputing?
print(df.select_dtypes(include=['O']).columns)
# Directors become the number of rows (movies) carrying their name.
df['director_name'] = df['director_name'].map(df['director_name'].value_counts())
# Actors become the number of appearances across all three actor columns.
actor_columns = ['actor_1_name', 'actor_2_name', 'actor_3_name']
counts = pd.concat([df[actor_column] for actor_column in actor_columns]).value_counts()
#counts.head()
for actor_column in actor_columns:
    df[actor_column] = df[actor_column].map(counts)
# I have no clue what to do with the title. I'll keep it for now in order to search by name

# Let's check if anything is left as object
df.select_dtypes(include=['O']).columns
# Titles are supposed to be unique right?
print('2')

In [None]:
# Number of missing entries in each of the two columns, one per line.
for checked_column in ('director_name', 'popularity'):
    print(df[checked_column].isnull().sum())


# Now the data is clean enough. Recommend already!
It's filled with holes though. Pun intended. :D

I wanted to try out some fancy imputation (there's a package by that name too) so here goes.

In [None]:
# hold your horses, we still need to fill those missing values.
# Turn the axes grid off for the plots that follow.
new_style = {'grid': False}
# Use the explicitly imported `mpl` alias; the bare name `matplotlib` is only
# available when a magic such as %pylab happened to inject it.
mpl.rc('axes', **new_style)
# One cell per (row, column) of df; True marks a value that is present.
plt.matshow(~df.isnull())
plt.title('Missing values in the data');

In [None]:
# Let's get those rows which are mostly incomplete. I suspect this was because of our
# new features being created from old ones which were null.
nullcount = df.isnull().sum(axis=1)
# Let's just keep those who have less than a hundred missing values.
# (dropna(thresh=100) expresses a different rule: it keeps rows with at least
# 100 NON-missing values, which is not what the sentence above describes.)
# .copy() makes ndf an independent frame so later column assignments on it
# do not operate on a slice of df.
ndf = df[nullcount < 100].copy()
print(ndf.shape, df.shape)
# Let's see those nulls again

plt.matshow(~ndf.isnull())
plt.title('Missing values in the data');


In [None]:
# We'll treat fillna as a regression / classification problem here.
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

def reg_class_fill(df, column, classifier):
    """Treat missing values as a classification / regression problem.

    Rows with a missing value in any column other than `column` are dropped
    (they can neither train the estimator nor be predicted). The estimator is
    fitted on the rows where `column` is known and its predictions fill the
    rows where `column` is missing.

    Parameters
    ----------
    df : pandas.DataFrame with numeric predictor columns.
    column : name of the column to impute.
    classifier : estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    A new DataFrame with the original row and column order, containing no
    missing values in `column`. `df` itself is not modified.
    """
    feature_columns = [col for col in df.columns if col != column]
    complete = df.dropna(subset=feature_columns)
    missing = complete[column].isnull()

    features = complete[feature_columns]
    classifier.fit(features[~missing], complete.loc[~missing, column])
    if not missing.any():
        return complete

    # Write the predictions back in place of the gaps. The former code
    # concatenated a Series with a raw ndarray (a TypeError) and, through a
    # chained assignment, returned only the target Series instead of the frame.
    filled = complete.copy()
    filled.loc[missing, column] = classifier.predict(features[missing])
    return filled
print('step 4')

In [None]:
# Short aliases for the two estimator classes used by the imputation order.
r, c = KNeighborsRegressor, KNeighborsClassifier  # Regress or classify
# Titles are swapped for integer codes; the encoder is kept so that the codes
# can be turned back into titles later.
title_encoder = LabelEncoder().fit(ndf['movie_title'])
ndf['movie_title'] = title_encoder.transform(ndf['movie_title'])

In [None]:

# Quick look at the popularity column of the filtered frame.
print(ndf.popularity)

In [None]:
# Imputations influence one another, so the order in which columns are filled
# is fixed here. Each entry pairs a column with the estimator class used for it.
# Typically we should do this independently and then combine the results, but meh for now
impute_order = [('budget', r)]
for target_column, estimator_class in impute_order:
    ndf = reg_class_fill(ndf, target_column, estimator_class())
    print(target_column, 'Done')

In [None]:
# Again we check for what else needs to be imputed.
# Missing-value count per column, restricted to the first 25 columns of ndf.
ndf[ndf.columns[:25]].isnull().sum()

In [None]:
# Did we get everything?
# Total number of missing cells in the whole frame (0 means none are left).
ndf.isnull().sum().sum()

In [None]:
# YAY! We did indeed get everything, though it may not have been very good.
# Now we redo the movie title transformation for our searches.
# titles[i] is the decoded title string for the i-th row of ndf, so positions
# in `titles` line up with row positions in ndf.
titles = title_encoder.inverse_transform(ndf.movie_title)

#titles = [i.lower().strip() for i in titles]

# And we are ready to recommend stuff to you love :D
We build a simple KD tree recommender.

In [None]:
# Give us 5 movies that you liked
def get_movies(names):
    """Resolve free-text names to known movie titles.

    For every name, the global `titles` is searched for titles containing the
    name (case-insensitive); the first hit is taken. Names without a hit are
    reported and skipped.

    Returns
    -------
    (moviecodes, movies): the `title_encoder` codes of the matched titles and
    the matched titles themselves, in input order.
    """
    matched = []
    for name in names:
        needle = name.lower()
        found = [title for title in titles if needle in title.lower()]

        if found:
            matched.append(found[0])
            print(name, ': ', found, 'added', matched[-1], 'to movies')
        else:
            print(name, ': ', found)
    print('-'*10)
    return title_encoder.transform(matched), matched
names = [
    'fight club',
    'gump',  # This one is Forrest Gump
    'usual suspects',
    'silence of the lambs',
]
# NOTE: this rebinds `movies`, which until now held the raw movies DataFrame
# loaded at the top of the notebook.
moviecodes, movies = get_movies(names)

In [None]:
# Feature matrix for the neighbour search: every column except the encoded
# title, rescaled by a MinMaxScaler with default settings.
data = MinMaxScaler().fit_transform(ndf.drop('movie_title', axis=1))
print(data)

In [None]:
# We assume KNN's assumptions are valid and proceed to compute a distance matrix
from sklearn.neighbors import KDTree
from collections import Counter

In [None]:
# The titles matched by the last get_movies call.
movies

In [None]:
# All known titles, as decoded from ndf.movie_title.
titles

In [None]:
# Index the scaled feature matrix for nearest-neighbour queries.
# NOTE(review): leaf_size=1 is set explicitly; presumably this was not tuned -- confirm.
tree = KDTree(data, leaf_size=1)


In [None]:
def recommend(movies, tree, titles, data, k=3):
    """Recommend movies close to the liked ones in feature space.

    Parameters
    ----------
    movies : liked titles, in order of decreasing like-able-ness.
    tree : neighbour index exposing ``query(points, k=...)`` that returns
        ``(distances, indices)``, each with one row per query point.
    titles : sequence of titles; position i corresponds to ``data[i]``.
    data : feature rows the tree was built from.
    k : number of neighbours fetched per liked movie (default 3, the value
        that used to be hard-coded).

    Returns
    -------
    List of distinct recommended titles (whitespace-stripped, liked movies
    excluded). Titles that are neighbours of more liked movies come first;
    ties are ordered by increasing weighted distance, then by title.
    """
    titles = list(titles)
    liked = set(movies)
    length = len(movies) + 1
    hit_count = Counter()
    best_score = {}

    for i, movie in enumerate(movies):
        # Earlier (better liked) movies get the larger weight. The distance
        # is divided by it so that their neighbours rank closer; multiplying,
        # as was done before, pushed them further away instead.
        weight = length - i
        dist, index = tree.query([data[titles.index(movie)]], k=k)
        for d, m in zip(dist[0], index[0]):
            title = titles[m]
            if title in liked:
                # The query point itself (and other liked movies) is skipped.
                continue
            title = title.strip()
            hit_count[title] += 1
            score = d / weight
            if score < best_score.get(title, float('inf')):
                best_score[title] = score

    # Frequency first; the weighted distance decides among equally frequent
    # titles (it used to be thrown away in favour of reverse-alphabetical order).
    return sorted(hit_count, key=lambda t: (-hit_count[t], best_score[t], t))

In [None]:

rec = recommend(movies, tree, titles, data)

# Ranked table of the ten best recommendations.
print('Rank | Movie')
print('-----|------')
fmt = '{}.   | {}'
for rank, title in enumerate(rec[:10], start=1):
    print(fmt.format(rank, title))

# Tadaa!
It's not very neat and awesome! But I did like Untraceable to be honest. 
Some movies are recommended twice! Probably because they are quite close to multiple choices.

## What else can be done?

- Feature generation: I've done a nasty job of generating features. That could be cleaned up.
- Imputation: A better way of imputing is welcome. Perhaps even needed, I dare say.
- Some other recommendation method: So far I've only been able to discover KDTrees. If someone could write another one, awesome!

*Upvote* to show your appreciation. :D

# The final product

1. Get movie titles
2. Recommend

In [None]:
names = [
    'unbreakable',
    'django unchained',
    'the sin city',
    'the hobbit',
    'everest',
    'unknown',
    'the grey',
    'superman',
]  # dedicated to A.S.
moviecodes, movies = get_movies(names)
rec = recommend(movies, tree, titles, data)

# Banner, then the complete ranked list framed as a table.
banner_rule = '-'*50
table_rule = '+-----|------'
print(banner_rule)
print('Recommending on the basis of the above movies')
print(banner_rule)
print()
print(table_rule)
print('|Rank | Movie')
print(table_rule)
fmt = '|{}.   | {}'
for rank, title in enumerate(rec, start=1):
    print(fmt.format(rank, title))
print(table_rule)