In [121]:
import numpy as np 
import json
import pandas as pd
import math, nltk, warnings
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KDTree
from collections import Counter
warnings.filterwarnings("ignore")

In [122]:
#loading movies data
def load_movies(path):
    df = pd.read_csv(path)
    df['release_date'] = pd.to_datetime(df['release_date']).apply(lambda x: x.date())
    json_columns = ['genres', 'keywords', 'production_countries',
                    'production_companies', 'spoken_languages']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df
#loading crew data
def load_crew(path):
    df = pd.read_csv(path)
    json_columns = ['cast', 'crew']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df
#renaming
column_rename = {'budget': 'budget','genres': 'genres','revenue': 'gross','title': 'movie_title','runtime': 'duration','original_language': 'language','keywords': 'plot_keywords','vote_count': 'num_voted_users'}

def miss_value(container, index_values):
    result = container
    try:
        for idx in index_values:
            result = result[idx]
        return result
    except IndexError or KeyError:
        return pd.np.nan
#getting directors    
def get_director(crew_data):
    directors = [x['name'] for x in crew_data if x['job'] == 'Director']
    return miss_value(directors, [0])    
#converting to string
def join_names(keywords):
    return '|'.join([x['name'] for x in keywords])
#combining movies and crew
def join_two(movies, credits):
    movies_data = movies.copy()
    movies_data.rename(columns=column_rename, inplace=True)
    movies_data['title_year'] = pd.to_datetime(movies_data['release_date']).apply(lambda x: x.year)
    movies_data['country'] = movies_data['production_countries'].apply(lambda x: miss_value(x, [0, 'name']))
    movies_data['language'] = movies_data['spoken_languages'].apply(lambda x: miss_value(x, [0, 'name']))
    movies_data['director_name'] = credits['crew'].apply(get_director)
    movies_data['actor_1_name'] = credits['cast'].apply(lambda x: miss_value(x, [1, 'name']))
    movies_data['actor_2_name'] = credits['cast'].apply(lambda x: miss_value(x, [2, 'name']))
    movies_data['actor_3_name'] = credits['cast'].apply(lambda x:miss_value(x, [3, 'name']))
    movies_data['genres'] = movies_data['genres'].apply(join_names)
    movies_data['plot_keywords'] = movies_data['plot_keywords'].apply(join_names)
    return movies_data    
print('1')

1


In [124]:
credits = load_crew("../input/tmdb_5000_credits.csv")
movies = load_movies("../input/tmdb_5000_movies.csv")
data_set = join_two(movies, credits)


In [125]:
#dropping unwanted columns
data_set=data_set.drop(['homepage','tagline','status','spoken_languages','release_date','production_companies','production_countries','original_title','overview','vote_average'],axis=1)

In [126]:
#getting actors names
first_actors = set(data_set.actor_1_name.unique())
second_actors = set(data_set.actor_2_name.unique())
third_actors = set(data_set.actor_3_name.unique())
unique_genre_labels = set()
for genre_flags in data_set.genres.str.split('|').values:
    unique_genre_labels = unique_genre_labels.union(set(genre_flags))
for label in unique_genre_labels:
    data_set['Genre='+label] = data_set.genres.str.contains(label).astype(int)
data_set = data_set.drop('genres', axis=1)

#Dropping_duplicates
if len(data_set.drop_duplicates(subset=['movie_title',
                                  'title_year'])) < len(data_set):
    print('Duplicate Titles Exist')
    
    duplicates = data_set[data_set.movie_title.map(data_set.movie_title.value_counts() > 1)]
    duplicates.sort('movie_title')[['movie_title', 'title_year']]
   
    data_set = data_set.drop_duplicates(subset=['movie_title', 'title_year'])

    duplicates = data_set[data_set.movie_title.map(data_set.movie_title.value_counts() > 1)]
    duplicates.sort('movie_title')[['movie_title', 'title_year']]
   
    data_set = data_set.drop_duplicates(subset=['movie_title', 'title_year'])
    
counts = data_set.language.value_counts()
data_set.language = data_set.language.map(counts)

count = data_set.country.value_counts()
data_set.country = data_set.country.map(count)

print('1')

1


In [127]:
#getting unique keywords and handling categorical values
unique_words = set()
for wordlist in data_set.plot_keywords.str.split('|').values:
    if wordlist is not np.nan:
        unique_words = unique_words.union(set(wordlist))
plot_wordbag = list(unique_words)
for word in plot_wordbag:
    data_set['plot_has_' + word.replace(' ', '-')] = data_set.plot_keywords.str.contains(word).astype(float)
data_set = data_set.drop('plot_keywords', axis=1)

data_set.director_name = data_set.director_name.map(data_set.director_name.value_counts())

counts = pd.concat([data_set.actor_1_name, data_set.actor_2_name, data_set.actor_3_name]).value_counts()

data_set.actor_1_name = data_set.actor_1_name.map(counts)
data_set.actor_2_name = data_set.actor_2_name.map(counts)
data_set.actor_3_name = data_set.actor_3_name.map(counts)
data_set.select_dtypes(include=['O']).columns

print('2')

2


In [128]:
#dropping rows which has null values

data_set=data_set.dropna()

#storing movie names in an array

titles=data_set.movie_title
def recoms(names):
    movies = []
    for name in names:
        found = [i for i in titles if name.lower() in i.lower()]
        
        if len(found) > 0:
            movies.append(found[0])
    return  movies
data = data_set.drop('movie_title', axis=1)
data = MinMaxScaler().fit_transform(data)

#forming a KD Tree with our data

tree = KDTree(data, leaf_size=1)

#recommending movies based on users movies list

def final_recommendations(movies, tree, titles, data):
    titles = list(titles)
    length, recommendations = len(movies) + 1,[]
    
    for i, movie in enumerate(movies):
        weight = length - i
        dist, index = tree.query([data[titles.index(movie)]], k=4)
        for d, m in zip(dist[0], index[0]):
            recommendations.append((d*weight, titles[m]))
    recommendations.sort()
    rec = [i[1].strip() for i in recommendations if i[1] not in movies]
    
    rec = [i[1] for i in sorted([(v, k) for k, v in Counter(rec).items()],
                                reverse=True)]
    return rec

print('step 4')

step 4


In [130]:
#Finally,you can get some great movies!

user_movies = []
print('Type the number of movies u want to enter')
number=input()
print('Type',number,'movies')
for i in range(int(number)):
    user_input = input()
    user_movies += [user_input]

movies = recoms(user_movies)
result = final_recommendations(movies, tree, titles, data)
print('Recommended Movies are:')
ft = '{}. {}'
for i, item in enumerate(result[:len(result)]):
    print(ft.format(i + 1, item))


Type the number of movies u want to enter
5
Type 5 movies
3:10 to Yuma
Tombstone
Unforgiven
The Assassination of Jesse James by the Coward Robert Ford
Open Range
Recommended Movies are:
1. Western Religion
2. Shanghai Calling
3. Duel in the Sun
4. Slow Burn
5. S.W.A.T.
6. Red Riding: In the Year of Our Lord 1974
7. Nowhere to Run
8. Money Talks
9. Jane Got a Gun
10. Forsaken
11. F.I.S.T.
12. Broken Horses
