# 1. Importing Libraries

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import json

# 2. Loading Data 

In [2]:
# Read the movie metadata and the cast/crew credits from their CSV files.
# Both paths are relative to the notebook's working directory.
MOVIES_CSV = 'tmdb_5000_movies.csv'
CREDITS_CSV = 'tmdb_5000_credits.csv'

movies = pd.read_csv(MOVIES_CSV)
credits = pd.read_csv(CREDITS_CSV)

# 3. Merging Datasets

In [3]:
# Merge the two datasets on the movie id instead of the title.
# Titles are not guaranteed to be unique (e.g. remakes sharing a name), so a
# title join can pair a movie with another film's credits and duplicate rows.
# The credits' own 'title' column is dropped first so the merged frame keeps a
# single, unsuffixed 'title' column alongside 'id' and 'movie_id'.
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)
movies.head()  # preview first few rows

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


# 4. Data Cleaning and Feature Selection
## 1. Selecting Essential Columns

In [4]:
# Keep only the essential columns for our recommender system.
# .copy() gives an independent frame: later cells assign new values into these
# columns, and assigning into a plain column subset triggers pandas'
# SettingWithCopyWarning (pandas < 3).
essential_columns = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
movies = movies[essential_columns].copy()

## 2. Defining Helper Functions to Parse JSON Data

In [5]:
# Define helper functions to parse JSON strings and extract specific information
def parse_json(text):
    """Safely parse a JSON string into a Python object.

    Args:
        text: The JSON-encoded value to decode (normally a ``str``).

    Returns:
        The decoded Python object, or an empty list when ``text`` cannot be
        decoded: malformed JSON, or a non-string value such as ``None`` or a
        float ``NaN``.
    """
    try:
        return json.loads(text)
    except (TypeError, ValueError):
        # json.JSONDecodeError subclasses ValueError (malformed JSON);
        # TypeError is raised for non-string input. Anything else, such as
        # KeyboardInterrupt, is deliberately left to propagate.
        return []

def get_genres(text):
    """Return the genre names stored in a JSON-encoded list of genre entries.

    Entries without a 'name' key are skipped. Input that ``parse_json``
    cannot decode yields an empty list.
    """
    genre_names = []
    for entry in parse_json(text):
        if 'name' in entry:
            genre_names.append(entry['name'])
    return genre_names

def get_keywords(text):
    """Return the keyword names stored in a JSON-encoded list of keyword entries.

    Entries without a 'name' key are skipped. Input that ``parse_json``
    cannot decode yields an empty list.
    """
    entries = parse_json(text)
    named_entries = (entry for entry in entries if 'name' in entry)
    return [entry['name'] for entry in named_entries]

def get_top_cast(text):
    """Return the names found in the first three cast entries of a JSON string.

    Only the first three entries of the decoded list are inspected, so fewer
    than three names are returned when any of them lacks a 'name' key.
    """
    leading_entries = parse_json(text)[:3]
    return [member['name'] for member in leading_entries if 'name' in member]

def get_director(text):
    """Return a one-element list with the first director's name from a JSON string.

    The decoded crew entries are scanned in order for the first entry whose
    'job' is 'Director'. An empty list is returned when no such entry exists.
    """
    crew_members = parse_json(text)
    directors = (member for member in crew_members if member.get('job') == 'Director')
    first_director = next(directors, None)
    if first_director is None:
        return []
    return [first_director['name']]

## 3. Applying Helper Functions to Clean Columns

In [6]:
# Replace each raw JSON column with the list of names extracted from it.
# NOTE: this cell is not re-runnable on its own. The helpers expect the
# original JSON strings, so re-run from the data-loading cell onwards.
column_extractors = {
    'genres': get_genres,
    'keywords': get_keywords,
    'cast': get_top_cast,
    'crew': get_director,
}
for column, extractor in column_extractors.items():
    movies[column] = movies[column].apply(extractor)

## 4. Combining Features and Lowercasing

In [7]:
# Concatenate the per-movie name lists (genres, keywords, cast, director) and
# flatten each one into a single lowercase, space-separated 'tags' string.
# NOTE(review): names are joined with plain spaces, so a multi-word name such
# as a person's full name is split into separate tokens by the vectorizer.
def _to_tag_string(names):
    """Join a list of names into one lowercase, space-separated string."""
    return " ".join(names).lower()

combined_names = movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
movies['tags'] = combined_names.apply(_to_tag_string)

## 5. Dropping Original Columns

In [8]:
# Keep only the identifier, the title and the combined 'tags' text; the
# individual feature columns are no longer needed once 'tags' exists.
final_columns = ['movie_id', 'title', 'tags']
movies = movies.loc[:, final_columns]

# 5. Vectorization and Similarity Calculation
## 1. Vectorization

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model over the 'tags' text: vocabulary capped at the 5000 most
# frequent terms, with common English stop words removed.
cv = CountVectorizer(stop_words='english', max_features=5000)

# Learn the vocabulary and count term occurrences per movie, then expand the
# sparse result into a dense (movies x terms) array.
tag_counts = cv.fit_transform(movies['tags'])
vectors = tag_counts.toarray()

## 2. Calculating Cosine Similarity

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of movie tag vectors.
# Entry [i, j] is the similarity between movie i and movie j.
similarity = cosine_similarity(X=vectors)

# 6. Recommendation Function

In [11]:
def recommend(movie, top_n=5):
    """
    Print the titles of the movies most similar to the given movie.

    Reads the module-level ``movies`` DataFrame and ``similarity`` matrix;
    row ``i`` of ``similarity`` must correspond to row ``i`` of ``movies``.

    Args:
        movie (str): Exact title of the movie to get recommendations for.
            If several rows share the title, the first one is used.
        top_n (int): Number of recommendations to print. Defaults to 5.

    Raises:
        IndexError: If no row in ``movies`` has the given title.
    """
    # Resolve the *positional* row number of the movie. Positions rather than
    # index labels are required because they are used to index `similarity`
    # and `.iloc`, which are both position-based.
    matches = np.flatnonzero((movies['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError("Movie {!r} not found in the dataset".format(movie))
    movie_index = int(matches[0])

    # Similarity scores of the input movie against every movie
    distances = similarity[movie_index]

    # Exclude the input movie explicitly instead of assuming it sorts first:
    # another row with an identical tag vector ties with it, so the query
    # movie is not guaranteed to be at position 0 after sorting.
    candidates = [pair for pair in enumerate(distances) if pair[0] != movie_index]

    # Highest similarity first; sorted() is stable, so ties keep dataset order
    movie_list = sorted(candidates, reverse=True, key=lambda x: x[1])[:top_n]

    # Print the titles of the recommended movies
    print("Recommendations for {}:".format(movie))
    for position, _score in movie_list:
        print(movies['title'].iloc[position])

# 7. Testing the Recommender

In [13]:
# Print recommendations for a couple of sample titles, in order:
# first 'Avatar', then 'John Carter'
for sample_title in ('Avatar', 'John Carter'):
    recommend(sample_title)

Recommendations for Avatar:
Aliens
Alien³
Alien
Star Trek Into Darkness
Silent Running
Recommendations for John Carter:
Mission to Mars
Ender's Game
The 5th Wave
The Hitchhiker's Guide to the Galaxy
The Host


# 8. Saving the Model

In [12]:
import pickle

# Save the movies DataFrame and the similarity matrix to files.
# The `with` blocks guarantee each file is flushed and closed even if
# pickling fails; a bare open(...) passed to pickle.dump is never closed.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(movies, movies_file)

with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)