# **Import necessary libraries**

In [23]:
import numpy as np
import pandas as pd
import ast
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# **Load Datasets**

In [24]:
# Read the two CSV files that the rest of the notebook works from:
# `credits` supplies the cast/crew columns, `movie` supplies the metadata columns.
credits = pd.read_csv("data/tmdb_5000_credits.csv")
movie = pd.read_csv('data/tmdb_5000_movies.csv')

  credits = pd.read_csv("data/tmdb_5000_credits.csv")


# **Merge both datasets on title and keep important columns**

In [25]:
# Join the movie metadata with the credits on the title column.
# Because the key is the title, a title that occurs more than once in either
# frame produces extra cross-matched rows.
# NOTE(review): if both files share a unique movie identifier, joining on it
# would be safer — confirm against the credits schema.
movies = movie.merge(credits, on='title')

# Keep only the columns used to build the tags. `.copy()` makes this an
# independent frame, so the in-place edits and column assignments in later
# cells act on `movies` itself instead of on a slice of the merged frame
# (which is what triggers pandas' SettingWithCopyWarning).
movies = movies[['id', 'title', 'overview', 'keywords', 'genres', 'cast', 'crew']].copy()

# **Drop rows with null values and display the first few rows**

In [26]:
# Remove every row that has a missing value in any of the kept columns, then
# renumber the rows 0..n-1. Dropping rows leaves gaps in the index; resetting it
# keeps row labels equal to row positions, which matters because the similarity
# matrices computed from this frame are addressed by position.
# Reassigning (rather than inplace=True) also keeps the cell safe to re-run.
movies = movies.dropna().reset_index(drop=True)

movies.head()

Unnamed: 0,id,title,overview,keywords,genres,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


# **Process 'overview' column**

In [27]:
# Break each overview string into a list of whitespace-separated words.
movies['overview'] = movies['overview'].map(lambda text: text.split())

# **Process 'keywords' column**

In [28]:
def convert_keywords(obj):
    """Return the 'name' value of every entry in a stringified list of dicts.

    `obj` is parsed with `ast.literal_eval`; each parsed entry must support
    `entry['name']`. The names are returned in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

# Replace each raw keywords string with its list of keyword names.
movies['keywords'] = movies['keywords'].map(convert_keywords)

# **Process 'genres' column**

In [29]:
def convert_genres(obj):
    """Return the 'name' value of every entry in a stringified list of dicts.

    `obj` is parsed with `ast.literal_eval`; each parsed entry must support
    `entry['name']`. The names are returned in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

# Replace each raw genres string with its list of genre names.
movies['genres'] = movies['genres'].map(convert_genres)

# **Process 'cast' column (top 3 only)**

In [30]:
def extract_cast(obj):
    """Return the 'name' of at most the first three entries of a stringified list.

    `obj` is parsed with `ast.literal_eval`. Only the first three parsed entries
    are looked at; later entries are never accessed.
    """
    leading_members = list(ast.literal_eval(obj))[:3]
    return [member['name'] for member in leading_members]

# Replace each raw cast string with the names of its first three cast members.
movies['cast'] = movies['cast'].map(extract_cast)

# **Process the 'crew' column to extract the director**

In [31]:
def extract_director(obj):
    """Return a one-element list with the name of the first crew member whose job is 'Director'.

    Parameters
    ----------
    obj : list, str, or missing value
        Either an already-parsed list of crew dicts, a string that
        `ast.literal_eval` can parse into such a list, or NaN/None.

    Returns
    -------
    list
        `[name]` for the first entry with ``job == 'Director'``, or ``[]`` when
        the value is missing, cannot be parsed, or contains no director.
    """
    # Check for a list BEFORE calling pd.isna: on a list, pd.isna returns an
    # element-wise array, and using that array in `if` raises ValueError for
    # lists with more than one element.
    if isinstance(obj, list):
        crew = obj
    elif pd.isna(obj):
        # NaN / None
        return []
    else:
        try:
            crew = ast.literal_eval(obj)
        except (ValueError, SyntaxError):
            return []

    # The first matching entry wins; later directors are ignored.
    for member in crew:
        if member.get('job') == 'Director':
            return [member.get('name')]
    return []

# Replace each raw crew value with a list holding the director's name (or an empty list).
movies['crew'] = movies['crew'].map(extract_director)

In [32]:
# Display the column labels of the working frame as a sanity check.
movies.columns

Index(['id', 'title', 'overview', 'keywords', 'genres', 'cast', 'crew'], dtype='object')

# **Convert the list-valued 'overview' column to space-joined text**

In [33]:
# Turn each overview back into a single string: word lists are joined with
# spaces, anything that is not a list is passed through str().
movies['overview'] = (
    movies['overview']
    .map(lambda value: str(value) if not isinstance(value, list) else ' '.join(value))
    .fillna('')
)

# **Convert the list-valued 'genres', 'keywords', 'cast' and 'crew' columns to space-joined text and combine them with 'overview' to form the 'tags' column**

In [34]:
def join_list(x):
    """Join the items of a list with single spaces; return '' for any non-list value."""
    if not isinstance(x, list):
        return ''
    return ' '.join(x)

# Build one text blob per movie: the overview followed by the space-joined
# genres, keywords, cast and crew, each separated by a single space.
movies['tags'] = (
    movies['overview']
    + ' ' + movies['genres'].apply(join_list)
    + ' ' + movies['keywords'].apply(join_list)
    + ' ' + movies['cast'].apply(join_list)
    + ' ' + movies['crew'].apply(join_list)
)

# Keep only what the recommender needs. `.copy()` detaches the result from the
# wider frame so that the later `movies['tags'] = ...` assignments modify this
# frame directly instead of a slice (avoids pandas' SettingWithCopyWarning).
movies = movies[['id', 'title', 'tags']].copy()
movies.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


# **Text-preprocessing + Stemming pipeline**

In [35]:
# Fold accented / compatibility characters to their base form before the
# ASCII-only filter runs: NFKD splits e.g. "é" into "e" + a combining accent
# and "³" into "3"; the combining accents (U+0300–U+036F) are then removed.
# Without this step the filter below would replace such characters with spaces
# and break words (typically names) into fragments.
movies['tags'] = (
    movies['tags']
    .str.normalize('NFKD')
    .str.replace(r'[\u0300-\u036f]', '', regex=True)
    .str.lower()
)
# Replace every character that is not a lowercase letter, a digit or a space with a space.
movies['tags'] = movies['tags'].str.replace('[^a-z0-9 ]', ' ', regex=True)
# Collapse each run of whitespace into a single space.
movies['tags'] = movies['tags'].str.replace(r'\s+', ' ', regex=True)

# Single shared stemmer instance used by stem_text.
ps = PorterStemmer()

def stem_text(text):
    """Stem every whitespace-separated word of `text` and re-join with single spaces.

    Any value that is not a `str` yields an empty string.
    """
    if isinstance(text, str):
        return " ".join(map(ps.stem, text.split()))
    return ''

# Stemming happens before vectorization, so the vectorizers see stemmed tokens.
movies['tags'] = movies['tags'].map(stem_text)

# **Vectorization using CountVectorizer and Cosine similarity**

In [36]:
# Bag-of-words model over the tags, with English stop words removed and the
# vocabulary capped at 500 terms.
# NOTE(review): the TF-IDF cell caps its vocabulary at 5000 terms — confirm the
# difference between the two cells is intentional.
cv = CountVectorizer(stop_words='english', max_features=500)

# Term-count matrix with one row per movie, in the row order of `movies`.
vectors_cv = cv.fit_transform(movies['tags'])

# Pairwise cosine similarity between all rows of the count matrix.
similarity_cv = cosine_similarity(vectors_cv)

# **Vectorization using TF-IDF**

In [37]:
# TF-IDF model over the tags, with English stop words removed and the
# vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# TF-IDF matrix with one row per movie, in the row order of `movies`.
vectors_tfidf = tfidf.fit_transform(movies['tags'])

# Pairwise cosine similarity between all rows of the TF-IDF matrix.
similarity_tfidf = cosine_similarity(vectors_tfidf)

# **Recommendation function**

In [38]:
def recommend(movie_title, top_n=10):
    """Print the titles of the movies most similar to `movie_title`.

    Scores are read from the module-level `similarity_cv` matrix, which is
    addressed by row position. The movie is therefore located by its row
    *position* in `movies`, not by its index label; the two differ whenever
    the frame's index is not 0..n-1 (for example after rows were dropped).

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``movies['title']``. If the title occurs
        more than once, the first occurrence is used.
    top_n : int, default 10
        Maximum number of recommendations to print.

    Returns
    -------
    None
        Results are printed; a message is printed if the title is unknown.
    """
    title_matches = np.flatnonzero((movies['title'] == movie_title).to_numpy())
    if title_matches.size == 0:
        print("Movie not found in dataset.")
        return

    movie_pos = title_matches[0]
    scores = np.asarray(similarity_cv[movie_pos])

    # Highest score first; the stable sort keeps tied movies in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the queried movie explicitly instead of assuming it sorts first
    # (another movie with an identical vector would tie with it).
    ranked = ranked[ranked != movie_pos][:max(top_n, 0)]

    print(f"Recommendations for '{movie_title}':\n")

    for pos in ranked:
        print(movies.iloc[pos]['title'])

# **Example usage**

In [39]:
# Print recommendations for "Avatar" using the default number of results.
recommend("Avatar")

Recommendations for 'Avatar':

Mission to Mars
Independence Day
Planet of the Apes
John Carter
Treasure Planet
Lost in Space
Dune
Alien³
Starship Troopers
AVP: Alien vs. Predator


# **Save the data, similarity matrix and CountVectorizer, then load them back for future use**

In [40]:
# Persist the frame, the count-based similarity matrix and the fitted
# CountVectorizer. Each file is opened in a `with` block so the handle is
# flushed and closed as soon as the dump finishes (the bare `open(...)` calls
# left the handles open).
with open('movies_cv.pkl', 'wb') as fh:
    pickle.dump(movies, fh)
with open('similarity_cv.pkl', 'wb') as fh:
    pickle.dump(similarity_cv, fh)
with open('vectorizer_cv.pkl', 'wb') as fh:
    pickle.dump(cv, fh)

In [41]:
# Reload the CountVectorizer artifacts written by the previous cell.
# NOTE: pickle.load can execute arbitrary code contained in the file — only
# load pickle files that you created yourself or otherwise trust.
# `with` closes each file handle once it has been read.
with open('movies_cv.pkl', 'rb') as fh:
    movies_cv = pickle.load(fh)
with open('similarity_cv.pkl', 'rb') as fh:
    similarity_cv = pickle.load(fh)
with open('vectorizer_cv.pkl', 'rb') as fh:
    vectorizer_cv = pickle.load(fh)

# **Save the data, similarity matrix and TF-IDF vectorizer, then load them back for future use**

In [42]:
# Persist the frame, the TF-IDF-based similarity matrix and the fitted
# TfidfVectorizer. Each file is opened in a `with` block so the handle is
# flushed and closed as soon as the dump finishes (the bare `open(...)` calls
# left the handles open).
with open('movies_tfidf.pkl', 'wb') as fh:
    pickle.dump(movies, fh)
with open('similarity_tfidf.pkl', 'wb') as fh:
    pickle.dump(similarity_tfidf, fh)
with open('tfidf.pkl', 'wb') as fh:
    pickle.dump(tfidf, fh)

In [43]:
# Reload the TF-IDF artifacts written by the previous cell.
# NOTE: pickle.load can execute arbitrary code contained in the file — only
# load pickle files that you created yourself or otherwise trust.
# `with` closes each file handle once it has been read.
with open('movies_tfidf.pkl', 'rb') as fh:
    movies_tfidf = pickle.load(fh)
with open('similarity_tfidf.pkl', 'rb') as fh:
    similarity_tfidf = pickle.load(fh)
with open('tfidf.pkl', 'rb') as fh:
    tfidf = pickle.load(fh)