In [21]:
import pandas as pd
import numpy as np
import ast
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [22]:
# Load datasets
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [23]:
# Merge datasets on 'title'
movies = movies.merge(credits, on='title')

# Select relevant columns, including 'id'
movies = movies[['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]  # Added 'id'

In [24]:
# Function to convert JSON-like strings to lists
def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name']) 
    return L

In [25]:
# Handle missing values and apply the convert function
movies.dropna(inplace=True)
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

# Function to fetch top 3 cast members
def convert3(text):
    L = []
    counter = 0
    for i in ast.literal_eval(text):
        if counter < 3:
            L.append(i['name'])
            counter += 1
    return L 

movies['cast'] = movies['cast'].apply(convert3)

In [26]:
# Function to fetch the director from crew
def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L 

movies['crew'] = movies['crew'].apply(fetch_director)

In [27]:
# Handle overview: convert to list of words
movies['overview'] = movies['overview'].apply(lambda x: x.split())

# Remove spaces from names to avoid tokenization issues
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])

In [28]:
# Create a 'tags' column by combining all features
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Create a new DataFrame with 'id', 'title', and 'tags'
new_df = movies[['id', 'title', 'tags']]  # Added 'id'

# Convert tags back to a string
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

# Convert tags to lowercase
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


In [29]:
# Stemming
ps = PorterStemmer()
def stems(text):
    T = []
    for i in text.split():
        T.append(ps.stem(i))
    return " ".join(T)

new_df['tags'] = new_df['tags'].apply(stems)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stems)


In [30]:
# Vectorization
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(new_df['tags']).toarray()

# Compute similarity matrix
similarity = cosine_similarity(vector)

# Save the DataFrame as a dictionary to pickle file
pickle.dump(new_df.to_dict(), open('movie_dict.pkl', 'wb'))

# Save the similarity matrix to pickle file
pickle.dump(similarity, open('similarity.pkl', 'wb'))