In [7]:

# Load datasets
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Inspect the datasets
print("Movies Dataset:")
print(movies.head(2))
print("\nCredits Dataset:")
print(credits.head(2))

# Merge on the movie id rather than on 'title': titles are not guaranteed to
# be unique, and joining on a repeated title pairs every same-titled movie
# with every same-titled credits row, producing spurious duplicates.
# The credits copy of 'title' is dropped first so the merged frame keeps a
# single, unsuffixed 'title' column.
# NOTE(review): assumes the credits file carries the id as 'movie_id' (the
# column selection below already requires it) -- confirm against the CSV.
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)

# Retain only the necessary columns and drop rows with missing values.
# Chaining (instead of dropna(inplace=True) on a slice) avoids mutating a
# view, and reset_index keeps index labels equal to row positions so they
# line up with the rows of the similarity matrix built later.
movies = (
    movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
    .dropna()
    .reset_index(drop=True)
)

# **Helper Functions**

# Convert JSON-like strings into lists of names
def convert_to_list(text):
    """Return the 'name' value of every entry in a stringified list of dicts.

    Args:
        text: A Python-literal string such as
            '[{"id": 28, "name": "Action"}]'.

    Returns:
        A list with each entry's 'name', in order. An empty list is returned
        when ``text`` cannot be parsed as a Python literal.
    """
    try:
        entries = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # literal_eval raises SyntaxError (not ValueError) for empty or
        # truncated strings; treat both as "no usable data".
        return []
    return [entry['name'] for entry in entries]

# Extract the top 3 cast members
def extract_top_cast(text, limit=3):
    """Return the names of the first ``limit`` entries of a stringified cast list.

    Args:
        text: A Python-literal string holding a list of dicts, each with a
            'name' key.
        limit: Maximum number of leading entries to keep (default 3).

    Returns:
        A list of at most ``limit`` names, in their original order. An empty
        list is returned when ``text`` cannot be parsed as a Python literal.
    """
    try:
        members = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # literal_eval raises SyntaxError (not ValueError) for empty or
        # truncated strings; treat both as "no usable data".
        return []
    return [member['name'] for member in members[:limit]]

# Extract the director from the crew data
def extract_director(text):
    """Return the names of all crew entries whose 'job' is 'Director'.

    Args:
        text: A Python-literal string holding a list of dicts with 'name'
            and (usually) 'job' keys.

    Returns:
        A list of director names, in their original order. An empty list is
        returned when ``text`` cannot be parsed as a Python literal.
    """
    try:
        crew = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # literal_eval raises SyntaxError (not ValueError) for empty or
        # truncated strings; treat both as "no usable data".
        return []
    # .get() so a crew entry without a 'job' key is skipped instead of
    # aborting the whole row with a KeyError.
    return [member['name'] for member in crew if member.get('job') == 'Director']

# Remove spaces from names to create uniform tags
def remove_spaces(L):
    """Return a new list with every space character stripped from each string in ``L``."""
    compacted = []
    for name in L:
        # Joining the space-separated pieces is the same as deleting the spaces.
        compacted.append("".join(name.split(" ")))
    return compacted

# Parse each JSON-like column into a list of names with its dedicated helper
column_parsers = {
    'genres': convert_to_list,
    'keywords': convert_to_list,
    'cast': extract_top_cast,
    'crew': extract_director,
}
for column, parser in column_parsers.items():
    movies[column] = movies[column].apply(parser)

# Collapse multi-word names into single tokens so each name is one tag
for column in column_parsers:
    movies[column] = movies[column].apply(remove_spaces)

# Tokenize the overview on whitespace so it is a list like the other columns
movies['overview'] = movies['overview'].apply(lambda text: text.split())

# Concatenate the per-movie token lists, then flatten them into one
# space-separated 'tags' string
tag_columns = ['overview', 'genres', 'keywords', 'cast', 'crew']
combined = movies[tag_columns[0]]
for column in tag_columns[1:]:
    combined = combined + movies[column]
movies['tags'] = combined
movies['tags'] = movies['tags'].apply(" ".join)

# Keep just what the recommendation system needs
movies = movies[['movie_id', 'title', 'tags']]

# **Feature Extraction and Similarity Calculation**

# Bag-of-words counts over the tags, capped at 5000 features with English
# stop words removed
cv = CountVectorizer(stop_words='english', max_features=5000)
vector = cv.fit_transform(movies['tags']).toarray()

# Pairwise cosine similarity between every pair of movie vectors
similarity = cosine_similarity(vector)

# **Recommendation Function**

# Recommend movies based on similarity
def recommend(movie_title, top_n=5):
    """Print the titles of the movies most similar to ``movie_title``.

    Reads the module-level ``movies`` frame and ``similarity`` matrix; row
    ``i`` of ``similarity`` is taken to describe the movie at row position
    ``i`` of ``movies``.

    Args:
        movie_title: Exact title to look up in ``movies['title']``. When the
            title occurs more than once, the first occurrence is used.
        top_n: Number of recommendations to print (default 5).

    Returns:
        None. Results (or a "not found" message) are printed.
    """
    # Row *positions* of the matching titles. Index labels must not be used
    # here: after rows have been dropped, labels no longer equal positions,
    # while both `similarity` and `.iloc` below are positional.
    matches = (movies['title'] == movie_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        print(f"'{movie_title}' not found in the dataset.")
        return

    movie_pos = int(matches[0])
    scores = similarity[movie_pos]

    # Rank every other movie by descending similarity. The query movie is
    # excluded explicitly rather than assumed to sort first, which would be
    # wrong whenever another movie ties with it.
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != movie_pos),
        key=lambda pos: scores[pos],
        reverse=True,
    )

    print(f"\nMovies similar to '{movie_title}':")
    for pos in ranked[:top_n]:
        print(movies['title'].iloc[pos])

# Test the recommendation system
recommend("The Lego Movie")

# **Saving the Model and Data**

# Save the processed data and similarity matrix for reuse.
# `with` closes each file deterministically, so the pickles are fully
# flushed to disk instead of relying on garbage collection of the handles.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(movies, movies_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)


Movies Dataset:
      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   

                                       homepage     id  \
0                   http://www.avatarmovie.com/  19995   
1  http://disney.go.com/disneypictures/pirates/    285   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   

                             original_title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   

                                            overview  popularity  \
0  In the 22nd century, a paraplegic Marine is di...  150.437577   
1  Captain Barbossa, long believed to be dead, ha...  139.082615   

                                produ