In [20]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
import numpy as np

In [21]:

# Load the TMDB movie metadata and credits, then join them on the movie id.
movies_df = pd.read_csv('tmdb_5000_movies.csv')
credits_df = pd.read_csv('tmdb_5000_credits.csv')
movies_with_credits = movies_df.merge(credits_df, left_on='id', right_on='movie_id')

# Keep only the columns used below. The merged frame exposes the movie title
# as `title_x`; every later cell reads it as `title`, so rename it once here.
movies_data = (
    movies_with_credits[['title_x', 'vote_average', 'vote_count', 'overview']]
    .rename(columns={'title_x': 'title'})
)

# Fixed seed so the split is reproducible on Restart & Run All.
train_data, test_data = train_test_split(movies_data, test_size=0.2, random_state=42)

# The split keeps the shuffled original row labels. Reset them so a row's
# label equals its position, which is what the similarity matrices built from
# these frames are indexed by. This also yields fresh frames that can be
# modified without pandas' chained-assignment warning.
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)


In [22]:

# Replace missing overviews with empty strings so the TF-IDF vectorizer
# receives text for every row. `assign` rebinds `train_data` to a new frame
# instead of writing into a frame derived from the split, which avoids
# pandas' SettingWithCopyWarning and keeps the cell idempotent.
train_data = train_data.assign(overview=train_data['overview'].fillna(''))


In [23]:

# Vectorize the training overviews with TF-IDF, dropping English stop words.
# The vocabulary and IDF weights are learned from the training split only.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# One row per training movie, one column per vocabulary term.
tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['overview'])


In [24]:

# Pairwise dot products between all training overviews. With a single
# argument, linear_kernel compares the matrix against itself. Because
# TfidfVectorizer L2-normalizes rows by default, these dot products are
# cosine similarities.
cosine_sim = linear_kernel(tfidf_matrix)


In [25]:

def get_recommendations(title, top_n=8, data=None, similarity=None):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title of the query movie. If several rows share the title, the first
        one (in row order) is used.
    top_n : int, default 8
        Maximum number of recommendations to return.
    data : pandas.DataFrame, optional
        Frame with a ``'title'`` column whose row order matches the rows of
        ``similarity``. Defaults to the notebook-level ``train_data``.
    similarity : 2-D array-like, optional
        Pairwise similarity matrix, indexed by row position in ``data``.
        Defaults to the notebook-level ``cosine_sim``.

    Returns
    -------
    pandas.Series
        Titles of the recommended movies, most similar first. The query
        movie itself is never included.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``data['title']``.
    """
    if data is None:
        data = train_data
    if similarity is None:
        similarity = cosine_sim

    # The similarity matrix is indexed by row *position*, not by the frame's
    # index labels (which are shuffled after a train/test split), so look up
    # the position explicitly.
    matches = np.flatnonzero((data['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"title {title!r} not found in the recommendation corpus")
    query_pos = matches[0]

    scores = np.asarray(similarity[query_pos]).ravel()
    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query movie explicitly instead of assuming it sorts first;
    # that assumption fails for tied scores (e.g. an empty overview, whose
    # similarity to every movie, itself included, is 0).
    ranked = ranked[ranked != query_pos][:top_n]
    return data['title'].iloc[ranked]


In [26]:

# Rebuild the similarity model over the whole corpus (train + test) so test
# movies can be queried as well. `ignore_index=True` gives the combined frame
# a fresh 0..n-1 index.
full_data = pd.concat([train_data, test_data], ignore_index=True)
full_data['overview'] = full_data['overview'].fillna('')

# NOTE: this refits the shared `tfidf_vectorizer`, so after this cell its
# vocabulary matches `tfidf_matrix_full`, no longer the train-only
# `tfidf_matrix`.
tfidf_matrix_full = tfidf_vectorizer.fit_transform(full_data['overview'])

# With a single argument, linear_kernel compares the matrix against itself.
cosine_sim_full = linear_kernel(tfidf_matrix_full)


In [19]:

# Recall evaluation over the held-out movies.
# A movie counts as "relevant" when its vote_count is above the corpus median.
# NOTE(review): `get_recommendations_full` and `recall_at_k` are not defined in
# the cells shown here, and this cell's execution count is out of order. Make
# sure both helpers are defined above this cell before Restart & Run All.
threshold = full_data['vote_count'].median()

# The relevant set depends only on `full_data`, so build it once rather than
# on every loop iteration.
relevant_movies = full_data.loc[full_data['vote_count'] > threshold, 'title'].tolist()

recall_values = []
for title in test_data['title']:
    recommendations = get_recommendations_full(title)
    recall_values.append(recall_at_k(recommendations, relevant_movies))
average_recall_at_k = np.mean(recall_values)


KeyError: 'vote_count'

In [None]:

# NDCG evaluation over the held-out movies.
# NOTE(review): `relevant_movies_with_votes` (presumably a title -> vote
# mapping), `get_recommendations_full` and `calculate_ndcg` are not defined in
# the cells shown here; confirm they are defined above this cell.
ndcg_values = []
for title in test_data['title']:
    recommendations = get_recommendations_full(title).tolist()
    # Filter into a separate name. Rebinding `relevant_movies_with_votes`
    # here would shrink the shared relevance table on every iteration, so
    # later movies would be scored against an almost empty table.
    relevant_recommended = {
        movie: vote
        for movie, vote in relevant_movies_with_votes.items()
        if movie in recommendations
    }
    if relevant_recommended:
        ndcg_values.append(calculate_ndcg(recommendations, relevant_recommended))
    else:
        # No relevant movie among the recommendations: score this query as 0.
        ndcg_values.append(0)
average_ndcg = np.mean(ndcg_values)
