In [None]:
# %%
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns

# Read the three source tables (movies, ratings, tags) from CSV files located
# in the current working directory.
movies_df, ratings_df, tags_df = (
    pd.read_csv(csv_path)
    for csv_path in ('movies.csv', 'ratings.csv', 'tags.csv')
)

In [None]:
# Preview the leading rows of every loaded table (one rich display per table)
display(movies_df.head(), ratings_df.head(), tags_df.head())

In [None]:
# Headline sizes: row counts of each table plus the number of distinct user ids
# found in the ratings table.
num_movies, num_ratings, num_tags = map(len, (movies_df, ratings_df, tags_df))
num_users = ratings_df['userId'].nunique(dropna=False)

print(f"Number of movies: {num_movies}")
print(f"Number of users: {num_users}")
print(f"Number of ratings: {num_ratings}")
print(f"Number of tags: {num_tags}")


In [None]:
# Histogram of the 'rating' column over all rows of ratings_df
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=ratings_df, x='rating', bins=10, ax=ax)
ax.set_title('Distribution of Movie Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Analyze genres
# Turn the pipe-delimited genre string of each movie into a list of genres.
# Only string values are split: if this cell is re-run, the column already holds
# lists, and calling `.str.split` on those would not give lists back, breaking
# both the counting below and any later code that expects a list per movie.
movies_df['genres'] = movies_df['genres'].apply(
    lambda value: value.split('|') if isinstance(value, str) else value
)

# One entry per (movie, genre) pair; missing genre values are skipped instead of
# raising while iterating over a non-iterable.
all_genres = movies_df['genres'].explode().dropna().tolist()
genre_counts = pd.Series(all_genres, dtype=object).value_counts()

# Bar chart of how many movies carry each genre (most frequent first)
plt.figure(figsize=(12, 6))
genre_counts.plot(kind='bar')
plt.title('Distribution of Movie Genres')
plt.xlabel('Genre')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Build the user x movie rating table: one row per userId, one column per
# movieId, cell = that user's rating. Pairs without a rating are filled with 0.
user_movie_matrix = (
    ratings_df
    .set_index(['userId', 'movieId'])['rating']
    .unstack('movieId')
    .fillna(0)
)


In [None]:

# Item-item similarity from ratings: transpose so that each row is one movie's
# vector of user ratings, then take pairwise cosine similarity between rows.
# Row/column i of the result corresponds to user_movie_matrix.columns[i].
movie_similarity = cosine_similarity(user_movie_matrix.transpose())

# %%
# Content-Based Filtering using TF-IDF on tags
# Build one text "document" per movie by joining all of its tags with spaces.
# Missing tags become empty strings and every tag is coerced to str so the join
# cannot fail on a non-string value.
tags_df['tag'] = tags_df['tag'].fillna('').astype(str)
tag_data = tags_df.groupby('movieId')['tag'].agg(' '.join).reset_index()

# Drop a 'tag' column left over from a previous run of this cell. Without this,
# re-running would make the merge emit 'tag_x'/'tag_y' columns and the accesses
# to movies_df['tag'] below would raise KeyError.
movies_df = movies_df.drop(columns='tag', errors='ignore').merge(
    tag_data, on='movieId', how='left'
)
# Movies that nobody tagged get an empty document
movies_df['tag'] = movies_df['tag'].fillna('')

# TF-IDF vectors of the tag documents; row i of the similarity matrix corresponds
# to row i of movies_df.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(movies_df['tag'])
content_movie_similarity = cosine_similarity(tfidf_matrix)



In [None]:

# Hybrid Similarity Calculation
# The collaborative matrix (movie_similarity) is ordered by the columns of
# user_movie_matrix, i.e. only the movies that appear in ratings_df, whereas the
# content matrix follows the row order of movies_df. Re-index the collaborative
# scores onto movies_df's row order so both matrices have the same shape and
# position i refers to the same movie in each. Pairs involving a movie that has
# no ratings get a collaborative similarity of 0.
movie_order = movies_df['movieId'].to_numpy()
collab_movie_similarity = (
    pd.DataFrame(
        movie_similarity,
        index=user_movie_matrix.columns,
        columns=user_movie_matrix.columns,
    )
    .reindex(index=movie_order, columns=movie_order, fill_value=0.0)
    .to_numpy()
)
assert collab_movie_similarity.shape == content_movie_similarity.shape, (
    "collaborative and content similarity matrices must cover the same movies"
)

# Equal-weight average of the two similarity signals
hybrid_movie_similarity = (collab_movie_similarity + content_movie_similarity) / 2

# %%
# Pick the query movie (movieId 1, e.g. Toy Story) and rank every movie by its
# hybrid similarity to it, most similar first. `similar_movies` is a list of
# (row position, score) pairs.
target_movie_id = 1
target_movie_idx = movies_df.index[movies_df['movieId'].eq(target_movie_id)][0]
similar_scores = hybrid_movie_similarity[target_movie_idx]
similar_movies = sorted(
    enumerate(similar_scores),
    key=lambda pos_score: pos_score[1],
    reverse=True,
)


In [None]:

# %%
# Evaluate recommendation accuracy
# A recommendation counts as "correct" when it shares at least one genre with the
# query movie. The reported accuracy is the mean fraction of correct
# recommendations over a reproducible random sample of query movies.
N_EVAL_MOVIES = 100  # number of query movies to sample
TOP_K = 5            # recommendations taken per query movie

# Rows/columns of hybrid_movie_similarity are addressed by position, so work with
# positional arrays instead of re-filtering movies_df for every lookup.
movie_ids = movies_df['movieId'].to_numpy()
genre_sets = [
    set(genres) if isinstance(genres, (list, tuple, set, str)) else set()
    for genres in movies_df['genres']
]

# Cap the sample size so a small catalogue does not make `sample` raise
test_movies = movies_df['movieId'].sample(
    n=min(N_EVAL_MOVIES, len(movies_df)), random_state=42
)
accuracies = []

for movie_id in test_movies:
    movie_idx = np.flatnonzero(movie_ids == movie_id)[0]
    scores = np.asarray(hybrid_movie_similarity[movie_idx])

    # Descending order; the stable sort keeps tied scores in row order.
    ranked_idx = np.argsort(-scores, kind='stable')
    # Exclude the query movie explicitly: with tied scores it is not guaranteed
    # to be ranked first, so slicing off position 0 could drop a real
    # recommendation and let the movie recommend itself.
    top_idx = ranked_idx[ranked_idx != movie_idx][:TOP_K]
    if top_idx.size == 0:
        continue  # nothing to recommend (single-movie catalogue)

    input_genres = genre_sets[movie_idx]
    correct_recs = sum(1 for rec_idx in top_idx if input_genres & genre_sets[rec_idx])
    # Divide by the number of recommendations actually made, not a fixed 5
    accuracies.append(correct_recs / top_idx.size)

accuracy = np.mean(accuracies) if accuracies else float('nan')
print(f"Recommendation Accuracy: {accuracy:.2f}")
