In [3]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score , recall_score , f1_score
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from textblob import TextBlob

In [None]:
# Load the datasets: every CSV is read into a DataFrame of the same name.

# Movie descriptions, credits and keywords.
# low_memory=False makes pandas parse the whole file in one pass rather than
# in chunks, which avoids mixed-dtype columns.
movies_metadata = pd.read_csv('movies_metadata.csv', low_memory=False)
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

# Link tables (the full one is joined to the movie data on 'tmdbId' later on).
links_small = pd.read_csv('links_small.csv')
links = pd.read_csv('links.csv')

# User ratings, full and reduced versions.
ratings = pd.read_csv('ratings.csv')
ratings_small = pd.read_csv('ratings_small.csv')

In [None]:
# Handling missing values
# Keep only the rows that have both a title and an id: 'id' is the key the
# metadata is merged on, and 'title' is what the recommendation functions use
# to look a movie up.
movies_metadata = movies_metadata.dropna(subset=['title', 'id'])

In [None]:
# Convert ids to integers so that data can be merged
def _with_integer_ids(frame, column='id'):
    """Return a copy of `frame` whose `column` holds clean integer ids.

    Values that cannot be parsed as numbers (and missing values) become NaN via
    pd.to_numeric(errors='coerce') and their rows are dropped, because a row
    without a usable id cannot be joined. A plain astype(int) would instead
    abort the whole cell on the first bad value.
    """
    numeric_ids = pd.to_numeric(frame[column], errors='coerce')
    valid = numeric_ids.notna()
    cleaned = frame.loc[valid].copy()
    # .to_numpy() assigns by position, so a non-unique index cannot misalign it.
    cleaned[column] = numeric_ids[valid].astype(int).to_numpy()
    return cleaned


movies_metadata = _with_integer_ids(movies_metadata)
keywords = _with_integer_ids(keywords)
credits = _with_integer_ids(credits)

# Merging datasets on 'id'.
# A repeated id on either side of a join multiplies the matching rows, so each
# table is reduced to one row per id first (a no-op when the ids are unique).
merged_data = (
    movies_metadata.drop_duplicates(subset='id')
    .merge(credits.drop_duplicates(subset='id'), on='id')
    .merge(keywords.drop_duplicates(subset='id'), on='id')
)

# Merging with links: the movie 'id' is matched against the links 'tmdbId'.
merged_data = merged_data.merge(links, left_on='id', right_on='tmdbId')

print(merged_data.head())

In [None]:
# Example EDA
# DataFrame.info() writes its summary (non-null count and dtype of every
# column, plus the memory usage of the frame) directly to stdout and returns
# None, so it is called on its own: wrapping it in print() would add a stray
# "None" line. The dtypes and missing-value counts show which columns need
# cleaning or conversion before they can be fed to a model.
movies_metadata.info()
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
print(movies_metadata.describe())

In [2]:
# Histograms of revenue and runtime, side by side, to get an idea of how the
# two columns are distributed.
# If the else branch runs, something went wrong during preprocessing.
if 'revenue' in movies_metadata.columns and 'runtime' in movies_metadata.columns:
    fig, axes = plt.subplots(1, 2, figsize=(10, 6))

    # (column, x-axis label, panel title) for each of the two panels.
    panels = [
        ('revenue', 'Revenue (USD)', 'Distribution of Movie Revenue'),
        ('runtime', 'Runtime (minutes)', 'Distribution of Movie Runtime'),
    ]
    for ax, (column, xlabel, panel_title) in zip(axes, panels):
        # Missing values cannot be binned; drop them explicitly instead of
        # relying on how the installed matplotlib/numpy treats NaN.
        ax.hist(movies_metadata[column].dropna())
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Number of movies')
        ax.set_title(panel_title)

    fig.tight_layout()
    plt.show()
else:
    print("Columns 'revenue' or 'runtime' are missing. Skipping histograms.")

NameError: name 'movies_metadata' is not defined

In [None]:
# Average vote_average per value of the 'genres' column, highest first.
# The best-rated groups can later be given more weight when recommending movies.
# NOTE(review): the grouping key is the raw 'genres' cell, so movies are grouped
# by their exact 'genres' value rather than by each individual genre - confirm
# that this is what is wanted.
if 'genres' in movies_metadata.columns and 'vote_average' in movies_metadata.columns:
    # Impute missing values with a forward fill (replace with your preferred
    # imputation method). DataFrame.ffill() replaces the deprecated
    # fillna(method='ffill') spelling.
    imputed_movies_metadata = movies_metadata.ffill()
    # Group by genres and calculate average vote
    genre_groups = imputed_movies_metadata.groupby('genres')['vote_average'].mean()
    print("\nAverage vote average by genre:")
    print(genre_groups.sort_values(ascending=False))  # Sort by highest average
    # Row of the movie with the largest vote_count (kept for later inspection).
    movie_with_most_votes = imputed_movies_metadata.loc[imputed_movies_metadata['vote_count'].idxmax()]

else:
    print("Columns 'genres' or 'vote_average' are missing. Skipping average vote average by genre.")


In [None]:
# Calculate popularity score based on vote counts:
# popularity of a movie = number of ratings it has received.
popularity_score = ratings.groupby('movieId')['rating'].count().reset_index(name='popularity')

# Merge popularity score with ratings data.
# Any 'popularity' column left over from a previous run of this cell is removed
# first; otherwise the merge would produce 'popularity_x' / 'popularity_y' and
# later lookups of ratings['popularity'] would fail.
ratings = (
    ratings.drop(columns='popularity', errors='ignore')
    .merge(popularity_score, on='movieId')
)


In [None]:

# Basic sentiment analysis on the overview, so that movies with a more positive
# text get more weight during prediction.
# A Bayesian classifier was deliberately not used: it is too complex and
# time-consuming for such a simple sentiment signal.
def basic_sentiment(text):
    """Classify `text` as 'Positive', 'Negative' or 'Neutral'.

    The label is the sign of TextBlob's polarity score for the text.

    Parameters
    ----------
    text : str
        Text to classify. Anything that is not a string (None, NaN, ...) and
        any empty or whitespace-only string is reported as 'Neutral' without
        calling TextBlob, so missing overviews cannot raise.

    Returns
    -------
    str
        'Positive' if polarity > 0, 'Negative' if polarity < 0, else 'Neutral'.
    """
    if not isinstance(text, str) or not text.strip():
        return 'Neutral'

    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'


# Numeric value attached to each sentiment label.
sentiment_map = {'Negative': -1, 'Neutral': 0, 'Positive': 1}

# Missing overviews are replaced by an empty string so that every entry handed
# to basic_sentiment is a string (avoids a TypeError inside TextBlob).
movies_metadata['overview'] = movies_metadata['overview'].fillna('')

# Label every overview, then translate the label into its numeric score.
movies_metadata['sentiment'] = movies_metadata['overview'].map(basic_sentiment)
movies_metadata['sentiment_score'] = movies_metadata['sentiment'].map(sentiment_map)


In [None]:
# Visualizing rating distribution: histogram (10 bins) of the ratings in
# ratings_small with a kernel-density curve on top.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(ratings_small['rating'], bins=10, kde=True, ax=ax)
ax.set_title('Rating Distribution')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Function to get the main cast
def get_main_cast(cast):
    """Return the names of the first three cast entries as 'A, B, C'.

    Parameters
    ----------
    cast : str or list
        Either the textual form of a list of dicts that carry a 'name' key, or
        such a list itself.

    Returns
    -------
    str
        Comma-separated names taken from the first three entries; '' when the
        value is missing, cannot be parsed, or is not a list.

    The text is parsed with ast.literal_eval, which only accepts Python
    literals, instead of eval, which would execute arbitrary code found in
    the data file.
    """
    try:
        members = ast.literal_eval(cast) if isinstance(cast, str) else cast
    except (ValueError, SyntaxError, TypeError):
        return ''
    if not isinstance(members, (list, tuple)):
        return ''
    return ', '.join(
        str(member['name'])
        for member in members[:3]
        if isinstance(member, dict) and member.get('name')
    )

# Function to get the director
def get_director(crew):
    """Return the name of the first crew entry whose job is 'Director'.

    Parameters
    ----------
    crew : str or list
        Either the textual form of a list of dicts with 'job' and 'name' keys,
        or such a list itself.

    Returns
    -------
    str
        The director's name; '' when there is no director entry or when the
        value is missing, cannot be parsed, or is not a list.

    The text is parsed with ast.literal_eval, which only accepts Python
    literals, instead of eval, which would execute arbitrary code found in
    the data file.
    """
    try:
        members = ast.literal_eval(crew) if isinstance(crew, str) else crew
    except (ValueError, SyntaxError, TypeError):
        return ''
    if not isinstance(members, (list, tuple)):
        return ''
    for member in members:
        # .get() tolerates entries that have no 'job' key.
        if isinstance(member, dict) and member.get('job') == 'Director':
            return member.get('name', '')
    return ''

# Create a new column for main cast and director
merged_data['main_cast'] = merged_data['cast'].apply(get_main_cast)
merged_data['director'] = merged_data['crew'].apply(get_director)

# Combine genres, keywords, main cast, and director into a single string,
# with one space between the four parts.
# The parts are concatenated directly: passing the already-concatenated string
# to ' '.join() would iterate over it character by character and insert a
# space between every letter, leaving nothing for the TF-IDF tokenizer to use.
# NOTE(review): 'genres' and 'keywords' are used exactly as stored; if they
# hold serialized lists like 'cast'/'crew' do, extracting the names first would
# give cleaner features - confirm.
feature_parts = merged_data[['genres', 'keywords', 'main_cast', 'director']].fillna('').astype(str)
merged_data['combined_features'] = (
    feature_parts['genres'] + ' '
    + feature_parts['keywords'] + ' '
    + feature_parts['main_cast'] + ' '
    + feature_parts['director']
)

In [None]:
# Content-based filtering
def content_filtering(merged_data, max_heatmap_size=500):
    """Build the sentiment-weighted content similarity matrix and plot it.

    Parameters
    ----------
    merged_data : pandas.DataFrame
        Needs the columns 'combined_features' (text) and 'id'. Missing
        'combined_features' values are replaced by '' in place.
    max_heatmap_size : int or None, default 500
        Only the top-left block of this many rows/columns is drawn, because a
        heatmap of the complete N x N matrix is not practical for large N.
        Pass None to draw everything.

    Returns
    -------
    numpy.ndarray
        N x N matrix; entry [i, j] is the TF-IDF similarity between movie i
        and movie j multiplied by the sentiment score of movie j.

    Notes
    -----
    Reads the module-level `movies_metadata` frame (columns 'id' and
    'sentiment_score'). The similarity matrix is dense, so memory grows with
    the square of the number of rows.
    """
    # Create TF-IDF matrix for the combined text features
    tfidf = TfidfVectorizer(stop_words='english')
    merged_data['combined_features'] = merged_data['combined_features'].fillna('')
    tfidf_matrix = tfidf.fit_transform(merged_data['combined_features'])

    # Compute cosine similarity matrix (TF-IDF rows are L2-normalised by
    # default, so the linear kernel equals the cosine similarity).
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    # Look the sentiment score up by movie id. movies_metadata and merged_data
    # do not have the same rows, so taking the scores by position would attach
    # them to the wrong movies (or fail to broadcast). Movies without a score
    # are treated as neutral (0).
    sentiment_by_id = (
        movies_metadata.drop_duplicates(subset='id').set_index('id')['sentiment_score']
    )
    sentiment_scores = merged_data['id'].map(sentiment_by_id).fillna(0).to_numpy()

    # Weight every *candidate* (column) by its own sentiment. Scaling the rows
    # instead would multiply all scores of one query movie by the same factor,
    # which either leaves its ranking unchanged (+1), wipes it out (0) or
    # reverses it (-1).
    adjusted_cosine_sim = cosine_sim * sentiment_scores[np.newaxis, :]

    # Plotting the similarity matrix as a heatmap
    heatmap_data = adjusted_cosine_sim
    if max_heatmap_size is not None:
        heatmap_data = adjusted_cosine_sim[:max_heatmap_size, :max_heatmap_size]
    plt.figure(figsize=(10, 8))
    sns.heatmap(heatmap_data, cmap='viridis')
    plt.title('Movie Similarity Matrix')
    plt.xlabel('Movies')
    plt.ylabel('Movies')
    plt.show()

    return adjusted_cosine_sim

# Example usage: keep the matrix under the name that get_recommendations uses
# as its default similarity matrix.
adjusted_cosine_sim = content_filtering(merged_data)



In [None]:

# Using only collaborative filtering
# Example of k-NN for collaborative filtering

# Define collaborative filtering function
def collaborative_filtering(ratings, n_recommendations=5):
    """Item-based k-NN: for every movie, find the movies rated most similarly.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Needs the columns 'userId', 'movieId' and 'rating', with at most one
        row per (userId, movieId) pair (DataFrame.pivot raises otherwise).
    n_recommendations : int, default 5
        Maximum number of similar movies to return per movie.

    Returns
    -------
    dict
        {movieId: [movieId, ...]} with the nearest movies first (cosine
        distance between rating vectors), never containing the movie itself.

    Notes
    -----
    The user x movie matrix is dense, so this is only practical for a ratings
    table of moderate size.
    """
    # Create the user-item interaction matrix (unrated = 0)
    ratings_pivot = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

    movie_ids = ratings_pivot.columns.tolist()
    if not movie_ids:
        return {}

    # One row per movie, one column per user. The model must be fitted on the
    # same kind of vector it is queried with: fitting on user rows and then
    # querying with a movie column mixes two vector spaces of different size
    # and would return user indices, not movie indices.
    item_vectors = ratings_pivot.to_numpy().T

    # Initialize and fit the k-NN model
    model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
    model_knn.fit(item_vectors)

    # +1 because every movie is its own nearest neighbour; never ask for more
    # neighbours than there are movies.
    n_neighbors = min(n_recommendations + 1, len(movie_ids))
    _, neighbor_indices = model_knn.kneighbors(item_vectors, n_neighbors=n_neighbors)

    all_recommendations = {}
    for movie_idx, movie_id in enumerate(movie_ids):
        # Drop the movie itself by index rather than assuming it comes first:
        # a movie with an identical rating vector can tie at distance 0.
        similar_movie_ids = [
            movie_ids[i] for i in neighbor_indices[movie_idx] if i != movie_idx
        ]
        all_recommendations[movie_id] = similar_movie_ids[:n_recommendations]

    return all_recommendations

# Example usage
# NOTE(review): the function builds a dense user x movie matrix; if the full
# ratings table is too large for that, run it on ratings_small - confirm.
all_recommendations = collaborative_filtering(ratings)
# Show only the first few entries; printing the whole dict would flood the output.
print(dict(list(all_recommendations.items())[:5]))

In [None]:
# Using both content and collaborative filtering

def get_recommendations(title, cosine_sim=None, top_n=5):
    """Recommend movies similar to `title`, re-ranked by popularity.

    Parameters
    ----------
    title : str
        Exact title of a movie in the module-level `merged_data` frame. If the
        title occurs more than once, the first occurrence is used.
    cosine_sim : array-like of shape (N, N), optional
        Similarity matrix whose rows/columns follow the row order of
        `merged_data`. When omitted, the module-level `adjusted_cosine_sim`
        is looked up at call time.
    top_n : int, default 5
        Number of titles to return.

    Returns
    -------
    list of str
        Titles of the `top_n` most similar movies, ordered by
        similarity x popularity (highest first).

    Raises
    ------
    ValueError
        If no similarity matrix is passed and `adjusted_cosine_sim` is not defined.
    IndexError
        If `title` is not present in `merged_data`.
    KeyError
        If `merged_data` has no 'movieId' column.

    Notes
    -----
    Reads the module-level `merged_data` and `ratings` frames. Popularity is
    the number of ratings a movie has in `ratings`; movies without any rating
    get popularity 0.
    """
    # Resolve the default at call time: a default argument is evaluated when
    # the function is defined, which fails if the matrix does not exist yet.
    if cosine_sim is None:
        cosine_sim = globals().get('adjusted_cosine_sim')
        if cosine_sim is None:
            raise ValueError(
                "No similarity matrix available: pass cosine_sim or define adjusted_cosine_sim first."
            )

    # Positional index of the movie that matches the title
    matches = np.flatnonzero((merged_data['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Title {title!r} not found in merged_data.")
    idx = int(matches[0])

    # Similarity of every movie to the requested one
    similarities = np.asarray(cosine_sim[idx], dtype=float)

    # Most similar movies first; the movie itself is removed explicitly instead
    # of assuming it is always ranked first.
    order = np.argsort(-similarities, kind='stable')
    movie_indices = order[order != idx][:top_n]

    # Popularity of the candidates, looked up through their movieId: rows of
    # merged_data and entries of the per-movie popularity Series are not in the
    # same order, so they cannot be matched by position.
    # NOTE(review): assumes merged_data carries the 'movieId' column of the
    # links table it was merged with - confirm.
    if 'movieId' not in merged_data.columns:
        raise KeyError("merged_data has no 'movieId' column to match against ratings.")
    popularity = ratings.groupby('movieId')['rating'].count()
    popularity_scores = (
        merged_data['movieId'].iloc[movie_indices].map(popularity).fillna(0).to_numpy()
    )

    # Adjust similarity scores with popularity scores and re-rank
    adjusted_scores = similarities[movie_indices] * popularity_scores
    ranked_indices = movie_indices[np.argsort(-adjusted_scores, kind='stable')]

    titles = merged_data['title'].to_numpy()
    top_movies = [titles[i] for i in ranked_indices]

    return top_movies

In [None]:
# Using both content and collaborative filtering
def get_recommendations_for_all(merged_data, ratings, top_n=5):
    """Compute popularity-adjusted content recommendations for every movie.

    Parameters
    ----------
    merged_data : pandas.DataFrame
        Needs the columns 'title', 'id', 'movieId' and 'combined_features'.
        Missing 'combined_features' values are replaced by '' in place.
    ratings : pandas.DataFrame
        Needs the columns 'movieId' and 'rating'.
    top_n : int, default 5
        Number of recommendations per movie.

    Returns
    -------
    dict
        {title: [title, ...]} with the recommendations ordered by
        similarity x popularity (highest first). Movies that share a title
        share one key; the last one processed wins.

    Notes
    -----
    Reads the module-level `movies_metadata` frame (columns 'id' and
    'sentiment_score'). The similarity matrix is dense, so memory grows with
    the square of the number of rows.
    NOTE(review): assumes merged_data carries the 'movieId' column of the
    links table it was merged with - confirm.
    """
    # Create TF-IDF matrix for the combined text features
    tfidf = TfidfVectorizer(stop_words='english')
    merged_data['combined_features'] = merged_data['combined_features'].fillna('')
    tfidf_matrix = tfidf.fit_transform(merged_data['combined_features'])

    # Compute cosine similarity matrix
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    # Sentiment score of every merged_data row, looked up by movie id:
    # movies_metadata does not have the same rows as merged_data, so its scores
    # cannot be used by position. Movies without a score count as neutral (0).
    sentiment_by_id = (
        movies_metadata.drop_duplicates(subset='id').set_index('id')['sentiment_score']
    )
    sentiment_scores = merged_data['id'].map(sentiment_by_id).fillna(0).to_numpy()

    # Weight every candidate (column) by its own sentiment; scaling the rows
    # would only rescale, zero or reverse each movie's own ranking.
    adjusted_cosine_sim = cosine_sim * sentiment_scores[np.newaxis, :]

    # Popularity (number of ratings) of every merged_data row, matched through
    # movieId. Computed once here instead of once per movie inside the loop.
    popularity = ratings.groupby('movieId')['rating'].count()
    popularity_by_row = merged_data['movieId'].map(popularity).fillna(0).to_numpy()

    titles = merged_data['title'].to_numpy()
    all_recommendations = {}

    # Iterate over all movies
    for idx in range(len(merged_data)):
        similarities = adjusted_cosine_sim[idx]

        # Most similar movies first, with the movie itself removed explicitly
        # (it is not guaranteed to be ranked first).
        order = np.argsort(-similarities, kind='stable')
        movie_indices = order[order != idx][:top_n]

        # Adjust similarity scores with popularity scores and re-rank
        adjusted_scores = similarities[movie_indices] * popularity_by_row[movie_indices]
        ranked_indices = movie_indices[np.argsort(-adjusted_scores, kind='stable')]

        # Store recommendations
        all_recommendations[titles[idx]] = [titles[i] for i in ranked_indices]

    return all_recommendations

# Example usage
all_recommendations = get_recommendations_for_all(merged_data,ratings)
# Show only the first few entries; printing the whole dict would flood the output.
print(dict(list(all_recommendations.items())[:5]))