In [1]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from sklearn.preprocessing import MinMaxScaler
import numpy as np
# Load datasets
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')
links = pd.read_csv('links.csv')

# Display the first few rows
print("Movies:\n", movies.head())
print("\nRatings:\n", ratings.head())
print("\nTags:\n", tags.head())
print("\nLinks:\n", links.head())

print("\nData types:")
print(movies.dtypes)
print(ratings.dtypes)
print(tags.dtypes)
print(links.dtypes)

Movies:
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings:
    userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

Tags:
    userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2    

In [2]:
# Check for missing values
print("Missing values in each dataset:")
print("Movies:\n", movies.isnull().sum())
print("Ratings:\n", ratings.isnull().sum())
print("Tags:\n", tags.isnull().sum())
print("Links:\n", links.isnull().sum())


Missing values in each dataset:
Movies:
 movieId    0
title      0
genres     0
dtype: int64
Ratings:
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
Tags:
 userId       0
movieId      0
tag          0
timestamp    0
dtype: int64
Links:
 movieId    0
imdbId     0
tmdbId     8
dtype: int64


In [3]:
# Drop rows with missing tmdbId
links.dropna(subset=['tmdbId'], inplace=True)
links['tmdbId'] = links['tmdbId'].astype(int)

In [4]:
# Check and drop duplicates
print("Duplicates before dropping:")
print("Movies:", movies.duplicated().sum())
print("Ratings:", ratings.duplicated().sum())
print("Tags:", tags.duplicated().sum())
print("Links:", links.duplicated().sum())

Duplicates before dropping:
Movies: 0
Ratings: 0
Tags: 0
Links: 0


In [5]:
# 5. Basic stats on ratings
print("\nRatings stats:")
print(ratings['rating'].describe())

# 6. Number of unique users and movies
print("\nUnique users:", ratings['userId'].nunique())
print("Unique movies:", ratings['movieId'].nunique())

# Display distribution of number of ratings per user (optional exploration)
ratings_per_user = ratings.groupby('userId').size()
print("Ratings per user statistics:")
print(ratings_per_user.describe())


Ratings stats:
count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

Unique users: 610
Unique movies: 9724
Ratings per user statistics:
count     610.000000
mean      165.304918
std       269.480584
min        20.000000
25%        35.000000
50%        70.500000
75%       168.000000
max      2698.000000
dtype: float64


In [6]:
# Merge movies with ratings for later use
movie_ratings = pd.merge(ratings, movies, on='movieId')
print(movie_ratings.head())


   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  


In [7]:
# Replace '|' with space so TF-IDF can treat each genre as a separate word
movies['genres_clean'] = movies['genres'].str.replace('|', ' ', regex=False)


In [8]:
# Create TF-IDF matrix for genres
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(movies['genres_clean'])

print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (9742, 24)


In [9]:


# Compute cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Display shape (should be 9742 x 9742)
print("Cosine similarity matrix shape:", cosine_sim.shape)


Cosine similarity matrix shape: (9742, 9742)


In [10]:
# Create a reverse mapping: movie title → index
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

def recommend_movies(title, cosine_sim=cosine_sim, n=10):
    # Get index of the movie
    idx = indices.get(title)
    if idx is None:
        return f" Movie '{title}' not found in the dataset."

    # Get similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort movies by similarity score (excluding itself)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:n+1]
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return top N similar movie titles
    return movies['title'].iloc[movie_indices].tolist()


In [11]:
print(recommend_movies("Toy Story (1995)", n=5))


['Antz (1998)', 'Toy Story 2 (1999)', 'Adventures of Rocky and Bullwinkle, The (2000)', "Emperor's New Groove, The (2000)", 'Monsters, Inc. (2001)']


In [12]:
!conda install -c conda-forge scikit-surprise -y


Channels:
 - conda-forge
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [13]:
# Define rating scale and load data from your DataFrame
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split data into training and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [14]:
# Initialize and train the model
model = SVD()
model.fit(trainset)

# Predict on the test set
predictions = model.test(testset)

# Evaluate accuracy
print("RMSE:", accuracy.rmse(predictions))
print("MAE:", accuracy.mae(predictions))


RMSE: 0.8819
RMSE: 0.8818923424849298
MAE:  0.6778
MAE: 0.6777633597960264


In [15]:
def get_top_n_recommendations(user_id, model, ratings_df, movies_df, n=10):
    # Get list of all movie IDs
    all_movie_ids = movies_df['movieId'].unique()
    
    # Get movies the user has already rated
    rated_movie_ids = ratings_df[ratings_df['userId'] == user_id]['movieId'].values
    
    # Filter out rated movies
    unseen_movies = [mid for mid in all_movie_ids if mid not in rated_movie_ids]
    
    # Predict ratings for all unseen movies
    predictions = [model.predict(user_id, movie_id) for movie_id in unseen_movies]
    
    # Sort predictions by estimated rating
    predictions.sort(key=lambda x: x.est, reverse=True)
    
    # Get top n movie IDs
    top_n_movie_ids = [int(pred.iid) for pred in predictions[:n]]
    
    # Get movie titles
    top_n_movies = movies_df[movies_df['movieId'].isin(top_n_movie_ids)]['title'].values
    
    return top_n_movies


In [16]:
# Recommend movies for user ID 1
recommendations = get_top_n_recommendations(1, model, ratings, movies, n=5)
print("Recommended movies for User 1:", recommendations)


Recommended movies for User 1: ['Shawshank Redemption, The (1994)' 'Blade Runner (1982)'
 'Wallace & Gromit: The Best of Aardman Animation (1996)'
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)'
 'Philadelphia Story, The (1940)']


In [22]:
def get_collab_scores(user_id, model, movies_df, ratings_df):
    all_movie_ids = movies_df['movieId'].unique()
    rated_ids = ratings_df[ratings_df['userId'] == user_id]['movieId'].values
    unseen_ids = [mid for mid in all_movie_ids if mid not in rated_ids]

    predictions = [model.predict(user_id, mid) for mid in unseen_ids]
    movie_ids = [int(p.iid) for p in predictions]
    scores = [p.est for p in predictions]

    # Normalize scores
    scaler = MinMaxScaler()
    norm_scores = scaler.fit_transform(np.array(scores).reshape(-1, 1)).flatten()

    return dict(zip(movie_ids, norm_scores))


In [23]:
def get_content_scores(movie_id, cosine_sim, movies_df):
    idx = movies_df[movies_df['movieId'] == movie_id].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Remove itself
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:]
    
    # Normalize
    movie_ids = [movies_df.iloc[i[0]]['movieId'] for i in sim_scores]
    scores = [i[1] for i in sim_scores]

    scaler = MinMaxScaler()
    norm_scores = scaler.fit_transform(np.array(scores).reshape(-1, 1)).flatten()

    return dict(zip(movie_ids, norm_scores))


In [24]:
def hybrid_recommend(user_id, liked_movie_id, model, ratings_df, movies_df, cosine_sim, alpha=0.5, top_n=10):
    collab = get_collab_scores(user_id, model, movies_df, ratings_df)
    content = get_content_scores(liked_movie_id, cosine_sim, movies_df)

    # Combine scores
    combined_scores = {}
    for mid in collab:
        c_score = content.get(mid, 0)
        hybrid_score = alpha * collab[mid] + (1 - alpha) * c_score
        combined_scores[mid] = hybrid_score

    # Sort and get top N
    top_movies = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    top_movie_ids = [mid for mid, score in top_movies]
    return movies_df[movies_df['movieId'].isin(top_movie_ids)]['title'].tolist()


In [25]:
def precision_recall_at_k(recommended, relevant, k=10):
    recommended = recommended[:k]
    hits = len(set(recommended) & set(relevant))
    precision = hits / k
    recall = hits / len(relevant) if relevant else 0
    return precision, recall

def evaluate_hybrid(user_id, liked_movie_id, model, ratings_df, movies_df, cosine_sim, alpha_values):
    results = []
    for alpha in alpha_values:
        recommended_titles = hybrid_recommend(
            user_id=user_id,
            liked_movie_id=liked_movie_id,
            model=model,
            ratings_df=ratings_df,
            movies_df=movies_df,
            cosine_sim=cosine_sim,
            alpha=alpha,
            top_n=10
        )

        recommended_ids = movies_df[movies_df['title'].isin(recommended_titles)]['movieId'].tolist()
        relevant_ids = ratings_df[(ratings_df['userId'] == user_id) & (ratings_df['rating'] >= 4.0)]['movieId'].tolist()

        precision, recall = precision_recall_at_k(recommended_ids, relevant_ids, k=10)
        results.append((alpha, precision, recall))

    return results

In [26]:
alpha_values = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
results = evaluate_hybrid(
    user_id=1,
    liked_movie_id=1,
    model=model,
    ratings_df=ratings,
    movies_df=movies,
    cosine_sim=cosine_sim,
    alpha_values=alpha_values
)

for alpha, precision, recall in results:
    print(f"Alpha: {alpha:.1f} | Precision@10: {precision:.2f} | Recall@10: {recall:.2f}")


Alpha: 0.0 | Precision@10: 0.00 | Recall@10: 0.00
Alpha: 0.2 | Precision@10: 0.00 | Recall@10: 0.00
Alpha: 0.4 | Precision@10: 0.00 | Recall@10: 0.00
Alpha: 0.6 | Precision@10: 0.00 | Recall@10: 0.00
Alpha: 0.8 | Precision@10: 0.00 | Recall@10: 0.00
Alpha: 1.0 | Precision@10: 0.00 | Recall@10: 0.00


In [28]:
recommendations = hybrid_recommend(
    user_id=1,
    liked_movie_id=1,  # Toy Story
    model=model,
    ratings_df=ratings,
    movies_df=movies,
    cosine_sim=cosine_sim,
    alpha=0.6  # Tune this!
)
# 2. Print or display results nicely
print("Hybrid Recommendations:")
for idx, title in enumerate(recommendations, 1):
    print(f"{idx}. {title}")
    


Hybrid Recommendations:
1. Wallace & Gromit: The Best of Aardman Animation (1996)
2. Toy Story 2 (1999)
3. Shrek (2001)
4. Monsters, Inc. (2001)
5. Finding Nemo (2003)
6. Incredibles, The (2004)
7. Kiki's Delivery Service (Majo no takkyûbin) (1989)
8. Howl's Moving Castle (Hauru no ugoku shiro) (2004)
9. Ponyo (Gake no ue no Ponyo) (2008)
10. Fantastic Mr. Fox (2009)
