# Front Matter

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

## Load

In [2]:
cbf = pd.read_csv(r'C:\Users\pedro\Desktop\Github\DS_340-Movies\Small MovieLens\cbf.csv')
cf = pd.read_csv(r'C:\Users\pedro\Desktop\Github\DS_340-Movies\Small MovieLens\cf.csv')

# Content-Based

## Initiating TF-IDF

In [3]:
# Initialize TF-IDF Vectorizer with English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the 'related' column
tfidf_matrix = tfidf.fit_transform(cbf['related'])

# Verify the shape of the TF-IDF matrix
print(f"\nTF-IDF Matrix Shape: {tfidf_matrix.shape}")


TF-IDF Matrix Shape: (9742, 9933)


## Cosine Similarity

In [4]:
# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Display the similarity matrix shape (should be a square matrix: number of movies x number of movies)
print(cosine_sim.shape)

(9742, 9742)


In [5]:
# Create a function that takes in a movie title and gives recommendations
def get_recommendations(title, cosine_sim=cosine_sim):
    # Convert input title to lowercase
    title_cleaned = title.lower()

    # Find the index of the movie in the 'title_clean' column
    idx = cbf[cbf['title_clean'] == title_cleaned].index[0]

    # Get the pairwise similarity scores for all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the 10 most similar movies
    sim_scores = sim_scores[1:11]  # Skip the first movie (itself)

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the titles of the top 10 most similar movies
    return cbf['title'].iloc[movie_indices]

# Example: Get recommendations for a movie

recommendations = get_recommendations('copycat')
print(recommendations)


4805                                      Monster (2003)
1615           Henry: Portrait of a Serial Killer (1986)
296                                    Virtuosity (1995)
8672                                 Killer Movie (2008)
3128                                    Manhunter (1986)
43                           Seven (a.k.a. Se7en) (1995)
4813    Aileen: Life and Death of a Serial Killer (2003)
465                                    Serial Mom (1994)
7940                                   Killer Joe (2011)
3461                                  Others, The (2001)
Name: title, dtype: object


# Collaborative

In [6]:
# Prepare the dataset (from your CF data)
# We'll use only the necessary columns
cf_reduced = cf[['userId', 'movieId', 'rating']]

# Define the format for the dataset using Reader
reader = Reader(rating_scale=(0.5, 5))  # Adjust the rating scale if necessary

# Load the data into Surprise format
data = Dataset.load_from_df(cf_reduced, reader)

# Train-test split (80% train, 20% test)
trainset, testset = train_test_split(data, test_size=0.2)

# Initialize the SVD model for matrix factorization
svd = SVD()

# Train the model on the training set
svd.fit(trainset)

# Test the model on the test set and evaluate performance (RMSE)
predictions = svd.test(testset)
rmse = accuracy.rmse(predictions)

RMSE: 0.8744


In [7]:
# Making movie recommendations for a specific user
def recommend_movies(user_id, model=svd, n_recommendations=10):
    # Get a list of all movie IDs
    all_movie_ids = cf['movieId'].unique()

    # Predict ratings for all movies the user hasn't rated yet
    user_rated_movies = cf[cf['userId'] == user_id]['movieId'].tolist()
    unrated_movies = [movie_id for movie_id in all_movie_ids if movie_id not in user_rated_movies]

    # Predict ratings for unrated movies
    predictions = [model.predict(user_id, movie_id) for movie_id in unrated_movies]

    # Sort predictions by estimated rating in descending order
    predictions.sort(key=lambda x: x.est, reverse=True)

    # Get top N recommendations
    top_n = predictions[:n_recommendations]

    # Return movie IDs and estimated ratings
    recommended_movies = [(cf[cf['movieId'] == pred.iid]['title'].values[0], pred.est) for pred in top_n]

    return recommended_movies

# Example: Get recommendations for user 1
recommended_movies = recommend_movies(user_id=1)
print(recommended_movies)

[('Shawshank Redemption, The (1994)', 5), ('Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)', 5), ('North by Northwest (1959)', 5), ('Casablanca (1942)', 5), ('Streetcar Named Desire, A (1951)', 5), ('Brazil (1985)', 5), ('Lawrence of Arabia (1962)', 5), ('Bridge on the River Kwai, The (1957)', 5), ('Great Escape, The (1963)', 5), ("Kelly's Heroes (1970)", 4.999732925753769)]


In [8]:
cross_val_results = cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8748  0.8712  0.8740  0.8738  0.8747  0.8737  0.0013  
MAE (testset)     0.6735  0.6694  0.6701  0.6732  0.6717  0.6716  0.0016  
Fit time          2.00    2.57    2.43    2.32    3.43    2.55    0.48    
Test time         0.12    0.29    0.28    0.18    0.67    0.31    0.19    


In [9]:
# Display the average RMSE and MAE
print("\nCross-Validation Results:")
print(f"Average RMSE: {cross_val_results['test_rmse'].mean():.4f}")
print(f"Average MAE: {cross_val_results['test_mae'].mean():.4f}")


Cross-Validation Results:
Average RMSE: 0.8737
Average MAE: 0.6716


In [10]:
# Define a parameter grid for SVD
param_grid = {
    'n_factors': [50, 100, 150],  # Number of latent factors
    'n_epochs': [20, 30],         # Number of training epochs
    'lr_all': [0.002, 0.005],     # Learning rate for all parameters
    'reg_all': [0.02, 0.05]       # Regularization term for all parameters
}

# Initialize GridSearchCV with SVD algorithm
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, joblib_verbose=1)

# Perform grid search
print("\nStarting Grid Search for hyperparameter tuning...")
gs.fit(data)
print("Grid Search completed.")

# Extract the best RMSE score
print(f"\nBest RMSE Score: {gs.best_score['rmse']:.4f}")

# Extract the best parameters
print("Best parameters:")
print(gs.best_params['rmse'])


Starting Grid Search for hyperparameter tuning...


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  2.5min


Grid Search completed.

Best RMSE Score: 0.8697
Best parameters:
{'n_factors': 50, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.05}


In [11]:
# Train the optimal model with best parameters
best_params = gs.best_params['rmse']
optimal_svd = SVD(
    n_factors=best_params['n_factors'],
    n_epochs=best_params['n_epochs'],
    lr_all=best_params['lr_all'],
    reg_all=best_params['reg_all']
)

print("\nTraining the optimized SVD model with best parameters...")
optimal_svd.fit(trainset)
print("Optimized model training completed.")


Training the optimized SVD model with best parameters...
Optimized model training completed.


In [12]:
# Predict on the test set using the optimized model
print("\nMaking predictions on the test set with the optimized model...")
optimal_predictions = optimal_svd.test(testset)

# Compute RMSE for the optimized model
optimal_rmse = accuracy.rmse(optimal_predictions)
print(f"Optimal Collaborative Filtering RMSE: {optimal_rmse:.4f}")


Making predictions on the test set with the optimized model...
RMSE: 0.8643
Optimal Collaborative Filtering RMSE: 0.8643


In [13]:
recommended_movies = recommend_movies(user_id=1, model = optimal_svd)
print(recommended_movies)

[('Shawshank Redemption, The (1994)', 5), ('Blade Runner (1982)', 5), ('Rear Window (1954)', 5), ('Streetcar Named Desire, A (1951)', 5), ('Paths of Glory (1957)', 5), ('Lawrence of Arabia (1962)', 5), ('Ran (1985)', 5), ('Grand Day Out with Wallace and Gromit, A (1989)', 5), ('Bridge on the River Kwai, The (1957)', 5), ("Guess Who's Coming to Dinner (1967)", 5)]


# Hybrid System

In [14]:
# Function to predict content-based rating for a given user and movie
def predict_cbf_rating(movie_id, cosine_sim_matrix, cbf_df, target_movie_index, user_movies, n_similar=10):
    # Get cosine similarity scores for the target movie
    sim_scores = list(enumerate(cosine_sim_matrix[target_movie_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices of the top similar movies
    sim_scores = sim_scores[1:n_similar + 1]
    
    # Calculate the weighted sum of user ratings for similar movies
    weighted_sum = 0
    sim_sum = 0
    for idx, sim_score in sim_scores:
        movie_id_similar = cbf_df.iloc[idx]['movieId']
        if movie_id_similar in user_movies:
            weighted_sum += sim_score * user_movies[movie_id_similar]
            sim_sum += sim_score
    
    # Return the weighted average rating
    if sim_sum == 0:
        return np.mean(list(user_movies.values()))  # Default to user's average rating if no similar movies
    return weighted_sum / sim_sum


In [15]:
# Hybrid recommendation: Combine CF and CBF
def hybrid_recommendation(user_id, movie_id, cf_model, cosine_sim_matrix, cbf_df, cf_df, weight_cf=0.7, weight_cbf=0.3):
    # Get the predicted CF rating (Collaborative Filtering)
    cf_prediction = cf_model.predict(user_id, movie_id).est
    
    # Get the predicted CBF rating (Content-Based Filtering)
    # First, get the movie index in CBF
    target_movie_index = cbf_df[cbf_df['movieId'] == movie_id].index[0]
    
    # Get the movies the user has rated
    user_ratings = cf_df[cf_df['userId'] == user_id].set_index('movieId')['rating'].to_dict()
    
    # Get the CBF rating prediction
    cbf_prediction = predict_cbf_rating(movie_id, cosine_sim_matrix, cbf_df, target_movie_index, user_ratings)
    
    # Combine the predictions using the weights
    final_rating = (weight_cf * cf_prediction) + (weight_cbf * cbf_prediction)
    
    return final_rating


In [16]:
def recommend_hybrid_movies(user_id, cf_model, cosine_sim_matrix, cbf_df, cf_df, n_recommendations=10, weight_cf=0.7, weight_cbf=0.3):
    all_movie_ids = cf_df['movieId'].unique()
    user_rated_movies = cf_df[cf_df['userId'] == user_id]['movieId'].tolist()
    unrated_movies = [movie_id for movie_id in all_movie_ids if movie_id not in user_rated_movies]
    
    predictions = []
    
    for movie_id in unrated_movies:
        # Predict the hybrid rating
        predicted_rating = hybrid_recommendation(user_id, movie_id, cf_model, cosine_sim_matrix, cbf_df, cf_df, weight_cf, weight_cbf)
        predictions.append((movie_id, predicted_rating))
    
    # Sort the movies by the predicted hybrid rating and get the top N
    predictions.sort(key=lambda x: x[1], reverse=True)
    top_n_predictions = predictions[:n_recommendations]
    
    # Return the top N recommended movie titles
    return [(cbf_df[cbf_df['movieId'] == movie_id]['title'].values[0], rating) for movie_id, rating in top_n_predictions]




In [17]:
# Example: Get hybrid recommendations for user 1
recommendations = recommend_hybrid_movies(user_id=1, cf_model=optimal_svd, cosine_sim_matrix=cosine_sim, cbf_df=cbf, cf_df=cf)
recommendations

[('Shawshank Redemption, The (1994)', 5.0),
 ('Blade Runner (1982)', 5.0),
 ('Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)',
  4.987950794888464),
 ('Three Billboards Outside Ebbing, Missouri (2017)', 4.980658037046629),
 ('3:10 to Yuma (2007)', 4.950182052835955),
 ('Chinatown (1974)', 4.944970097554062),
 ("Man Bites Dog (C'est arrivé près de chez vous) (1992)", 4.926534814133654),
 ('French Connection, The (1971)', 4.910453528705336),
 ('Thank You for Smoking (2006)', 4.909367885819924),
 ('Bonnie and Clyde (1967)', 4.907585855603942)]

## Validation

In [49]:
weight_cf = 0.4
weight_cbf = 1 - weight_cf

In [50]:
def generate_hybrid_predictions(testset, cf_model, cosine_sim_matrix, cbf_df, cf_df, weight_cf, weight_cbf):
    predictions = []
    for user_id, movie_id, true_rating in testset:
        # Predict the hybrid rating
        pred_rating = hybrid_recommendation(user_id, movie_id, cf_model, cosine_sim_matrix, cbf_df, cf_df, weight_cf, weight_cbf)
        predictions.append((user_id, movie_id, true_rating, pred_rating))
    return predictions
# Example: Generating predictions for a test set
hybrid_predictions = generate_hybrid_predictions(testset, optimal_svd, cosine_sim, cbf, cf, weight_cf, weight_cbf)


### RMSE and MAE

In [51]:

# Modified HybridPrediction class to mimic surprise's Prediction object
class HybridPrediction:
    def __init__(self, uid, iid, r_ui, est):
        self.uid = uid  # User ID
        self.iid = iid  # Movie ID
        self.r_ui = r_ui  # True rating
        self.est = est  # Predicted rating
        self.details = {}  # Can be empty, but required by surprise's accuracy functions

    # Making the object iterable like Surprise's Prediction class
    def __iter__(self):
        return iter((self.uid, self.iid, self.r_ui, self.est, self.details))
    
def calculate_rmse_mae(hybrid_predictions):
    # Create list of HybridPrediction objects
    surprise_predictions = [HybridPrediction(uid, iid, r_ui, est) for (uid, iid, r_ui, est) in hybrid_predictions]

    # RMSE
    rmse = accuracy.rmse(surprise_predictions, verbose=False)

    # MAE
    mae = accuracy.mae(surprise_predictions, verbose=False)

    return rmse, mae

In [52]:
rmse, mae = calculate_rmse_mae(hybrid_predictions)


### F1, Precision, Recall

In [53]:
# Step 1: Prepare y_true and y_pred for binary classification
threshold = 4.5  # Set threshold for relevance

# Generate binary labels based on true ratings and predicted ratings

# Step 2: Calculate Precision, Recall, and F1-Score
def calculate_precision_recall_f1(threshold, predictions): #
    y_true = [int(true_rating >= threshold) for (_, _, true_rating, _) in predictions]  # Actual ratings
    y_pred = [int(pred_rating >= threshold) for (_, _, _, pred_rating) in predictions]  # Predicted ratings
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    return precision, recall, f1
    
precision, recall, f1 = calculate_precision_recall_f1(threshold, hybrid_predictions)

In [54]:
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

RMSE: 0.8896243237162257
MAE: 0.6746912282622457
Precision: 0.7220708446866485
Recall: 0.12226066897347174
F1-Score: 0.20911422371276386
