## Step 1: Importing Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from math import sqrt

import requests

from textblob import TextBlob

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer


## Step 2: Data Preparation

In [2]:
# Load u.data file
ratings_data = pd.read_csv('files/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])

# Load u.item file
columns = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action',
           'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
           'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies_data = pd.read_csv('files/u.item', sep='|', names=columns, encoding='latin-1')

# Extract relevant columns from movies_data
movies_data = movies_data[['movie_id', 'movie_title', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
                           'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
                           'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']]

# Load movies_with_youtube_ids DataFrame
movies_with_youtube_ids = pd.read_csv('files/movies_with_youtube_ids.csv')

# Load comments_data DataFrame
comments_data = pd.read_csv('files/youtube_comments.csv')


In [3]:
ratings_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
movies_data.head()

Unnamed: 0,movie_id,movie_title,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
movies_with_youtube_ids.head()

Unnamed: 0,movie_id,movie_title,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,youtube_video_id
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,v-PjgYDrg70
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,lcOqUE0u1LM
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0hu12MP7b1U
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,t2QcA-KoF5s
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,5Pp64srMAH4


In [6]:
comments_data.head()

Unnamed: 0,youtube_video_id,comment_text
0,v-PjgYDrg70,Hooo
1,lcOqUE0u1LM,"Merveilleux film, je l'ai vu depuis des année..."
2,0hu12MP7b1U,Its a perfect film.
3,t2QcA-KoF5s,Kevin Spacey - winning Oscar Best Actor for th...
4,5Pp64srMAH4,Just finished it and i said to myself after se...


In [7]:
ratings_data.shape

(100000, 4)

In [8]:
movies_data.shape

(1682, 21)

In [9]:
movies_with_youtube_ids.shape

(1682, 22)

In [10]:
comments_data.shape

(1682, 2)

# Step 3: Collaborative filtering-based recommender system
Algorithm used: Matrix factorization

In [11]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(ratings_data, test_size=0.2, random_state=42)

# Create user-item rating matrix for training data
n_users = ratings_data['user_id'].nunique()
n_items = ratings_data['item_id'].nunique()
train_matrix = np.zeros((n_users, n_items))
for row in train_data.itertuples():
    train_matrix[row[1] - 1, row[2] - 1] = row[3]

# Perform Singular Value Decomposition (SVD) on the training matrix
k = 50
U, sigma, Vt = np.linalg.svd(train_matrix)

# Take only the first k columns of U and first k rows of Vt to approximate the original matrix
U_k = U[:, :k]
Vt_k = Vt[:k, :]

# Reconstruct the predicted rating matrix using the approximated matrices
predicted_ratings = np.dot(np.dot(U_k, np.diag(sigma[:k])), Vt_k)

# Predicted ratings matrix may contain negative values, so clip them to the range [0, 5]
predicted_ratings = np.clip(predicted_ratings, 0, 5)

# Evaluation: Calculate Root Mean Squared Error (RMSE) for the test data
test_matrix = np.zeros((n_users, n_items))
for row in test_data.itertuples():
    test_matrix[row[1] - 1, row[2] - 1] = row[3]

# Keep only the non-zero entries to calculate RMSE
non_zero_idx = test_matrix.nonzero()
rmse = sqrt(mean_squared_error(test_matrix[non_zero_idx], predicted_ratings[non_zero_idx]))

print(f"Root Mean Squared Error (RMSE) for Collaborative Filtering: {rmse:.4f}")


Root Mean Squared Error (RMSE) for Collaborative Filtering: 2.8344


### Loading Data and Getting Top 10 Movies with Highest Predicted Ratings



In [12]:
# Load movie titles from u.item and calculate average predicted rating for each movie
movies_info = pd.read_csv('files/u.item', sep='|', encoding='latin-1', header=None, names=['item_id', 'movie_title'], usecols=[0, 1])
movies_info['item_id'] = movies_info['item_id'].astype(int)
movies_info['Average Predicted Rating'] = predicted_ratings.mean(axis=0)

# Sort the movies based on their average predicted ratings in descending order
top_10_movies = movies_info.sort_values(by='Average Predicted Rating', ascending=False).head(10)

# Function to display a DataFrame in a table format
def display_table(data_frame, columns):
    print(data_frame[columns])

# Display the top 10 movies
print("Top 10 Movies with Highest Predicted Ratings for All Users:")
display_table(top_10_movies, ['item_id', 'movie_title', 'Average Predicted Rating'])


Top 10 Movies with Highest Predicted Ratings for All Users:
     item_id                       movie_title  Average Predicted Rating
49        50                  Star Wars (1977)                  2.101643
99       100                      Fargo (1996)                  1.824366
180      181         Return of the Jedi (1983)                  1.731883
257      258                    Contact (1997)                  1.661843
126      127             Godfather, The (1972)                  1.575842
285      286       English Patient, The (1996)                  1.549842
97        98  Silence of the Lambs, The (1991)                  1.505015
0          1                  Toy Story (1995)                  1.467114
287      288                     Scream (1996)                  1.457047
173      174    Raiders of the Lost Ark (1981)                  1.451527


### Function to Get Top N Recommended Movies for a User

In [13]:
# Function to get top N recommended movies for a user
def get_top_n_movies(user_id, n=10):
    user_row = predicted_ratings[user_id - 1]
    sorted_indices = np.argsort(user_row)[::-1]  # Sort in descending order
    top_indices = sorted_indices[:n]
    top_movie_ids = [i + 1 for i in top_indices]  # Movie ids start from 1 in the dataset
    top_movies = movies_info[movies_info['item_id'].isin(top_movie_ids)].copy()
    top_movies['Predicted Rating'] = user_row[top_indices]
    top_movies.reset_index(drop=True, inplace=True)
    return top_movies[['item_id', 'movie_title', 'Predicted Rating']]

# Get top 10 recommended movies for user with ID 1 (you can change the user ID as per your choice)
top_10_movies = get_top_n_movies(user_id=1, n=10)

# Display the top 10 recommended movies in a table format
print("Top 10 Recommended Movies:")
print(top_10_movies)


Top 10 Recommended Movies:
   item_id                             movie_title  Predicted Rating
0       12              Usual Suspects, The (1995)               5.0
1       50                        Star Wars (1977)               5.0
2       96       Terminator 2: Judgment Day (1991)               5.0
3       98        Silence of the Lambs, The (1991)               5.0
4      168  Monty Python and the Holy Grail (1974)               5.0
5      175                           Brazil (1985)               5.0
6      183                            Alien (1979)               5.0
7      202                    Groundhog Day (1993)               5.0
8      216          When Harry Met Sally... (1989)               5.0
9      268                      Chasing Amy (1997)               5.0


### Getting 10 Least Recommended Movies

In [14]:
# Sort the movies based on their average predicted ratings in ascending order
bottom_10_movies = movies_info.sort_values(by='Average Predicted Rating').head(10)

# Display the bottom 10 movies in a table format
print("10 Least Recommended Movies:")
print(bottom_10_movies[['item_id', 'movie_title', 'Average Predicted Rating']])


10 Least Recommended Movies:
      item_id                                   movie_title  \
1681     1682     Scream of Stone (Schrei aus Stein) (1991)   
1351     1352  Shadow of Angels (Schatten der Engel) (1976)   
1363     1364                           Bird of Prey (1996)   
1648     1649                           Big One, The (1997)   
1647     1648                       Niagara, Niagara (1997)   
1639     1640                        Eighth Day, The (1996)   
1432     1433                           Men of Means (1998)   
1459     1460                              Sleepover (1995)   
1636     1637                             Girls Town (1996)   
1492     1493                       Modern Affair, A (1995)   

      Average Predicted Rating  
1681                       0.0  
1351                       0.0  
1363                       0.0  
1648                       0.0  
1647                       0.0  
1639                       0.0  
1432                       0.0  
1459         

## Step 4: Sentiment Analysis on Youtube Movie Comments

In [15]:
# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\neele\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\neele\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\neele\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [16]:
# Function to preprocess and clean text data
def preprocess_text(text):
    # Tokenize the text
    words = word_tokenize(text)
    
    # Remove punctuation and convert to lowercase
    words = [word.lower() for word in words if word.isalnum()]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Join the cleaned words
    cleaned_text = ' '.join(words)
    return cleaned_text

# Function to calculate sentiment scores using TextBlob
def calculate_textblob_sentiment(text):
    text = preprocess_text(text)
    blob = TextBlob(text)
    sentiment = blob.sentiment.polarity
    return sentiment

# Function to calculate sentiment scores using VADER (Valence Aware Dictionary and sEntiment Reasoner)
def calculate_vader_sentiment(text):
    sia = SentimentIntensityAnalyzer()
    sentiment = sia.polarity_scores(text)['compound']
    return sentiment

# Calculate sentiment scores for comments and return average sentiment score
def calculate_average_sentiment(comments, sentiment_function):
    sentiment_scores = []
    for comment in comments:
        sentiment = sentiment_function(comment)
        sentiment_scores.append(sentiment)

    if sentiment_scores:
        average_sentiment = sum(sentiment_scores) / len(sentiment_scores)
    else:
        average_sentiment = 0.0

    return average_sentiment

# Calculate average sentiment scores using TextBlob and add them to the DataFrame
movies_with_sentiment_textblob = movies_with_youtube_ids.copy()
movies_with_sentiment_textblob['Average Sentiment (TextBlob)'] = movies_with_youtube_ids['youtube_video_id'].apply(
    lambda vid_id: calculate_average_sentiment(
        comments_data[comments_data['youtube_video_id'] == vid_id]['comment_text'], calculate_textblob_sentiment)
)

# Calculate average sentiment scores using VADER and add them to the DataFrame
movies_with_sentiment_vader = movies_with_youtube_ids.copy()
movies_with_sentiment_vader['Average Sentiment (VADER)'] = movies_with_youtube_ids['youtube_video_id'].apply(
    lambda vid_id: calculate_average_sentiment(
        comments_data[comments_data['youtube_video_id'] == vid_id]['comment_text'], calculate_vader_sentiment)
)

# Display the sentiment analysis results
print("Movies with Average Sentiment Scores (TextBlob):")
print(movies_with_sentiment_textblob[['movie_title', 'Average Sentiment (TextBlob)']])

print("\nMovies with Average Sentiment Scores (VADER):")
print(movies_with_sentiment_vader[['movie_title', 'Average Sentiment (VADER)']])


Movies with Average Sentiment Scores (TextBlob):
                                    movie_title  Average Sentiment (TextBlob)
0                              Toy Story (1995)                      0.000000
1                              GoldenEye (1995)                      0.000000
2                             Four Rooms (1995)                      1.000000
3                             Get Shorty (1995)                      0.750000
4                                Copycat (1995)                      0.308333
...                                         ...                           ...
1677                          Mat' i syn (1997)                     -0.019048
1678                           B. Monkey (1998)                      0.000000
1679                       Sliding Doors (1998)                      0.100000
1680                        You So Crazy (1994)                      0.285714
1681  Scream of Stone (Schrei aus Stein) (1991)                     -0.400000

[1682 rows x 2

# Step 5: Hybrid Recommender System
Using only TextBlob Sentimental Analysis as the Vader was giving us the Memory Error

In [17]:
# Calculate average sentiment scores using TextBlob and add them to the DataFrame
movies_with_sentiment_textblob = pd.DataFrame({
    'movie_title': movies_with_youtube_ids['movie_title'],
    'Average Sentiment (TextBlob)': movies_with_youtube_ids['youtube_video_id'].apply(
        lambda vid_id: calculate_average_sentiment(
            comments_data[comments_data['youtube_video_id'] == vid_id]['comment_text'], calculate_textblob_sentiment)
    )
})

# Merge sentiment scores into movies_data
movies_data = pd.merge(movies_data, movies_with_sentiment_textblob, on='movie_title', how='left')

# Update the user-item matrix with sentiment scores (adjust the features accordingly)
train_matrix_with_sentiment = np.zeros((n_users, n_items + 1))  # +1 for sentiment score
for row in train_data.itertuples():
    user_idx = row[1] - 1
    item_idx = row[2] - 1
    sentiment_score = movies_data.loc[item_idx, 'Average Sentiment (TextBlob)']
    train_matrix_with_sentiment[user_idx, item_idx] = row[3]
    train_matrix_with_sentiment[user_idx, n_items] = sentiment_score

# Perform Singular Value Decomposition (SVD) on the updated training matrix with sentiment scores
k = 50
U, sigma, Vt = np.linalg.svd(train_matrix_with_sentiment)

# Take only the first k columns of U and first k rows of Vt to approximate the original matrix
U_k = U[:, :k]
Vt_k = Vt[:k, :]

# Reconstruct the predicted rating matrix using the approximated matrices
predicted_ratings_with_sentiment = np.dot(np.dot(U_k, np.diag(sigma[:k])), Vt_k)

# Predicted ratings matrix may contain negative values, so clip them to the range [0, 5]
predicted_ratings_with_sentiment = np.clip(predicted_ratings_with_sentiment, 0, 5)

# Evaluation: Calculate Root Mean Squared Error (RMSE) for the test data
test_matrix_with_sentiment = np.zeros((n_users, n_items))
for row in test_data.itertuples():
    test_matrix_with_sentiment[row[1] - 1, row[2] - 1] = row[3]

# Keep only the non-zero entries to calculate RMSE
non_zero_idx_with_sentiment = test_matrix_with_sentiment.nonzero()
rmse_with_sentiment = sqrt(mean_squared_error(test_matrix_with_sentiment[non_zero_idx_with_sentiment],
                                              predicted_ratings_with_sentiment[non_zero_idx_with_sentiment]))

print(f"RMSE for Hybrid Recommender System with Sentiment Analysis: {rmse_with_sentiment:.4f}")


RMSE for Hybrid Recommender System with Sentiment Analysis: 2.8344
