**Step 1:** Install the Required Libraries


In [1]:
!pip install surprise tensorrec nltk textblob gensim vaderSentiment spacy transformers openai
!python -m spacy download en_core_web_sm


Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting tensorrec
  Downloading tensorrec-0.26.2-py3-none-any.whl.metadata (491 bytes)
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Collecting openai
  Downloading openai-1.47.0-py3-none-any.whl.metadata (24 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting six==1.11.0 (from tensorrec)
  Downloading six-1.11.0-py2.py3-none-any.whl.metadata (1.6 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from ope

**Step 2:** Import Necessary Libraries


In [44]:
import surprise
import tensorrec
import nltk
from textblob import TextBlob
import gensim
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import spacy
from transformers import pipeline
import openai


**1. Surprise:** Basic Collaborative Filtering for Recommendations


In [45]:
from surprise import Dataset, SVD, Reader, accuracy
from surprise.model_selection import train_test_split, GridSearchCV, cross_validate


#Example1: Simple CF model:
print("\nExample1:")
# Automatically download the dataset without prompt
data = Dataset.load_builtin('ml-100k', prompt=False)

# Use SVD for collaborative filtering
algo = SVD()

# Perform cross-validation
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)



#Example2: More Advanced model:
print("\nExample2:")

# Step 1: Load the MovieLens 100k dataset
data = Dataset.load_builtin('ml-100k', prompt=False)

# Step 2: Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Step 3: Use Grid Search for hyperparameter tuning on SVD
param_grid = {
    'n_factors': [50, 100, 150],   # Number of latent factors
    'n_epochs': [20, 30],          # Number of epochs
    'lr_all': [0.002, 0.005],      # Learning rate for all parameters
    'reg_all': [0.02, 0.1]         # Regularization term
}

# Perform grid search
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Step 4: Display the best score and corresponding hyperparameters
print(f"Best RMSE: {gs.best_score['rmse']}")
print(f"Best Parameters: {gs.best_params['rmse']}")

# Step 5: Train the model with the best hyperparameters on the full training set
best_algo = gs.best_estimator['rmse']
best_algo.fit(trainset)

# Step 6: Test the model on the test set
predictions = best_algo.test(testset)

# Evaluate the model performance on test set
print("\nTest Set Performance:")
accuracy.rmse(predictions)
accuracy.mae(predictions)

# Step 7: Cross-validate the best model using 5-fold cross-validation
print("\nCross-validation Performance:")
cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Step 8: Show detailed predictions for test data
print("\nDetailed Predictions for Test Set:")
for pred in predictions[:5]:  # Limit to first 5 predictions for readability
    print(f"User: {pred.uid}, Item: {pred.iid}, True Rating: {pred.r_ui}, Predicted Rating: {pred.est:.2f}")



Example1:
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9406  0.9300  0.9320  0.9380  0.9420  0.9365  0.0047  
MAE (testset)     0.7424  0.7318  0.7373  0.7390  0.7438  0.7388  0.0042  
Fit time          1.39    1.43    1.42    1.41    1.44    1.42    0.02    
Test time         0.12    0.11    0.15    0.13    0.12    0.13    0.01    

Example2:
Best RMSE: 0.9346051755495527
Best Parameters: {'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.1}

Test Set Performance:
RMSE: 0.9244
MAE:  0.7309

Cross-validation Performance:
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9198  0.9331  0.9279  0.9291  0.9182  0.9256  0.0057  
MAE (testset)     0.7311  0.7385  0.7359  0.7355  0.7250  0.7332  0.0047  
Fit time          3.06    3.86    2.87    2.78    3.39    3.19    0.40    


**2. NLTK:** Recommendation with Text Processing Using NLTK


In [46]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from surprise import Dataset, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Download necessary NLTK data
nltk.download('punkt')

# Step 1: Load the built-in MovieLens 100k dataset
data = Dataset.load_builtin('ml-100k')

# Step 2: Movie titles from MovieLens dataset
# (Assuming the dataset includes a file where movie titles are stored)
# You would need to load the 'u.item' file from the dataset, which contains the movie titles.
movie_titles = pd.read_csv('http://files.grouplens.org/datasets/movielens/ml-100k/u.item',
                           sep='|',
                           encoding='latin-1',
                           header=None,
                           usecols=[0, 1],
                           names=['itemId', 'title'])

# Step 3: Tokenize movie titles using NLTK
def tokenize_and_count(title):
    # Tokenize the movie title
    tokens = word_tokenize(title.lower())
    # Count the frequency of each token
    word_freq = Counter(tokens)
    return word_freq

# Apply tokenization and word frequency calculation to movie titles
movie_word_frequencies = {row['itemId']: tokenize_and_count(row['title']) for idx, row in movie_titles.iterrows()}

# Simplified representation: total number of tokens for each movie title
movie_token_counts = {item: sum(freq.values()) for item, freq in movie_word_frequencies.items()}

# Step 4: Prepare the dataset for Surprise
# We do not need to modify the built-in dataset directly, but we can use the token counts later for analysis.

# Step 5: Create train and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Step 6: Define and train the SVD model
model = SVD()
model.fit(trainset)

# Step 7: Predict ratings for the test set
predictions = model.test(testset)

# Step 8: Evaluate the model's performance
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")

# Example of predicting for new user-item pairs (Note: items are referred to by their movieId from the dataset)
new_user_items = [(1, 50), (2, 100)]  # Predicting for User 1 with Movie 50 and User 2 with Movie 100
predictions = [model.predict(uid, iid) for (uid, iid) in new_user_items]

for prediction in predictions:
    print(f"User {prediction.uid} - Movie {prediction.iid}: Predicted Rating = {prediction.est:.2f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


RMSE: 0.9360
MAE:  0.7392
RMSE: 0.94
MAE: 0.74
User 1 - Movie 50: Predicted Rating = 3.53
User 2 - Movie 100: Predicted Rating = 3.53


**3. TextBlob:** Surprise with Sentiment Analysis Using TextBlob


In [48]:
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy  # Correct import for accuracy
from textblob import TextBlob

# Example user-item interactions (rows: users, columns: items)
user_item_interactions = pd.DataFrame({
    'userId': [1, 1, 2, 2, 3],
    'itemId': [1, 2, 2, 3, 1],
    'rating': [5, 3, 4, 2, 4]
})

# Example item reviews (dummy data for demonstration)
item_reviews = {
    1: "I love this product. It is fantastic!",
    2: "This is a great product, very useful.",
    3: "Not bad, but could be improved."
}

# Convert reviews to sentiment scores
def get_sentiment_score(review):
    return TextBlob(review).sentiment.polarity

# Calculate sentiment scores
item_sentiment_scores = {item: get_sentiment_score(review) for item, review in item_reviews.items()}

# Add sentiment scores to user-item interactions
user_item_interactions['sentiment'] = user_item_interactions['itemId'].map(item_sentiment_scores)

# Prepare the dataset for Surprise
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(user_item_interactions[['userId', 'itemId', 'rating']], reader)

# Step 2: Create train and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Step 3: Define and train the model
model = SVD()
model.fit(trainset)

# Step 4: Predict ratings for the test set
predictions = model.test(testset)

# Step 5: Evaluate the model's performance
accuracy.rmse(predictions)
accuracy.mae(predictions)

# Example of predicting for new user-item pairs
new_user_items = [(1, 2), (3, 1)]  # User 1 for Item 2, User 3 for Item 1
predictions = [model.predict(uid, iid) for (uid, iid) in new_user_items]

for prediction in predictions:
    print(f"User {prediction.uid} - Item {prediction.iid}: Predicted Rating = {prediction.est:.2f}")


RMSE: 0.9325
MAE:  0.9325
User 1 - Item 2: Predicted Rating = 3.93
User 3 - Item 1: Predicted Rating = 3.91


**4. Gensim:** Hybrid Recommender System Using KNN and Word2Vec using Gensim


In [49]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import KNNBasic, accuracy
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import numpy as np

# Load the dataset
data = {
    'user_id': [1, 1, 1, 2, 2, 3, 3, 3],
    'item_id': [1, 2, 3, 1, 2, 2, 3, 4],
    'rating': [5, 3, 2, 4, 5, 1, 2, 4],
    'description': [
        'This is the first item description.',
        'Second item description here.',
        'Another description for the third item.',
        'This is the first item description.',
        'Second item description here.',
        'Second item description here.',
        'Another description for the third item.',
        'Fourth item description is different.'
    ]
}

df = pd.DataFrame(data)

# Use Surprise to load the dataset
reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)

# Split the data into training and test sets
trainset, testset = train_test_split(surprise_data, test_size=0.25)

# Train a basic KNN recommender
algo = KNNBasic()
algo.fit(trainset)

# Evaluate the recommender
predictions = algo.test(testset)
print(f"RMSE: {accuracy.rmse(predictions)}")

# Preprocess descriptions for Word2Vec
descriptions = [simple_preprocess(desc) for desc in df['description']]

# Train Word2Vec model
model = Word2Vec(sentences=descriptions, vector_size=50, window=5, min_count=1, workers=4)

# Function to get item vector
def get_item_vector(item_id):
    item_description = df[df['item_id'] == item_id]['description'].values[0]
    return np.mean([model.wv[word] for word in simple_preprocess(item_description) if word in model.wv], axis=0)

# Calculate similarity between items
def calculate_similarity(item_id1, item_id2):
    vector1 = get_item_vector(item_id1)
    vector2 = get_item_vector(item_id2)
    return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))

# Example: Calculate similarity between item 1 and item 2
similarity = calculate_similarity(1, 2)
print(f"Similarity between item 1 and item 2: {similarity}")

# Enhance recommendations using item similarity
def enhanced_recommendations(user_id, top_n=5):
    user_ratings = df[df['user_id'] == user_id]
    item_ids = user_ratings['item_id'].tolist()
    scores = {}

    for item_id in item_ids:
        for other_item_id in df['item_id'].unique():
            if other_item_id != item_id:
                similarity = calculate_similarity(item_id, other_item_id)
                scores[other_item_id] = scores.get(other_item_id, 0) + similarity * user_ratings[user_ratings['item_id'] == item_id]['rating'].values[0]

    recommended_items = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return [item[0] for item in recommended_items]

# Get enhanced recommendations for user 1
recommendations = enhanced_recommendations(1)
print(f"Recommendations for user 1: {recommendations}")


INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:collected 13 word types from a corpus of 41 raw words and 8 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 13 unique words (100.00% of original 13, drops 0)', 'datetime': '2024-09-23T08:28:53.214828', 'gensim': '4.3.3', 'python': '3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 41 word corpus (100.00% of original 41, drops 0)', 'datetime': '2024-09-23T08:28:53.218335', 'gensim': '4.3.3', 'python': '3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}
INFO:gensim.mode

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.4714
RMSE: 0.47140452079103157


INFO:gensim.models.word2vec:EPOCH 0: training on 41 raw words (4 effective words) took 0.0s, 5252 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 41 raw words (2 effective words) took 0.0s, 5028 effective words/s
INFO:gensim.models.word2vec:EPOCH 2: training on 41 raw words (5 effective words) took 0.0s, 9953 effective words/s
INFO:gensim.models.word2vec:EPOCH 3: training on 41 raw words (6 effective words) took 0.0s, 15174 effective words/s
INFO:gensim.models.word2vec:EPOCH 4: training on 41 raw words (3 effective words) took 0.0s, 8784 effective words/s
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'training on 205 raw words (20 effective words) took 0.1s, 262 effective words/s', 'datetime': '2024-09-23T08:28:53.350170', 'gensim': '4.3.3', 'python': '3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'train'}
INFO:gensim.utils:Word2Vec lifecycle event {'params': 'Word2Vec<vocab=13, vector_size=50, a

Similarity between item 1 and item 2: 0.3207181394100189
Recommendations for user 1: [4, 3, 2, 1]


**5. VADER Sentiment Analysis:** VADER Sentiment Analysis for Recommendations

In [50]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from surprise import Dataset, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise import Reader  # Import Reader from surprise
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Step 1: Load the built-in MovieLens 100k dataset
data = Dataset.load_builtin('ml-100k')

# Step 2: Movie titles from MovieLens dataset (you can add reviews if available)
movie_titles = pd.read_csv('http://files.grouplens.org/datasets/movielens/ml-100k/u.item',
                           sep='|',
                           encoding='latin-1',
                           header=None,
                           usecols=[0, 1],
                           names=['itemId', 'title'])

# Step 3: Simulate a review dataset (or you can use actual movie reviews if available)
# For simplicity, we'll assign a fake review text to each movie
fake_reviews = [
    "I love this movie, it's amazing and thrilling!",
    "Not bad, but could have been better.",
    "Terrible movie, not worth watching.",
    "A masterpiece, truly inspiring.",
    "It was okay, nothing special."
]

# Ensure we have a review for each movie by repeating and trimming the reviews list
movie_titles['review'] = fake_reviews * (len(movie_titles) // len(fake_reviews)) + fake_reviews[:len(movie_titles) % len(fake_reviews)]

# Step 4: Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Step 5: Perform sentiment analysis on the reviews
def get_sentiment_score(review):
    sentiment = analyzer.polarity_scores(review)
    return sentiment['compound']  # Use compound score as overall sentiment

movie_titles['sentiment'] = movie_titles['review'].apply(get_sentiment_score)

# Step 6: Map sentiment scores to the user-item interaction data
user_item_interactions = pd.DataFrame({
    'userId': [1, 1, 2, 2, 3],
    'itemId': [1, 2, 2, 3, 1],
    'rating': [5, 3, 4, 2, 4]
})

# Add the sentiment score for each movie as a feature
user_item_interactions['sentiment'] = user_item_interactions['itemId'].map(movie_titles.set_index('itemId')['sentiment'])

# Step 7: Prepare the dataset for Surprise
reader = Reader(rating_scale=(1, 5))  # Define the rating scale
data = Dataset.load_from_df(user_item_interactions[['userId', 'itemId', 'rating']], reader=reader)

# Step 8: Train-test split
trainset, testset = train_test_split(data, test_size=0.2)

# Step 9: Define and train the SVD model
model = SVD()
model.fit(trainset)

# Step 10: Predict ratings for the test set
predictions = model.test(testset)

# Step 11: Evaluate the model's performance
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")

# Example of predicting for new user-item pairs (Note: items are referred to by their movieId from the dataset)
new_user_items = [(1, 50), (2, 100)]  # Predicting for User 1 with Movie 50 and User 2 with Movie 100
predictions = [model.predict(uid, iid) for (uid, iid) in new_user_items]

for prediction in predictions:
    print(f"User {prediction.uid} - Movie {prediction.iid}: Predicted Rating = {prediction.est:.2f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


RMSE: 0.9233
MAE:  0.9233
RMSE: 0.92
MAE: 0.92
User 1 - Movie 50: Predicted Rating = 3.85
User 2 - Movie 100: Predicted Rating = 3.64


**6. spaCy:** Named Entity Recognition (NER) with spaCy for Recommendations

In [51]:
import pandas as pd
import spacy
from surprise import Dataset, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise import Reader
import nltk

# Download necessary NLTK data
nltk.download('punkt')

# Step 1: Load the built-in MovieLens 100k dataset
data = Dataset.load_builtin('ml-100k')

# Step 2: Movie titles from MovieLens dataset (you can add reviews if available)
movie_titles = pd.read_csv('http://files.grouplens.org/datasets/movielens/ml-100k/u.item',
                           sep='|',
                           encoding='latin-1',
                           header=None,
                           usecols=[0, 1],
                           names=['itemId', 'title'])

# Step 3: Simulate a review dataset (or use actual movie reviews if available)
# For simplicity, we'll assign a fake review text to each movie
fake_reviews = [
    "I love this movie, it's amazing and thrilling!",
    "Not bad, but could have been better.",
    "Terrible movie, not worth watching.",
    "A masterpiece, truly inspiring.",
    "It was okay, nothing special."
]

# Ensure we have a review for each movie by repeating and trimming the reviews list
movie_titles['review'] = fake_reviews * (len(movie_titles) // len(fake_reviews)) + fake_reviews[:len(movie_titles) % len(fake_reviews)]

# Step 4: Load spaCy's pre-trained model for English NER
nlp = spacy.load('en_core_web_sm')

# Step 5: Perform Named Entity Recognition (NER) on reviews
def extract_named_entities(review):
    doc = nlp(review)
    entities = [ent.label_ for ent in doc.ents]  # Get entity labels (like 'PERSON', 'ORG', etc.)
    return " ".join(entities) if entities else "NONE"  # Return entities or 'NONE' if no entities found

# Apply NER to each review and store the result in a new column
movie_titles['entities'] = movie_titles['review'].apply(extract_named_entities)

# Step 6: Map extracted entities to the user-item interaction data
user_item_interactions = pd.DataFrame({
    'userId': [1, 1, 2, 2, 3],
    'itemId': [1, 2, 2, 3, 1],
    'rating': [5, 3, 4, 2, 4]
})

# Add the extracted named entities for each movie as a feature
user_item_interactions['entities'] = user_item_interactions['itemId'].map(movie_titles.set_index('itemId')['entities'])

# Step 7: Prepare the dataset for Surprise
reader = Reader(rating_scale=(1, 5))  # Define the rating scale
data = Dataset.load_from_df(user_item_interactions[['userId', 'itemId', 'rating']], reader=reader)

# Step 8: Train-test split
trainset, testset = train_test_split(data, test_size=0.2)

# Step 9: Define and train the SVD model
model = SVD()
model.fit(trainset)

# Step 10: Predict ratings for the test set
predictions = model.test(testset)

# Step 11: Evaluate the model's performance
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")

# Example of predicting for new user-item pairs (Note: items are referred to by their movieId from the dataset)
new_user_items = [(1, 50), (2, 100)]  # Predicting for User 1 with Movie 50 and User 2 with Movie 100
predictions = [model.predict(uid, iid) for (uid, iid) in new_user_items]

for prediction in predictions:
    print(f"User {prediction.uid} - Movie {prediction.iid}: Predicted Rating = {prediction.est:.2f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


RMSE: 0.5827
MAE:  0.5827
RMSE: 0.58
MAE: 0.58
User 1 - Movie 50: Predicted Rating = 3.58
User 2 - Movie 100: Predicted Rating = 3.39


**7. Transformers from Hugging Face:** Sentiment Analysis with Transformers for Recommendation

In [52]:
import pandas as pd
from transformers import pipeline
from surprise import Dataset, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise import Reader

# Step 1: Load the built-in MovieLens 100k dataset
data = Dataset.load_builtin('ml-100k')

# Step 2: Movie titles from MovieLens dataset (you can add reviews if available)
movie_titles = pd.read_csv('http://files.grouplens.org/datasets/movielens/ml-100k/u.item',
                           sep='|',
                           encoding='latin-1',
                           header=None,
                           usecols=[0, 1],
                           names=['itemId', 'title'])

# Step 3: Simulate a review dataset (or use actual movie reviews if available)
fake_reviews = [
    "I love this movie, it's amazing and thrilling!",
    "Not bad, but could have been better.",
    "Terrible movie, not worth watching.",
    "A masterpiece, truly inspiring.",
    "It was okay, nothing special."
]

# Ensure we have a review for each movie by repeating and trimming the reviews list
movie_titles['review'] = fake_reviews * (len(movie_titles) // len(fake_reviews)) + fake_reviews[:len(movie_titles) % len(fake_reviews)]

# Step 4: Use Hugging Face Transformers for text classification
classifier = pipeline("sentiment-analysis")

# Step 5: Perform sentiment classification on the reviews
def classify_sentiment(review):
    result = classifier(review)[0]  # Get the first result
    return result['label']  # Return the sentiment label (e.g., 'POSITIVE' or 'NEGATIVE')

# Apply sentiment classification to each review and store the result in a new column
movie_titles['sentiment'] = movie_titles['review'].apply(classify_sentiment)

# Step 6: Map sentiment scores to the user-item interaction data
user_item_interactions = pd.DataFrame({
    'userId': [1, 1, 2, 2, 3],
    'itemId': [1, 2, 2, 3, 1],
    'rating': [5, 3, 4, 2, 4]
})

# Add the sentiment classification for each movie as a feature
user_item_interactions['sentiment'] = user_item_interactions['itemId'].map(movie_titles.set_index('itemId')['sentiment'])

# Step 7: Prepare the dataset for Surprise
reader = Reader(rating_scale=(1, 5))  # Define the rating scale
data = Dataset.load_from_df(user_item_interactions[['userId', 'itemId', 'rating']], reader=reader)

# Step 8: Train-test split
trainset, testset = train_test_split(data, test_size=0.2)

# Step 9: Define and train the SVD model
model = SVD()
model.fit(trainset)

# Step 10: Predict ratings for the test set
predictions = model.test(testset)

# Step 11: Evaluate the model's performance
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")

# Example of predicting for new user-item pairs (Note: items are referred to by their movieId from the dataset)
new_user_items = [(1, 50), (2, 100)]  # Predicting for User 1 with Movie 50 and User 2 with Movie 100
predictions = [model.predict(uid, iid) for (uid, iid) in new_user_items]

for prediction in predictions:
    print(f"User {prediction.uid} - Movie {prediction.iid}: Predicted Rating = {prediction.est:.2f}")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



RMSE: 0.7640
MAE:  0.7640
RMSE: 0.76
MAE: 0.76
User 1 - Movie 50: Predicted Rating = 3.59
User 2 - Movie 100: Predicted Rating = 3.37
