Task 1: Create a Recommendation system

Parse movies.csv and create a content based recommendation system using TfidfVectorizer and cosine_similarity from sklearn

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to generate TF-IDF matrix in smaller chunks
def generate_tfidf_chunks(data, chunk_size):
    tfidf = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)
    start = 0
    while start < len(data):
        end = min(start + chunk_size, len(data))
        tfidf_matrix_chunk = tfidf.fit_transform(data[start:end])
        feature_names = tfidf.get_feature_names_out()
        yield tfidf_matrix_chunk, feature_names, start, end
        start = end

# Load movies data
movies = pd.read_csv('movies.csv')

# Compute cosine similarity
def compute_cosine_similarity(data, chunk_size):
    cosine_sim_chunks = []
    row_indices_chunks = []
    feature_names_list = []
    for tfidf_matrix_chunk, chunk_feature_names, start, end in generate_tfidf_chunks(data, chunk_size):
        cosine_sim_chunk = cosine_similarity(tfidf_matrix_chunk)
        cosine_sim_chunks.append(cosine_sim_chunk)
        row_indices_chunks.append((start, end))
        feature_names_list.extend(chunk_feature_names)
    max_len = max(len(chunk) for chunk in cosine_sim_chunks)
    # Adjust row indices for each chunk
    for i, (start, end) in enumerate(row_indices_chunks):
        row_indices_chunks[i] = (start, end, max_len)
    # Merge cosine similarity matrices
    cosine_sim = np.zeros((sum(end - start for start, end, _ in row_indices_chunks), max_len))
    current_row = 0
    for cosine_sim_chunk, (_, _, max_len) in zip(cosine_sim_chunks, row_indices_chunks):
        rows, cols = cosine_sim_chunk.shape
        cosine_sim[current_row:current_row+rows, :cols] = cosine_sim_chunk
        current_row += rows
    return cosine_sim, feature_names_list, row_indices_chunks

cosine_sim, feature_names, row_indices = compute_cosine_similarity(movies['genres'], chunk_size=1000)

# Function to recommend movies based on similarity
def content_recommendations(title, movies_data, cosine_sim=cosine_sim, feature_names=feature_names, row_indices=row_indices):
    idx = movies_data.loc[movies_data['title'] == title].index[0]
    for start, end, _ in row_indices:
        if idx >= start and idx < end:
            adjusted_idx = idx - start
            break
    sim_scores = list(enumerate(cosine_sim[adjusted_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies_data['title'].iloc[movie_indices]

# Example usage:
content_recommendations('Toy Story (1995)', movies)


551                Pagemaster, The (1994)
650      James and the Giant Peach (1996)
12                           Balto (1995)
661                      Space Jam (1996)
55         Kids of the Round Table (1995)
621        All Dogs Go to Heaven 2 (1996)
1                          Jumanji (1995)
59     Indian in the Cupboard, The (1995)
124     NeverEnding Story III, The (1994)
986       Escape to Witch Mountain (1975)
Name: title, dtype: object

Parse ratings.csv and create a collaborative filtering based recommendation system using Surprise or Librecommender

In [2]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162992 sha256=2ae6626274be70e488522f6993c3aaa5f2ca215b5648a8bf3d3fe387f763ef27
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [3]:
import pandas as pd
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

try:
    ratings = pd.read_csv('ratings.csv')
except pd.errors.ParserError as e:
    print(f"ParserError: {e}. Attempting to skip problematic lines.")
    ratings = pd.read_csv('ratings.csv', error_bad_lines=False)

# Convert ratings to numeric, treating non-numeric values as NaN
ratings['rating'] = pd.to_numeric(ratings['rating'], errors='coerce')

# Drop rows with NaN values in rating column
ratings.dropna(subset=['rating'], inplace=True)

# Define the rating scale
reader = Reader(rating_scale=(1, 5))

# Create the dataset
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split the dataset into train and test sets
trainset, testset = train_test_split(data, test_size=0.2)
# Load ratings data
ratings = pd.read_csv('ratings.csv')


# Choose a collaborative filtering algorithm (SVD in this case)
model = SVD()

# Train the model
model.fit(trainset)

# Make predictions
predictions = model.test(testset)

# Evaluate the model
accuracy.rmse(predictions)

# Load movies data
movies = pd.read_csv('movies.csv')

# Merge ratings and movies data on movieId
ratings_with_titles = pd.merge(ratings, movies[['movieId', 'title']], on='movieId')

# Function to get recommendations for a given user
def get_recommendations_for_user(user_id, model, n=10):
    # Get all movie ids
    all_movie_ids = ratings['movieId'].unique()
    # Get movie ids rated by the user
    rated_movie_ids = ratings_with_titles[ratings_with_titles['userId'] == user_id]['movieId']
    # Get movie ids not rated by the user
    unrated_movie_ids = list(set(all_movie_ids) - set(rated_movie_ids))
    # Predict ratings for unrated movies
    unrated_movies = [(user_id, movie_id, 3) for movie_id in unrated_movie_ids]
    predictions = model.test(unrated_movies)
    # Sort predictions by estimated rating
    sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)
    # Get top n recommended movie ids
    top_n_movie_ids = [pred.iid for pred in sorted_predictions[:n]]
    # Get movie titles corresponding to recommended movie ids
    top_n_movie_titles = movies[movies['movieId'].isin(top_n_movie_ids)]['title']
    return top_n_movie_titles

# Example usage:
user_id = 1  # Replace with the desired user id
recommendations = get_recommendations_for_user(user_id, model)
print(recommendations)

RMSE: 0.8086
1167                            Princess Bride, The (1987)
1190                        Godfather: Part II, The (1974)
2233            Life Is Beautiful (La Vita è bella) (1997)
5508     Spirited Away (Sen to Chihiro no kamikakushi) ...
10896                          Little Miss Sunshine (2006)
12928    Dear Zachary: A Letter to a Son About His Fath...
30506                        Ghost in the Shell 2.0 (2008)
49559                                Blue Planet II (2017)
58237                                    Twin Peaks (1989)
60095                                      Parasite (2019)
Name: title, dtype: object


Task 2: Prediction

Generate 10 recommendations using:
content-based system using the user's historical movie preferences.
collaborative filtering system using user’s ratings

In [4]:
import csv
import pandas as pd
from io import StringIO

# Open the ratings file and skip problematic lines
lines = []
with open('ratings.csv', 'r') as file:
    reader = csv.reader(file)
    next(reader)  # Skip the header row
    for line in reader:
        if len(line) == 4:  # Ensure each line has the expected number of fields
            lines.append(line)

# Create a DataFrame from the remaining lines
ratings_data = pd.DataFrame(lines, columns=['userId', 'movieId', 'rating', 'timestamp'])

# Convert data types if needed
ratings_data['userId'] = ratings_data['userId'].astype(int)
ratings_data['movieId'] = ratings_data['movieId'].astype(int)
ratings_data['rating'] = ratings_data['rating'].astype(float)
ratings_data['timestamp'] = pd.to_datetime(ratings_data['timestamp'], unit='s')


import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load movies data
movies_data = pd.read_csv('movies.csv')

from surprise import Dataset, Reader
from surprise import SVD

# Load ratings data
ratings_data = pd.read_csv('ratings.csv')

# Define the rating scale
reader = Reader(rating_scale=(1, 5))

# Create the dataset
data = Dataset.load_from_df(ratings_data[['userId', 'movieId', 'rating']], reader)

# Choose a collaborative filtering algorithm (SVD in this case)
model = SVD()

# Train the model
trainset = data.build_full_trainset()
model.fit(trainset)

# Function to get collaborative filtering recommendations for a given user
def collaborative_filtering_recommendations(user_id, model, movies_data, n=10):
    # Get all movie ids
    all_movie_ids = movies_data['movieId'].unique()
    # Get movie ids rated by the user
    rated_movie_ids = ratings_data[ratings_data['userId'] == user_id]['movieId']
    # Get movie ids not rated by the user
    unrated_movie_ids = list(set(all_movie_ids) - set(rated_movie_ids))
    # Predict ratings for unrated movies
    predictions = [model.predict(user_id, movie_id) for movie_id in unrated_movie_ids]
    # Sort predictions by estimated rating
    sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)
    # Get top n recommended movie ids
    top_n_movie_ids = [pred.iid for pred in sorted_predictions[:n]]
    # Get movie titles corresponding to recommended movie ids
    top_n_movie_titles = movies_data[movies_data['movieId'].isin(top_n_movie_ids)]['title']
    return top_n_movie_titles

# Example usage:
user_id = 3  # Replace with the desired user id
collaborative_filtering_recommendations = collaborative_filtering_recommendations(user_id, model, movies_data)
print("Collaborative Filtering Recommendations:")
print(collaborative_filtering_recommendations)


  ratings_data['timestamp'] = pd.to_datetime(ratings_data['timestamp'], unit='s')


Collaborative Filtering Recommendations:
1                            Jumanji (1995)
2                   Grumpier Old Men (1995)
3                  Waiting to Exhale (1995)
4        Father of the Bride Part II (1995)
5                               Heat (1995)
6                            Sabrina (1995)
7                       Tom and Huck (1995)
8                       Sudden Death (1995)
9                          GoldenEye (1995)
28162               Jesus liebt mich (2012)
Name: title, dtype: object


In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Function to generate TF-IDF matrix in smaller chunks
def generate_tfidf_chunks(data, chunk_size):
    tfidf = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)
    start = 0
    while start < len(data):
        end = min(start + chunk_size, len(data))
        tfidf_matrix_chunk = tfidf.fit_transform(data[start:end])
        feature_names = tfidf.get_feature_names_out()
        yield tfidf_matrix_chunk, feature_names, start, end
        start = end

# Compute cosine similarity
def compute_cosine_similarity(data, chunk_size):
    cosine_sim_chunks = []
    row_indices_chunks = []
    feature_names_list = []
    for tfidf_matrix_chunk, chunk_feature_names, start, end in generate_tfidf_chunks(data, chunk_size):
        cosine_sim_chunk = cosine_similarity(tfidf_matrix_chunk)
        cosine_sim_chunks.append(cosine_sim_chunk)
        row_indices_chunks.append((start, end))
        feature_names_list.extend(chunk_feature_names)
    max_len = max(len(chunk) for chunk in cosine_sim_chunks)
    # Adjust row indices for each chunk
    for i, (start, end) in enumerate(row_indices_chunks):
        row_indices_chunks[i] = (start, end, max_len)
    # Merge cosine similarity matrices
    cosine_sim = np.zeros((sum(end - start for start, end, _ in row_indices_chunks), max_len))
    current_row = 0
    for cosine_sim_chunk, (_, _, max_len) in zip(cosine_sim_chunks, row_indices_chunks):
        rows, cols = cosine_sim_chunk.shape
        cosine_sim[current_row:current_row+rows, :cols] = cosine_sim_chunk
        current_row += rows
    return cosine_sim, feature_names_list, row_indices_chunks

# Function to recommend movies based on similarity
def content_recommendations(title, movies_data, cosine_sim, feature_names, row_indices):
    idx = movies_data.loc[movies_data['title'] == title].index[0]
    for start, end, _ in row_indices:
        if idx >= start and idx < end:
            adjusted_idx = idx - start
            break
    sim_scores = list(enumerate(cosine_sim[adjusted_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies_data['title'].iloc[movie_indices]

# Load movies data
movies = pd.read_csv('movies.csv')

# Compute cosine similarity for content-based recommendation
cosine_sim, feature_names, row_indices = compute_cosine_similarity(movies['genres'], chunk_size=1000)

# Load ratings data
ratings = pd.read_csv('ratings.csv')

# Define the rating scale
reader = Reader(rating_scale=(1, 5))

# Create the dataset
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split the dataset into train and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Choose a collaborative filtering algorithm (SVD in this case)
model = SVD()

# Train the model
model.fit(trainset)

# Load movies data again
movies = pd.read_csv('movies.csv')

# Merge ratings and movies data on movieId
ratings_with_titles = pd.merge(ratings, movies[['movieId', 'title']], on='movieId')

# Function to get recommendations for a given user using collaborative filtering
def get_collab_filtering_recommendations(user_id, model, n=10):
    all_movie_ids = ratings['movieId'].unique()
    rated_movie_ids = ratings_with_titles[ratings_with_titles['userId'] == user_id]['movieId']
    unrated_movie_ids = list(set(all_movie_ids) - set(rated_movie_ids))
    unrated_movies = [(user_id, movie_id, 3) for movie_id in unrated_movie_ids]
    predictions = model.test(unrated_movies)
    sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)
    top_n_movie_ids = [pred.iid for pred in sorted_predictions[:n]]
    top_n_movie_titles = movies[movies['movieId'].isin(top_n_movie_ids)]['title']
    return top_n_movie_titles

# Function to get recommendations for a given user using content-based filtering
def get_content_based_recommendations(user_ratings, movies_data, cosine_sim, feature_names, row_indices, n=10):
    top_rated_movies = user_ratings[user_ratings['rating'] >= 4]['movieId']
    recommended_movies = []
    for movie_id in top_rated_movies:
        movie_title = movies_data[movies_data['movieId'] == movie_id]['title'].iloc[0]
        recommendations = content_recommendations(movie_title, movies_data, cosine_sim, feature_names, row_indices)
        recommended_movies.extend(recommendations)
    return recommended_movies[:n]

# Example usage:
user_id = 1  # Replace with the desired user id
user_ratings = ratings_with_titles[ratings_with_titles['userId'] == user_id]
content_based_recommendations = get_content_based_recommendations(user_ratings, movies, cosine_sim, feature_names, row_indices)
collab_filtering_recommendations = get_collab_filtering_recommendations(user_id, model)
print("Content-based recommendations:")
print(content_based_recommendations)
print("\nCollaborative filtering recommendations:")
print(collab_filtering_recommendations)


Content-based recommendations:
['Fargo (1996)', 'Get Shorty (1995)', 'Taxi Driver (1976)', 'Amateur (1994)', 'Kiss of Death (1995)', 'Fresh (1994)', 'Guilty as Sin (1993)', 'Killing Zoe (1994)', 'Perfect World, A (1993)', 'Trial by Jury (1994)']

Collaborative filtering recommendations:
0                      Toy Story (1995)
1                        Jumanji (1995)
2               Grumpier Old Men (1995)
3              Waiting to Exhale (1995)
4    Father of the Bride Part II (1995)
5                           Heat (1995)
6                        Sabrina (1995)
7                   Tom and Huck (1995)
8                   Sudden Death (1995)
9                      GoldenEye (1995)
Name: title, dtype: object


Task 3: Evaluation

Provide Metrics to judge recommendation systems

Compare and contrast the the two systems

In [37]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd

# Read the ratings.csv file line by line and handle parsing errors manually
ratings_data = []
with open('ratings.csv', 'r', encoding='utf-8') as file:
    for line in file:
        try:
            # Try to split the line into fields
            fields = line.strip().split(',')
            if len(fields) == 4:
                # If there are 4 fields, append the data to the ratings_data list
                ratings_data.append(fields)
            else:
                # If there are not 4 fields, skip the line and print a warning
                print(f"Skipped line: {line.strip()}: Expected 4 fields, found {len(fields)}")
        except Exception as e:
            # If there's any other error, skip the line and print the exception
            print(f"Skipped line: {line.strip()}: {e}")

# Convert the ratings_data list to a DataFrame
ratings_data = pd.DataFrame(ratings_data[1:], columns=ratings_data[0])

# Now you can proceed with the rest of your code using the ratings_data DataFrame
import pandas as pd

# Define the file path
file_path = 'ratings.csv'

# Load data with error_bad_lines parameter to skip lines with parsing errors
try:
    ratings_data = pd.read_csv(file_path)
except pd.errors.ParserError as e:
    # Extract the line number from the exception message
    error_message = str(e)
    line_number = error_message.split(':')[0].split()[-1]
    print(f"Skipped line: {line_number}: {e}")
    print(f"Attempting to skip problematic lines.")
    # Initialize an empty DataFrame
    ratings_data = pd.DataFrame()

# Now you can proceed with the rest of your code using the ratings_data DataFrame
import pandas as pd

# Define the file path
file_path = 'ratings.csv'

# Read the file line by line, skipping the problematic line
with open(file_path, 'r') as file:
    lines = file.readlines()
    lines = [line.strip() for line in lines if len(line.split(',')) == 4]

# Write the filtered lines to a new file
filtered_file_path = 'filtered_ratings.csv'
with open(filtered_file_path, 'w') as file:
    file.write('\n'.join(lines))

# Load the filtered data into a DataFrame
ratings_data = pd.read_csv(filtered_file_path)

# Now you can proceed with the rest of your code using the ratings_data DataFrame

# Load data
movies_data = pd.read_csv('movies.csv')
ratings_data = pd.read_csv('ratings.csv')

# Define functions for collaborative filtering
def evaluate_collab_filtering(user_id, model, actual_ratings, movies_data):
    # Get recommended movies for the user
    recommended_movie_ids = model.recommend(user_id)

    # Get actual ratings given by the user
    actual_ratings = actual_ratings[actual_ratings['userId'] == user_id]

    # Filter recommended movies to include only those present in actual ratings
    recommended_movie_ids = [movie_id for movie_id in recommended_movie_ids if movie_id in actual_ratings['movieId'].values]

    # If no common movies found, return None for metrics
    if not recommended_movie_ids:
        print("No common movies between actual ratings and recommendations.")
        return None, None, None

    # Convert actual ratings to binary for evaluation
    actual_ratings['rating'] = actual_ratings['rating'].apply(lambda x: 1 if x >= 3.5 else 0)
    actual_ratings = actual_ratings.set_index('movieId')

    # Get predicted ratings for recommended movies
    predicted_ratings = pd.DataFrame({'movieId': recommended_movie_ids, 'predicted_rating': [1]*len(recommended_movie_ids)})
    predicted_ratings = predicted_ratings.set_index('movieId')

    # Merge actual and predicted ratings
    merged_ratings = actual_ratings.join(predicted_ratings, how='outer').fillna(0)

    # Compute evaluation metrics
    precision = precision_score(merged_ratings['rating'], merged_ratings['predicted_rating'])
    recall = recall_score(merged_ratings['rating'], merged_ratings['predicted_rating'])
    f1 = f1_score(merged_ratings['rating'], merged_ratings['predicted_rating'])

    return precision, recall, f1

# Define functions for content-based filtering
def evaluate_content_based(user_id, actual_ratings, movies_data, cosine_sim, feature_names, row_indices, n=10):
    # Get recommended movies for the user
    recommended_movie_ids = content_based_recommendation(user_id, cosine_sim, feature_names, row_indices, n)

    # Get actual ratings given by the user
    actual_ratings = actual_ratings[actual_ratings['userId'] == user_id]

    # Filter recommended movies to include only those present in actual ratings
    recommended_movie_ids = [movie_id for movie_id in recommended_movie_ids if movie_id in actual_ratings['movieId'].values]

    # If no common movies found, return None for metrics
    if not recommended_movie_ids:
        print("No common movies between actual ratings and recommendations.")
        return None, None, None

    # Convert actual ratings to binary for evaluation
    actual_ratings['rating'] = actual_ratings['rating'].apply(lambda x: 1 if x >= 3.5 else 0)
    actual_ratings = actual_ratings.set_index('movieId')

    # Get predicted ratings for recommended movies
    predicted_ratings = pd.DataFrame({'movieId': recommended_movie_ids, 'predicted_rating': [1]*len(recommended_movie_ids)})
    predicted_ratings = predicted_ratings.set_index('movieId')

    # Merge actual and predicted ratings
    merged_ratings = actual_ratings.join(predicted_ratings, how='outer').fillna(0)

    # Compute evaluation metrics
    precision = precision_score(merged_ratings['rating'], merged_ratings['predicted_rating'])
    recall = recall_score(merged_ratings['rating'], merged_ratings['predicted_rating'])
    f1 = f1_score(merged_ratings['rating'], merged_ratings['predicted_rating'])

    return precision, recall, f1

# Example usage:

# Evaluate collaborative filtering
user_id = 1  # Example user ID
model = CollaborativeFilteringModel()  # Assuming you have a collaborative filtering model
actual_ratings = ratings_data[['userId', 'movieId', 'rating']]  # Assuming ratings_data has columns 'userId', 'movieId', 'rating'
collab_precision, collab_recall, collab_f1 = evaluate_collab_filtering(user_id, model, actual_ratings, movies_data)
print("Collaborative Filtering Metrics:")
print("Precision:", collab_precision)
print("Recall:", collab_recall)
print("F1-score:", collab_f1)

# Evaluate content-based filtering
user_id = 1  # Example user ID
cosine_sim = compute_cosine_similarity(tfidf_matrix)  # Assuming you have computed cosine similarity matrix
feature_names = tfidf_vectorizer.get_feature_names()  # Assuming you have tfidf_vectorizer
row_indices = movies_data['movieId'].values  # Assuming you have row indices for movies in cosine similarity matrix
content_precision, content_recall, content_f1 = evaluate_content_based(user_id, actual_ratings, movies_data, cosine_sim, feature_names, row_indices)
print("\nContent-based Filtering Metrics:")
print("Precision:", content_precision)
print("Recall:", content_recall)
print("F1-score:", content_f1)


Skipped line: 332: Expected 4 fields, found 1
Skipped line: 44101197921: Expected 4 fields, found 1
Skipped line: 1632368879: Expected 4 fields, found 1
Skipped line: 17583,217348,97306,3.5,1497173362: Expected 4 fields, found 5
Skipped line: 25388,3317,4.0,1171217802,3108,5.0,1341360166: Expected 4 fields, found 7
Skipped line: 5,1105433970: Expected 4 fields, found 2
Skipped line: 45511,797,5.0,842.5,1527883146: Expected 4 fields, found 5
Skipped line: error: Error tokenizing data. C error: Expected 4 fields in line 2802413, saw 5

Attempting to skip problematic lines.


ParserError: Error tokenizing data. C error: Expected 4 fields in line 2802413, saw 5


In [42]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Define collaborative filtering recommendation system
def collaborative_filtering(user_id, model, actual_ratings, movies_data):
    all_movie_ids = ratings['movieId'].unique()
    rated_movie_ids = ratings_with_titles[ratings_with_titles['userId'] == user_id]['movieId']
    unrated_movie_ids = list(set(all_movie_ids) - set(rated_movie_ids))
    unrated_movies = [(user_id, movie_id, 3) for movie_id in unrated_movie_ids]
    predictions = model.test(unrated_movies)
    sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)
    top_n_movie_ids = [pred.iid for pred in sorted_predictions[:n]]
    top_n_movie_titles = movies[movies['movieId'].isin(top_n_movie_ids)]['title']
    pass

# Define content-based filtering recommendation system
def content_based_filtering(user_id, actual_ratings, movies_data, cosine_sim, feature_names, row_indices):
    # Placeholder for content-based filtering implementation
    top_rated_movies = user_ratings[user_ratings['rating'] >= 4]['movieId']
    recommended_movies = []
    for movie_id in top_rated_movies:
        movie_title = movies_data[movies_data['movieId'] == movie_id]['title'].iloc[0]
        recommendations = content_recommendations(movie_title, movies_data, cosine_sim, feature_names, row_indices)
        recommended_movies.extend(recommendations)
    pass

# Define evaluation function for recommendation systems
def evaluate_recommendation_system(actual_ratings, recommended_movies):
    """
    Evaluate a recommendation system using precision, recall, and F1-score.

    Args:
    actual_ratings (list): List of actual ratings given by the user.
    recommended_movies (list): List of recommended movies by the system.

    Returns:
    precision (float): Precision score.
    recall (float): Recall score.
    f1 (float): F1-score.
    """
    # Convert recommended movies to a binary array
    recommended_binary = np.isin(actual_ratings, recommended_movies).astype(int)

    # Precision
    precision = precision_score(actual_ratings, recommended_binary)

    # Recall
    recall = recall_score(actual_ratings, recommended_binary)

    # F1-score
    f1 = f1_score(actual_ratings, recommended_binary)

    return precision, recall, f1

# Load data
movies_data = pd.read_csv('movies.csv')

# Load ratings data
try:
    ratings_data = pd.read_csv('ratings.csv')
except pd.errors.ParserError as e:
    if len(str(e).split(':')) >= 3:
        print(f"Skipped line: {str(e).split(':')[1].strip()}: {str(e).split(':')[2].strip()}")
    else:
        print(f"Skipped line: {str(e)}")
    print(f"Attempting to skip problematic lines.")
    # Initialize an empty DataFrame
    ratings_data = pd.DataFrame()

# Define user ID and actual ratings
user_id = 1
if 'userId' in ratings_data:
    actual_ratings = ratings_data[ratings_data['userId'] == user_id]['movieId'].tolist()
else:
    print("Error: 'userId' column not found in ratings data.")
    actual_ratings = []

# Define collaborative filtering model (example)
model = None

# Evaluate collaborative filtering
collab_precision, collab_recall, collab_f1 = evaluate_recommendation_system(actual_ratings, collaborative_filtering(user_id, model, actual_ratings, movies_data))
print("Collaborative Filtering Metrics:")
print("Precision:", collab_precision)
print("Recall:", collab_recall)
print("F1-score:", collab_f1)

# Define content-based filtering parameters (example)
cosine_sim = None
feature_names = None
row_indices = None

# Evaluate content-based filtering
content_precision, content_recall, content_f1 = evaluate_recommendation_system(actual_ratings, content_based_filtering(user_id, actual_ratings, movies_data, cosine_sim, feature_names, row_indices))
print("\nContent-based Filtering Metrics:")
print("Precision:", content_precision)
print("Recall:", content_recall)
print("F1-score:", content_f1)

# Compare the two recommendation systems
print("\nComparison:")
print("Collaborative Filtering vs Content-based Filtering")
print("Precision:", collab_precision, "vs", content_precision)
print("Recall:", collab_recall, "vs", content_recall)
print("F1-score:", collab_f1, "vs", content_f1)


Skipped line: Error tokenizing data. C error: Expected 4 fields in line 2802413, saw 5

Attempting to skip problematic lines.
Error: 'userId' column not found in ratings data.


AttributeError: 'NoneType' object has no attribute 'test'