In [2]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from cornac.models import UserKNN
from cornac.eval_methods import RatioSplit
from cornac.metrics import AUC, Recall, Precision
import scipy.sparse as sp

In [3]:
# Load data
def load_data(ratings_path, content_path, targets_path):
    """Read the three input tables used by the notebook.

    Parameters
    ----------
    ratings_path : path to a JSON-lines file, read with ``pd.read_json(lines=True)``.
    content_path : path to a JSON-lines file, read with ``pd.read_json(lines=True)``.
    targets_path : path to a CSV file, read with ``pd.read_csv``.

    Returns
    -------
    tuple of (ratings, content, targets) DataFrames, in that order.
    """
    # Both JSON-lines inputs are parsed the same way; ratings first, then content.
    ratings_df, content_df = (
        pd.read_json(jsonl_path, lines=True)
        for jsonl_path in (ratings_path, content_path)
    )
    targets_df = pd.read_csv(targets_path)
    return ratings_df, content_df, targets_df

# Read the ratings, item content and prediction targets from the local data/ folder.
ratings, content, targets = load_data(
    ratings_path='data/ratings.jsonl',
    content_path='data/content.jsonl',
    targets_path='data/targets.csv',
)

In [4]:
# Preprocessing content data
def _split_csv_field(value):
    """Turn a comma-separated string into a list of clean tokens.

    - A list is returned unchanged, so re-running this cell does not wipe
      columns that were already converted.
    - A string is split on commas; surrounding whitespace is stripped and
      empty tokens are dropped.
    - Anything else (e.g. NaN) becomes an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return [token.strip() for token in value.split(',') if token.strip()]
    return []


# Convert the multi-valued text columns into lists (idempotent on re-run).
for _list_column in ('Genre', 'Director', 'Actors'):
    content[_list_column] = content[_list_column].apply(_split_csv_field)

# One text document per item: genres, directors, actors, then the plot.
# Missing plots contribute an empty string.
content['combined_features'] = (
    content['Genre'].apply(' '.join) + ' '
    + content['Director'].apply(' '.join) + ' '
    + content['Actors'].apply(' '.join) + ' '
    + content['Plot'].fillna('')
)

In [5]:
# TF-IDF Vectorization
# Fit the vocabulary on the per-item text built above and encode every item
# in the same pass; English stop words are dropped by the vectorizer.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(
    raw_documents=content['combined_features'],
)

In [None]:
# Similarity of a single item to every item (one row of the item-item matrix)
def calculate_item_similarity(item_id, tfidf_matrix, content):
    """Return the cosine similarity between ``item_id`` and every item in ``content``.

    Parameters
    ----------
    item_id : value compared with ``==`` against ``content['ItemId']``.
    tfidf_matrix : 2-D feature matrix (sparse or dense) supporting row slicing,
        whose i-th row is expected to describe the i-th row of ``content``.
    content : DataFrame with an ``ItemId`` column.

    Returns
    -------
    pd.Series indexed by ``content['ItemId']``. It holds the similarities when
    the item is found (first match wins if the id is duplicated) and all zeros
    when ``item_id`` does not occur in ``content``.

    Notes
    -----
    The row is located by position, not by index label, so the result is
    correct even when ``content`` does not carry a default 0..n-1 index.
    Errors from a matrix/content size mismatch are no longer swallowed and
    reported as "item not found".
    """
    item_ids = content['ItemId']
    positions = np.flatnonzero((item_ids == item_id).to_numpy())
    if positions.size == 0:
        # Unknown item: same fallback as before, with a float dtype so both
        # return paths produce the same kind of Series.
        return pd.Series(0.0, index=item_ids)

    row = int(positions[0])
    # Slice (rather than index) so the query stays 2-D for dense and sparse input.
    item_vector = tfidf_matrix[row:row + 1]
    similarities = cosine_similarity(item_vector, tfidf_matrix).flatten()
    return pd.Series(similarities, index=item_ids)


In [7]:
# Cornac model prep: store both identifier columns of `ratings` as strings.
for id_column in ('UserId', 'ItemId'):
    ratings[id_column] = ratings[id_column].astype(str)

In [None]:
# Create user-item interaction matrix
# Rows are users, columns are items; pivot_table averages repeated
# (user, item) ratings by default, and missing pairs are filled with 0.
uir_df = (
    ratings.loc[:, ['UserId', 'ItemId', 'Rating']]
    .pivot_table(index='UserId', columns='ItemId', values='Rating')
    .fillna(0)
)
# Sparse copy of the dense table for downstream use.
uir_matrix = sp.csr_matrix(uir_df.to_numpy())

In [None]:
# For every user in `targets`, score each of that user's target items with a
# content-based value: the mean similarity of the item to all items in `content`.
# NOTE(review): the score depends only on the item, not on the user, so the same
# item is recomputed for every user that targets it — confirm this is intended.
for user_id in targets['UserId'].unique():
    # Items this user needs predictions for.
    user_targets = targets[targets['UserId'] == user_id]['ItemId'].tolist()
    # Re-created for each user; nothing in the visible lines reads or stores it
    # after the inner loop — presumably consumed further down, TODO confirm.
    content_predictions = {}
    for item_id in user_targets:
      # Series of similarities indexed by content['ItemId'].
      similar_items = calculate_item_similarity(item_id, tfidf_matrix, content)
      if not similar_items.empty:
        content_predictions[item_id] = similar_items.mean()
      else:
        # Only reachable when `content` has no rows (the helper always returns a
        # Series indexed by content['ItemId']).
        content_predictions[item_id] = 0