# Section 6
## Implementation of the Parallel Hybrid and the LLM-Content-Based system

In [132]:
from typing import Dict, List
import numpy as np
import pandas as pd

In [133]:
test = pd.read_table('test_new.tsv',sep='\s+',header=0).sort_values(['user_id', 'item_id', 'rating', 'timestamp'])
train = pd.read_table('train_new.tsv',sep='\s+',header=0).sort_values(['user_id', 'item_id', 'rating', 'timestamp'])

In [146]:
metadata = pd.read_table('metadata_cleaned.tsv', sep='\t', header=0)
metadata = metadata[metadata['item_id'].isin(train['item_id'])]

# Parallel combination strategy

In [147]:
import json
with open('knn_user_rankings_with_ratings.json', 'r') as fp:
    knn_user_rankings_with_rating = json.load(fp)

In [148]:
with open('content_rankings_with_rating.json', 'r') as fp:
    content_rankings_with_rating = json.load(fp)


In [None]:
relevant_test = test[test['rating'] >= 3]


def compute_precision_at_k(top_recs, ground_truth, k):
    top_recs = top_recs[:k]
    ground_truth = ground_truth
    hits = len(set(top_recs) & set(ground_truth))
    return hits / k

def compute_ap(top_recs, ground_truth):
    ground_truth_set = set(ground_truth)    
    G = len(ground_truth_set)
    if G == 0:
        return 0.0
    cumulative_precision = 0.0
    relevant_count = 0
    for rank, item in enumerate(top_recs, 1):
        if item in ground_truth_set:
            relevant_count += 1
            precision_at_k = relevant_count / rank  # P@k
            cumulative_precision += precision_at_k  # Sum of P@k for relevant ranks
    ap = cumulative_precision / G
    return ap
def compute_metrics(user_ids, top_recommendations, k=10):
    hit_rates = []
    precision_scores = []
    aps = []
    rrs = []
    total_items_in_catalog = len(set(train['item_id']).union(set(test['item_id'])))
    recommended_items = set()
    for user_id in user_ids:
        hit = False
        top_recs = top_recommendations.get(user_id)[:k]
        recommended_items.update(list(top_recs))
        ground_truth = list(relevant_test[relevant_test['user_id'] == user_id]['item_id'])
        hits = len(set(top_recs) & set(ground_truth))
        hit = (hits > 0).real
        precision_at_k = compute_precision_at_k(top_recs, ground_truth, k=10)    
        ap = compute_ap(top_recs, ground_truth)

        for rank, item in enumerate(top_recs, 1):
            if item in ground_truth:
                rr = 1 / rank
                break
            else:
                rr = 0
        
        coverage =  len(recommended_items) / total_items_in_catalog
        hit_rates.append(hit)
        precision_scores.append(precision_at_k)
        aps.append(ap)
        rrs.append(rr)
    return {'PRECISION@k:': round(np.mean(precision_scores), 3), 'MAP@k:': round(np.mean(aps), 3), 'MRR@k:': round(np.mean(rrs), 3), 'Hit rate': round(np.mean(hit_rates), 3), 'Coverage': round(coverage, 3)}

In [None]:
model1_preds = knn_user_rankings_with_rating  
model2_preds = content_rankings_with_rating   

alpha = 1/3 # knn
beta = 2/3 # content

final_predictions_with_ratings = {}
final_predictions = {}

for user in set(model1_preds.keys()).union(set(model2_preds.keys())):
    combined_preds = {}
    
    for item, pred in model1_preds.get(user, []):
        combined_preds[item] = alpha * pred 
    
    for item, pred in model2_preds.get(user, {}).items():
        if item in combined_preds:
            combined_preds[item] += beta * pred  
        else:
            combined_preds[item] = beta * pred
    
    sorted_items = sorted(combined_preds.items(), key=lambda x: x[1], reverse=True)
    
    final_predictions_with_ratings[user] = dict(sorted_items)
    final_predictions[user] = [item for item, _ in sorted_items]
compute_metrics(relevant_test['user_id'].unique(), final_predictions, k=10)

{'PRECISION@k:': 0.011,
 'MAP@k:': 0.007,
 'MRR@k:': 0.024,
 'Hit rate': 0.104,
 'Coverage': 0.577}

# LLM

In [1]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch
from transformers import pipeline, AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token_id = tokenizer.eos_token_id

pipe = pipeline(
    "text-generation", 
    model=model_id, 
    torch_dtype=torch.bfloat16, 
    device_map="cuda",
    
)

In [None]:
import pandas as pd

descriptions = []  

for title in metadata['title']:
    output = pipe(
        f"Generate a detailed and accurate description for the following musical instrument: {title}\n",
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        max_new_tokens=50
        repetition_penalty=1.2,
        num_return_sequences=1
    )
    
    generated_text = output[0]['generated_text']
    prompt = f"Generate a detailed and accurate description for the following musical instrument: {title}\n"
    description = generated_text.replace(prompt, "").strip()
    
    descriptions.append(description)

metadata['llama-descriptions'] = descriptions

In [None]:
'''
import json
import pandas as pd
with open('metadata_llama.json') as f: # saved from previous cell
    metadata_loaded = json.load(f)

metadata_loaded = json.loads(metadata_loaded)


data = metadata_loaded['data']

columns = metadata_loaded['columns']

metadata_loaded = pd.DataFrame(data, columns=columns)

metadata = metadata_loaded
'''

In [None]:
metadata = metadata[metadata['item_id'].isin(train['item_id'])] # double checking

In [None]:
from nltk.tokenize import word_tokenize
import nltk
description_before_pre_processing = metadata['llama-descriptions']

vocab = set()
for s in description_before_pre_processing:
    #words = str.split(" ")
    words = word_tokenize(s, language="english")
    vocab.update(words)
print("Total number of words in description:", len(vocab))

Total number of words in description: 5262


In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


metadata['llama-descriptions'] = metadata['llama-descriptions'].str.lower() # lowercasing 
metadata['llama-descriptions'] = metadata['llama-descriptions'].apply(word_tokenize, language="english") #tokenizing
english_stopwords = stopwords.words('english') + [char for char in string.punctuation]
metadata['llama-descriptions'] = metadata['llama-descriptions'].apply(
    lambda tokens: [token for token in tokens if token not in english_stopwords]
) 

metadata['llama-stemmed'] = metadata['llama-descriptions'].apply(lambda x: [stemmer.stem(elem) for elem in x])

In [226]:
new_vocab = set()
metadata['llama-descriptions'].apply(
    lambda tokens: new_vocab.update(set(tokens)))
len(new_vocab)

4509

In [181]:
import gensim.downloader
word2vec_vectors = gensim.downloader.load('word2vec-google-news-300')

In [227]:
def get_item_embedding(tokens, embeddings, embedding_size=300) -> float:
    # Get embeddings for each word in the description
    word_vectors = [embeddings[word] for word in tokens if word in embeddings]
    if len(word_vectors) == 0:
        return np.zeros(embedding_size)  # Return a zero vector of the correct size
    return np.mean(word_vectors, axis=0)

metadata['description_embedded'] = (metadata['llama-descriptions'].apply(" ".join).apply(
    lambda tokens: get_item_embedding(tokens, word2vec_vectors)
))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata['description_embedded'] = (metadata['llama-descriptions'].apply(" ".join).apply(


In [228]:
def get_top_recommendations(user_id : str, sim_user_item_df : pd.DataFrame, k=10):
    similarities = sim_user_item_df[user_id].sort_values(ascending=False)
    return similarities.index[:k]

def compute_precision_at_k(top_recs, ground_truth, k):
    top_recs = top_recs[:k]
    ground_truth = ground_truth
    hits = len(set(top_recs) & set(ground_truth))
    return hits / k

def compute_ap(top_recs, ground_truth):
    ground_truth_set = set(ground_truth)    
    G = len(ground_truth_set)
    if G == 0:
        return 0.0
    cumulative_precision = 0.0
    relevant_count = 0
    for rank, item in enumerate(top_recs, 1):
        if item in ground_truth_set:
            relevant_count += 1
            precision_at_k = relevant_count / rank  # P@k
            cumulative_precision += precision_at_k  # Sum of P@k for relevant ranks
    ap = cumulative_precision / G
    return ap
def compute_metrics(user_ids, sim_user_item, k=10):
    hit_rates = []
    precision_scores = []
    aps = []
    rrs = []
    total_items_in_catalog = len(set(train['item_id']).union(set(test['item_id'])))
    recommended_items = set()
    for user_id in user_ids:
        hit = False
        top_recs = get_top_recommendations(user_id, sim_user_item, k)
        recommended_items.update(list(top_recs))
        ground_truth = list(relevant_test[relevant_test['user_id'] == user_id]['item_id'])
        hits = len(set(top_recs) & set(ground_truth))
        hit = (hits > 0).real
        precision_at_k = compute_precision_at_k(top_recs, ground_truth, k)    
        ap = compute_ap(top_recs, ground_truth)

        for rank, item in enumerate(top_recs, 1):
            if item in ground_truth:
                rr = 1 / rank
                break
            else:
                rr = 0
        
        coverage =  len(recommended_items) / total_items_in_catalog
        hit_rates.append(hit)
        precision_scores.append(precision_at_k)
        aps.append(ap)
        rrs.append(rr)
    return {'PRECISION@k:': round(np.mean(precision_scores), 3), 'MAP@k:': round(np.mean(aps), 3), 'MRR@k:': round(np.mean(rrs), 3), 'Hit rate': round(np.mean(hit_rates), 3), 'Coverage': round(coverage, 3)}

In [229]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

feature = np.array(metadata['description_embedded'].tolist()) 

feature_dict = {item_id: feature_vector for item_id, feature_vector in zip(metadata['item_id'], metadata['description_embedded'])}

user_profiles = pd.DataFrame(index=train['user_id'].unique())
user_profiles['profile'] = None  

user_profiles['profile'] = user_profiles['profile'].astype(object)

for user_id, group in train.groupby('user_id'):
    feature_vector = np.average([feature_dict.get(key) for key in group['item_id']], axis=0, weights=group['rating'])
    user_profiles.at[user_id, 'profile'] = feature_vector.tolist()

user_profiles_matrix = np.array(user_profiles['profile'].tolist())  

llama_sim = cosine_similarity(feature, user_profiles_matrix)  

llama_sim_df = pd.DataFrame(llama_sim, index=metadata['item_id'], columns=user_profiles.index)

metrics = compute_metrics(relevant_test['user_id'].unique(), sim_user_item=llama_sim_df, k=10)

In [230]:
metrics

{'PRECISION@k:': 0.011,
 'MAP@k:': 0.011,
 'MRR@k:': 0.037,
 'Hit rate': 0.106,
 'Coverage': 0.403}

In [263]:
metadata['item_id'][4]

'B007MY5BDI'

In [258]:
metadata['llama-descriptions'][4]

'The strings on this set are an improvement over any other I\'ve used before. They give off some of that bright tone you get from lighter gauge strings but with more volume as well.\nThese are my new favorite string! The sound is so much better than anything else out there (and yes they do come in all gauges)!! You will not be disappointed!\nI really like these strings because they have great projection when playing at high volumes, also they last longer then most others which means less changes to your budget each time you change them. Great job by daddio!\nI was going to say "This sounds like something I would buy" But it\'s actually good enough that it can\'t hurt anyone else either :'

In [262]:
metadata['description'][4]

"80/20 Bronze are our brightest acoustic strings, made to give your guitar an unparalleled shine. These bold sounding strings were developed by John D'Addario Sr. and guitar maker John D'Angelico in the 1930s, and have been beloved for their sparkling, trebly tone ever since. Made with a high carbon steel core and 80/20 Bronze wrap wire, 80/20 acoustic strings offer remarkable depth, along with rich, bright harmonics, and powerful projection. These 10-47 Extra Light gauge strings are easy to play and easy to bend."