# Text-processing of metadata and creating a content-based recommender system based on Word2Vec

In [36]:
import pandas as pd
test = pd.read_table('test_new.tsv',sep='\s+',header=0).sort_values(['user_id', 'item_id', 'rating', 'timestamp'])
train = pd.read_table('train_new.tsv',sep='\s+',header=0).sort_values(['user_id', 'item_id', 'rating', 'timestamp'])

In [37]:
!python clean_metadata.py

In [38]:
metadata_og = pd.read_table('metadata_cleaned.tsv', sep='\t', header=0)


In [39]:
metadata_og

Unnamed: 0,item_id,title,description
0,B00508JFE4,Behringer EUROPOWER EPQ900 Professional 900 Wa...,BEHRINGER EUROPOWER EPQ900. Professional 900-W...
1,B0B89ZSYS7,Shure SM7B Vocal Dynamic Microphone for Broadc...,"The SM7B dynamic microphone has a smooth, flat..."
2,B01HY530RS,StewMac Guitar Tech Wrench Set,6 sizes guitar techs use all the time. These t...
3,B006O64JMY,PylePro Full Size Electric Guitar Package w/ A...,Product Description. Get rocking with this beg...
4,B00502CTPW,3 Mini Color Violin Fingering Tape for Fretboa...,"Finally, no more ugly masking tape!These plast..."
...,...,...,...
23979,B071RRDXPM,Dean EVO XM Solid Body Electric Guitar - Satin...,Dean's EVO XM is a perfect blend of vintage de...
23980,B0CF1WHZRF,NUX Mighty Lite BT MKII Portable Desktop Model...,
23981,B0B422HMZG,"LAMSAM Fully Loaded Guitar Control Plate, Prew...",
23982,B06XJDTC2N,Fender American Series Stratocaster Guitar Tun...,


In [4]:
metadata_og[metadata_og['item_id'] == 'B0002D01KO']

Unnamed: 0,item_id,title,description
5621,B0002D01KO,Epiphone Gigbag for Solidbody Electric Guitars,Gigbag Epiphone Solidbody Electric


In [None]:
metadata = pd.read_table('metadata_cleaned.tsv', sep='\t', header=0)

## Preprocessing of the description column

In [None]:
import string

metadata = metadata.sort_values(['item_id', 'title', 'description'])
metadata['description'] = metadata['description'].fillna("".strip())


In [7]:
description = metadata['description']

In [8]:
from nltk.tokenize import word_tokenize
import math

description_before_pre_processing = description.copy()

vocab = set()
for s in description_before_pre_processing:
    words = word_tokenize(s, language="english")
    vocab.update(words)
print("Total number of words in description:", len(vocab))

Total number of words in description: 74053


In [3]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmer.stem('s')

's'

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


metadata['description'] = metadata['description'].str.lower() # lowercasing 
metadata['description'] = metadata['description'].apply(word_tokenize, language="english") #tokenizing
english_stopwords = stopwords.words('english') + [char for char in string.punctuation]
metadata['description'] = metadata['description'].apply(
    lambda tokens: [token for token in tokens if token not in english_stopwords]
) 

metadata['description_stemmed'] = metadata['description'].apply(lambda x: [stemmer.stem(elem) for elem in x])


In [10]:
new_vocab = set()
metadata['description_stemmed'].apply(
    lambda tokens: new_vocab.update(set(tokens)))
len(new_vocab)

52688

In [11]:
new_vocab = set()
metadata['description'].apply(
    lambda tokens: new_vocab.update(set(tokens)))
len(new_vocab)

61332

## TF-IDF

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

def identity_tokenizer(tokens):
    return tokens

tfidf_vectorizer = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, preprocessor=None)

tfidf_matrix = tfidf_vectorizer.fit_transform(metadata['description_stemmed'])

tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=metadata['item_id']
)



In [13]:
import gensim.downloader
word2vec_vectors = gensim.downloader.load('word2vec-google-news-300')

In [None]:
import numpy as np

def get_item_embedding(tokens, embeddings, embedding_size=300) -> float:
    # we get embeddings for each word in the description
    word_vectors = [embeddings[word] for word in tokens if word in embeddings]
    if len(word_vectors) == 0: # non-trained word
        return np.zeros(embedding_size) 
    # avg the word embeddings
    return np.mean(word_vectors, axis=0)


# Compute item embeddings using Word2Vec
metadata['description_embedded'] = (metadata['description'].apply(" ".join).apply(
    lambda tokens: get_item_embedding(tokens, word2vec_vectors)
))

## Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

tfidf_cosine_sim = cosine_similarity(tfidf_matrix)

tfidf_cosine_sim_df = pd.DataFrame(
    tfidf_cosine_sim,
    index=metadata['item_id'],
    columns=metadata['item_id']
)

In [16]:
print("TF-IDF Cosine Similarity:")
ref_items = ['B0002D01KO', 'B0002D0CCQ']
tfidf_cosine_sim_df[tfidf_cosine_sim_df.index.isin(ref_items)][ref_items]

TF-IDF Cosine Similarity:


item_id,B0002D01KO,B0002D0CCQ
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
B0002D01KO,1.0,0.0
B0002D0CCQ,0.0,1.0


In [17]:
np.shape(tfidf_matrix)

(23984, 52688)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

item_embeddings = np.vstack(metadata['description_embedded'].values)

embedding_cosine_sim = cosine_similarity(item_embeddings)

embedding_cosine_sim_df = pd.DataFrame(
    embedding_cosine_sim,
    index=metadata['item_id'],
    columns=metadata['item_id']
)
embedding_cosine_sim_df[embedding_cosine_sim_df.index.isin(ref_items)][ref_items]

item_id,B0002D01KO,B0002D0CCQ
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
B0002D01KO,1.0,0.977438
B0002D0CCQ,0.977438,1.0


In [19]:
print('Word2Vec similarity:')
embedding_cosine_sim_df[embedding_cosine_sim_df.index.isin(ref_items)][ref_items]

Word2Vec similarity:


item_id,B0002D01KO,B0002D0CCQ
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
B0002D01KO,1.0,0.977438
B0002D0CCQ,0.977438,1.0


# Week 10

## Preprocessing of the Title column

In [20]:
metadata['title'] = metadata['title'].fillna("".strip())

In [None]:
from nltk.tokenize import word_tokenize

title = metadata['title']

titles_before_preprocessing = title.copy()

vocab = set()
for s in titles_before_preprocessing:
    #words = str.split(" ")
    words = word_tokenize(s, language="english")
    vocab.update(words)
print("Total number of words in title:", len(vocab))

Total number of words in title: 33193


In [22]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
stemmer = PorterStemmer()


metadata['title'] = metadata['title'].str.lower() # lowercasing 
metadata['title'] = metadata['title'].apply(word_tokenize, language="english") #tokenizing
english_stopwords = stopwords.words('english') + [char for char in string.punctuation]
metadata['title'] = metadata['title'].apply(
    lambda tokens: [token for token in tokens if token not in english_stopwords]
) # remove stopwords

metadata['title_stemmed'] = metadata['title'].apply(lambda x: [stemmer.stem(elem) for elem in x])

In [23]:
new_vocab = set()
metadata['title_stemmed'].apply(
    lambda tokens: new_vocab.update(set(tokens)))
len(new_vocab)

27962

In [24]:
new_vocab = set()
metadata['title'].apply(
    lambda tokens: new_vocab.update(set(tokens)))
len(new_vocab)

29987

In [None]:
# Word2Vec
metadata['title_embedded'] = (metadata['title'].apply(" ".join).apply(
    lambda tokens: get_item_embedding(tokens, word2vec_vectors)
))

In [26]:
relevant_test = test[test['rating'].apply(lambda x: x >= 3)]

In [27]:
def get_top_recommendations(user_id : str, sim_user_item_df : pd.DataFrame, k=10):
    similarities = sim_user_item_df[user_id].sort_values(ascending=False, kind='stable')
    return similarities.index[:k]

def compute_precision_at_k(top_recs, ground_truth, k):
    top_recs = top_recs[:k]
    ground_truth = ground_truth
    hits = len(set(top_recs) & set(ground_truth))
    return hits / k

def compute_ap(top_recs, ground_truth):
    ground_truth_set = set(ground_truth)    
    G = len(ground_truth_set)
    if G == 0:
        return 0.0
    cumulative_precision = 0.0
    relevant_count = 0
    for rank, item in enumerate(top_recs, 1):
        if item in ground_truth_set:
            relevant_count += 1
            precision_at_k = relevant_count / rank  # P@k
            cumulative_precision += precision_at_k  # Sum of P@k for relevant ranks
    ap = cumulative_precision / G
    return ap
def compute_metrics(user_ids, sim_user_item, k=10):
    hit_rates = []
    precision_scores = []
    aps = []
    rrs = []
    total_items_in_catalog = len(set(train['item_id']).union(set(test['item_id'])))
    recommended_items = set()
    for user_id in user_ids:
        hit = False
        top_recs = get_top_recommendations(user_id, sim_user_item, k)
        recommended_items.update(list(top_recs))
        ground_truth = list(relevant_test[relevant_test['user_id'] == user_id]['item_id'])
        hits = len(set(top_recs) & set(ground_truth))
        hit = (hits > 0).real
        precision_at_k = compute_precision_at_k(top_recs, ground_truth, k)    
        ap = compute_ap(top_recs, ground_truth)

        for rank, item in enumerate(top_recs, 1):
            if item in ground_truth:
                rr = 1 / rank
                break
            else:
                rr = 0
        
        coverage =  len(recommended_items) / total_items_in_catalog
        hit_rates.append(hit)
        precision_scores.append(precision_at_k)
        aps.append(ap)
        rrs.append(rr)
    return {'PRECISION@k:': round(np.mean(precision_scores), 3), 'MAP@k:': round(np.mean(aps), 3), 'MRR@k:': round(np.mean(rrs), 3), 'Hit rate': round(np.mean(hit_rates), 3), 'Coverage': round(coverage, 3)}

In [28]:
metadata_new = metadata[metadata['item_id'].isin(set(train['item_id']))]

# Word2Vec

In [29]:
title_features = np.array(metadata_new['title_embedded'].tolist())
description_features = np.array(metadata_new['description_embedded'].tolist())

title_features_dict = {item_id: feature_vector for item_id, feature_vector in zip(metadata_new['item_id'], metadata_new['title_embedded'])}
description_features_dict = {item_id: feature_vector for item_id, feature_vector in zip(metadata_new['item_id'], metadata_new['description_embedded'])}

user_profiles = pd.DataFrame(index=train['user_id'].unique())
user_profiles['title_profile'] = None  
user_profiles['description_profile'] = None  

user_profiles['title_profile'] = user_profiles['title_profile'].astype(object)
user_profiles['description_profile'] = user_profiles['description_profile'].astype(object)

for user_id, group in train.groupby('user_id'):

    title_vector = np.average([title_features_dict.get(key) for key in group['item_id']], axis=0, weights=group['rating'])
    description_vector = np.average([description_features_dict.get(key) for key in group['item_id']], axis=0, weights=group['rating'])
    
    user_profiles.at[user_id, 'title_profile'] = title_vector.tolist()  
    user_profiles.at[user_id, 'description_profile'] = description_vector.tolist()
title_sim = cosine_similarity(title_features, list(user_profiles['title_profile']))
title_sim_df = pd.DataFrame(title_sim, index=metadata_new['item_id'], columns=user_profiles.index)

description_sim = cosine_similarity(description_features, list(user_profiles['description_profile']))
description_sim_df = pd.DataFrame(description_sim, index=metadata_new['item_id'], columns=user_profiles.index)


#alphas = [0, 1/3, 2/3, 1]
for alpha in [2/3]:
    combined_sim = alpha * title_sim_df + (1 - alpha) * description_sim_df 
    metrics = compute_metrics(relevant_test['user_id'].unique(), sim_user_item=combined_sim, k=10)
    print(metrics)

{'PRECISION@k:': 0.011, 'MAP@k:': 0.008, 'MRR@k:': 0.035, 'Hit rate': 0.096, 'Coverage': 0.4}


In [30]:
combined_sim_dict = combined_sim.to_dict()

In [31]:
combined_sim_dict_sorted = {
    user: dict(sorted(items.items(), key=lambda x: x[1], reverse=True))
    for user, items in combined_sim_dict.items()
}


In [32]:
import json
with open('collab_rankings.json', 'w') as fp:
    json.dump(combined_sim_dict_sorted, fp)

In [33]:
import numpy as np
import pandas as pd

def normalize_to_rating_range(similarity_scores):
    normalized_scores = 1 + (similarity_scores - 0) * (4 / (1 - 0)) 
    return normalized_scores

normalized_ratings = normalize_to_rating_range(combined_sim)

In [34]:
# We have to sort in this way to keep the sorting of identical ratings from the original models DataFrame.
normalized_ratings_dict = normalized_ratings.to_dict()

combined_sim_df = pd.DataFrame(normalized_ratings_dict)

normalized_ratings_sorted = {}
for user in combined_sim_df.columns:
    user_ratings = combined_sim_df[user]
    
    sorted_items = user_ratings.sort_values(ascending=False, kind='stable').index.tolist()
    
    normalized_ratings_sorted[user] = {item: user_ratings[item] for item in sorted_items}

In [35]:
import json
with open('content_rankings_with_rating.json', 'w') as fp:
    json.dump(normalized_ratings_sorted, fp)