In [1]:
from abc import ABC, abstractmethod
import pandas as pd
from datetime import datetime

import spacy
from keybert import KeyBERT
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from rake_nltk import Rake

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k, reciprocal_rank
from lightfm.cross_validation import random_train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class KeyWordExtractor(ABC):
    @abstractmethod
    def extract(self, text) -> list[str]:
        pass

class SpacyExtractor(KeyWordExtractor):
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")

    def extract(self, text):
        doc = self.nlp(text)
        return [chunk.text for chunk in doc.noun_chunks]
    
class KeybertExtractor(KeyWordExtractor):
    def __init__(self):
        self.model = KeyBERT('distilbert-base-nli-mean-tokens')

    def extract(self, text):
        return [kw for (kw, _) in self.model.extract_keywords(text)]
    
class GensimExtractor(KeyWordExtractor):
    def extract(self, text):
        texts = [doc.lower().split() for doc in text.strip().split()]
        dct = Dictionary(texts)
        corpus = [dct.doc2bow(line) for line in texts]
        model = TfidfModel(corpus)
        kw = []
        for doc in corpus:
            for (id, score) in model[doc]:
                kw.append(dct[id])
        return kw
    
class RakeExtractor(KeyWordExtractor):
    def __init__(self):
        self.rake = Rake()

    def extract(self, text):
        self.rake.extract_keywords_from_text(text)
        return self.rake.get_ranked_phrases()

In [3]:
ratings = pd.read_json('ratings.jsonl', lines=True)
ratings.rename(columns={col: col[0].lower() + col[1:] for col in ratings.columns}, inplace=True)
content = pd.read_json('content.jsonl', lines=True)
content.rename(columns={col: col[0].lower() + col[1:] for col in content.columns}, inplace=True)

print("Ratings columns: ", ratings.columns)
print("Ratings size: ", len(ratings))
print("Content columns: ", content.columns)
print("Content size: ", len(content))

Ratings columns:  Index(['userId', 'itemId', 'timestamp', 'rating'], dtype='object')
Ratings size:  659720
Content columns:  Index(['itemId', 'title', 'year', 'rated', 'released', 'runtime', 'genre',
       'director', 'writer', 'actors', 'plot', 'language', 'country', 'awards',
       'poster', 'ratings', 'metascore', 'imdbRating', 'imdbVotes', 'type',
       'dVD', 'boxOffice', 'production', 'website', 'response', 'totalSeasons',
       'season', 'episode', 'seriesID'],
      dtype='object')
Content size:  38012


In [4]:
unique_users = ratings['userId'].unique().tolist()
unique_items = content['itemId'].unique().tolist()

print("Number of unique users: ", len(unique_users))
print("Unique users: ", unique_users[:5])
print("Number of unique items: ", len(unique_items))
print("Unique items: ", unique_items[:5])
print("Sparsity: ", len(ratings) / (len(unique_users) * len(unique_items)))

Number of unique users:  51671
Unique users:  ['c4ca4238a0', 'c81e728d9d', 'a87ff679a2', 'e4da3b7fbb', '1679091c5a']
Number of unique items:  38012
Unique items:  ['c9f0f895fb', 'd3d9446802', 'c20ad4d76f', '8e296a067a', '54229abfcf']
Sparsity:  0.00033588612422162087


In [5]:
dataset = Dataset()

dataset.fit(
    users=unique_users,
    items=unique_items,
)

num_users, num_items = dataset.interactions_shape() 
print("Number of users: ", num_users)
print("Number of items: ", num_items)

Number of users:  51671
Number of items:  38012


In [6]:
(interactions, weights) = dataset.build_interactions(
    (row['userId'], row['itemId'], row['rating']) for _, row in ratings.iterrows()
)

train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=42)

print("Train interactions: ", repr(train))
print("Test interactions: ", repr(test))

Train interactions:  <51671x38012 sparse matrix of type '<class 'numpy.int32'>'
	with 527776 stored elements in COOrdinate format>
Test interactions:  <51671x38012 sparse matrix of type '<class 'numpy.int32'>'
	with 131944 stored elements in COOrdinate format>


In [7]:
model = LightFM(loss='warp')
model.fit(
    interactions=train,
    epochs=30,
    num_threads=2,
    verbose=True
)

Epoch: 100%|██████████| 30/30 [00:07<00:00,  3.94it/s]


<lightfm.lightfm.LightFM at 0x30df74580>

In [8]:
auc_train = auc_score(model, train).mean()
auc_test = auc_score(model, test, train_interactions=train).mean()
print(f'AUC train: {auc_train}')
print(f'AUC test: {auc_test}')

precision_at_10_train = precision_at_k(model, train, k=10).mean()
precision_at_10_test = precision_at_k(model, test, train_interactions=train, k=10).mean()
print(f'Precision at 10 train: {precision_at_10_train}')
print(f'Precision at 10 test: {precision_at_10_test}')

recall_at_10_train = recall_at_k(model, train, k=10).mean()
recall_at_10_test = recall_at_k(model, test, train_interactions=train, k=10).mean()
print(f'Recall at 10 train: {recall_at_10_train}')
print(f'Recall at 10 test: {recall_at_10_test}')

reciprocal_rank_train = reciprocal_rank(model, train).mean()
reciprocal_rank_test = reciprocal_rank(model, test, train_interactions=train).mean()
print(f'Reciprocal rank train: {reciprocal_rank_train}')
print(f'Reciprocal rank test: {reciprocal_rank_test}')

AUC train: 0.9894445538520813
AUC test: 0.9538678526878357
Precision at 10 train: 0.07502986490726471
Precision at 10 test: 0.04026177152991295
Recall at 10 train: 0.15744960779187292
Recall at 10 test: 0.0947027604226028
Reciprocal rank train: 0.197854682803154
Reciprocal rank test: 0.1224101185798645


In [13]:
targets = pd.read_csv('targets.csv')
target_userIds = targets['UserId'].map(dataset.mapping()[0]).to_numpy()
target_itemIds = targets['ItemId'].map(dataset.mapping()[2]).to_numpy()
targets['Score'] = model.predict(target_userIds, target_itemIds)
targets.sort_values(by=['UserId', 'Score'], ascending=[True, False], inplace=True)
targets[["UserId", "ItemId"]].to_csv(f'./submissions/lightfm2-submission_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv', index=False)
targets

Unnamed: 0,UserId,ItemId,Score
44,0006246bee,80d1dae630,2.960773
60,0006246bee,ade4907055,0.064699
59,0006246bee,aad36aac60,-0.253012
50,0006246bee,899610035b,-0.531327
69,0006246bee,c1ee6829f5,-0.566963
...,...,...,...
616146,fffffe98d0,6b2efec875,-6.812146
616126,fffffe98d0,36122c9ac8,-6.816423
616166,fffffe98d0,9cefd28595,-6.934533
616137,fffffe98d0,51d77b425e,-7.037381
