In [1]:
from datetime import datetime
from pathlib import Path

import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset



In [2]:
# Explicit feedback: read the JSON-lines file and keep only the three columns used below.
ratings = pd.read_json("ratings.jsonl", lines=True)[["UserId", "ItemId", "Rating"]]

# Item metadata: the descriptive fields listed here are flattened into one text column.
content = pd.read_json("content.jsonl", lines=True)
features_to_combine = ['Title', 'Genre', 'Director', 'Actors', 'Plot']


def _row_to_text(row):
    """Return every value of `row`, cast to str, joined by single spaces."""
    # NOTE(review): missing values presumably end up as the literal text "nan" - confirm that is intended.
    return ' '.join(row.values.astype(str))


content['combined'] = content[features_to_combine].apply(_row_to_text, axis=1)

# Quick schema check of both frames.
print("Ratings columns:", ratings.columns)
print("Content columns:", content.columns)

Ratings columns: Index(['UserId', 'ItemId', 'Rating'], dtype='object')
Content columns: Index(['ItemId', 'Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre',
       'Director', 'Writer', 'Actors', 'Plot', 'Language', 'Country', 'Awards',
       'Poster', 'Ratings', 'Metascore', 'imdbRating', 'imdbVotes', 'Type',
       'DVD', 'BoxOffice', 'Production', 'Website', 'Response', 'totalSeasons',
       'Season', 'Episode', 'seriesID', 'combined'],
      dtype='object')


In [3]:
# Map raw string ids to dense zero-based integers, numbered in order of first appearance.
# Users are numbered from the ratings file, items from the content catalogue.
unique_user_ids = ratings["UserId"].unique()
unique_item_ids = content["ItemId"].unique()
user_indexes = dict(zip(unique_user_ids, range(len(unique_user_ids))))
item_indexes = dict(zip(unique_item_ids, range(len(unique_item_ids))))

# Attach the integer ids next to the raw ids on both frames.
ratings = ratings.assign(
    UserIndex=ratings["UserId"].map(user_indexes),
    ItemIndex=ratings["ItemId"].map(item_indexes),
)
content = content.assign(ItemIndex=content["ItemId"].map(item_indexes))

In [4]:
# Register every user and item index with the LightFM Dataset so it can build
# its internal id mappings.
dataset = Dataset()
dataset.fit(
    users=ratings["UserIndex"].unique(),
    items=content["ItemIndex"].unique()
)

# Each distinct `combined` string is registered as one item feature.
# NOTE(review): because the whole text blob is a single feature, items only
# share a feature when their combined text is identical - consider tokenising
# (e.g. per genre / director) if content similarity is the goal.
dataset.fit_partial(
    item_features=content["combined"].unique()
)

# Build (user, item, weight) triples by zipping the columns directly.
# DataFrame.iterrows() creates a Series per row and is far slower on a
# ratings-sized frame; zip yields the same tuples.
# The second return value (the weights matrix) is discarded here.
# NOTE(review): that presumably means the Rating value does not influence
# training - confirm this is intended.
(interactions_matrix, _) = dataset.build_interactions(
    list(zip(ratings["UserIndex"], ratings["ItemIndex"], ratings["Rating"]))
)

# One (item, [feature]) pair per catalogue row; the feature is the item's
# combined text registered above.
item_features = dataset.build_item_features(
    [
        (item_index, [combined_text])
        for item_index, combined_text in zip(content["ItemIndex"], content["combined"])
    ]
)

In [5]:
# Seed the model so that training starts from the same random state on every
# Restart & Run All.
# NOTE(review): with num_threads > 1 the run may still not be bit-for-bit
# reproducible - confirm against the LightFM docs; use num_threads=1 if exact
# repeatability matters.
RANDOM_SEED = 42

model = LightFM(loss='warp', random_state=RANDOM_SEED)

# fit() returns the model itself, so the cell output is the model repr.
model.fit(interactions_matrix, item_features=item_features, epochs=30, num_threads=8)

<lightfm.lightfm.LightFM at 0x14ccdec80>

In [6]:
# Load the (user, item) pairs to rank and translate their raw ids into the
# integer indexes built from the ratings / content frames.
targets = pd.read_csv("targets.csv").assign(
    UserIndex=lambda frame: frame["UserId"].map(user_indexes),
    ItemIndex=lambda frame: frame["ItemId"].map(item_indexes),
)
targets.head()

Unnamed: 0,UserId,ItemId,UserIndex,ItemIndex
0,0006246bee,01d2404d4c,31471,21530
1,0006246bee,03d43fdf92,31471,5449
2,0006246bee,0808a9666b,31471,24104
3,0006246bee,0a5d7dd6f6,31471,26480
4,0006246bee,0bab4a8104,31471,32061


In [7]:
# Score every target (user, item) pair with the trained model.
target_user_indexes = targets["UserIndex"].to_numpy()
target_item_indexes = targets["ItemIndex"].to_numpy()
predicted_scores = model.predict(
    target_user_indexes,
    target_item_indexes,
    item_features=item_features,
    num_threads=8,
)

# Rank within each user: UserId ascending, then highest score first.
targets = targets.assign(Score=predicted_scores).sort_values(
    by=["UserId", "Score"],
    ascending=[True, False],
)
targets

Unnamed: 0,UserId,ItemId,UserIndex,ItemIndex,Score
44,0006246bee,80d1dae630,31471,32134,3.268374
50,0006246bee,899610035b,31471,23692,0.222983
60,0006246bee,ade4907055,31471,30153,-0.088180
12,0006246bee,1e5bdbcb76,31471,31410,-0.318094
69,0006246bee,c1ee6829f5,31471,12894,-0.318922
...,...,...,...,...,...
616146,fffffe98d0,6b2efec875,29367,5482,-7.820804
616164,fffffe98d0,91f7cf399b,29367,10000,-7.866508
616126,fffffe98d0,36122c9ac8,29367,11292,-7.867714
616104,fffffe98d0,0a00bbe03b,29367,9053,-7.973168


In [8]:
# Write the ranked (UserId, ItemId) pairs to a timestamped submission file.
# to_csv does not create missing directories, so make sure the output folder
# exists first (e.g. on a fresh clone).
submissions_dir = Path("./submissions")
submissions_dir.mkdir(parents=True, exist_ok=True)

submission_path = submissions_dir / f'submission_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
targets[["UserId", "ItemId"]].to_csv(submission_path, index=False)