In [9]:
import lightfm
from lightfm.data import Dataset


In [13]:
import pandas as pd
import numpy as np

In [14]:
# Load the three input tables; paths are relative to the notebook's working directory.
events = pd.read_csv("events.csv")
items_features_data = pd.read_csv("item_features.csv")

# Users are ordered by `user_id` and the index is renumbered to follow that order.
user_features_data = (
    pd.read_csv("user_features.csv")
    .sort_values(by="user_id")
    .reset_index(drop=True)
)



In [15]:
# Ascending bucket edges for the `age` column.
bins = [0, 6, 12, 18, 30, 60]

# side='left' (searchsorted's default) maps an age `a` to the index of the first
# edge that is >= a:  a <= 0 -> 0, 0 < a <= 6 -> 1, ..., 30 < a <= 60 -> 5, a > 60 -> 6.
# NOTE(review): bins 0 and 6 have no counterpart in `all_features` ('1'..'5');
# confirm that every age in the data falls in (0, 60].
ages = user_features_data['age'].values
user_features_data['binned'] = np.searchsorted(bins, ages, side='left')


In [17]:
import lightfm
from lightfm.data import Dataset


In [49]:
# Vocabulary of user-feature labels registered with the LightFM dataset:
# the strings "1".."5" plus "F" and "M" (presumably age-bin and gender labels — confirm).
all_features = {'F', 'M'} | {str(label) for label in range(1, 6)}

In [50]:
# Register all user ids, item ids and feature labels so that LightFM can assign
# each of them an internal index.
dataset = Dataset()
dataset.fit(
    users=user_features_data["user_id"].to_list(),
    items=items_features_data["item_id"].to_list(),
    # A set of strings iterates in a hash-seed-dependent order; sorting keeps the
    # label -> feature-index assignment identical across kernel restarts.
    user_features=sorted(all_features),
    # Every column after the first (the item id) is treated as an item feature label.
    item_features=items_features_data.columns[1:].to_list(),
)


In [51]:
# One (user_id, [feature labels]) entry per user, in the shape expected by
# Dataset.build_user_features.
v = user_features_data[["user_id", "gender", "binned"]].values
prepared = [
    # The dataset was fitted with *string* labels ('1'..'5'), while `binned` holds
    # integers; an int label is not found in the feature mapping, so cast it.
    # NOTE(review): a bin outside 1..5 (age <= 0 or > 60) is still unregistered.
    (user_id, [gender, str(age_bin)])
    for user_id, gender, age_bin in v
]
lightfm_user_features = dataset.build_user_features(prepared)

In [54]:
# Labels must be exactly the ones registered in `dataset.fit`, i.e. the names of
# all columns after the first one. Using the column names directly (instead of a
# hard-coded f"genre_{i}" pattern) keeps this cell correct whatever the columns
# are called.
item_feature_names = list(items_features_data.columns[1:])

# One [item_id, [active feature labels]] entry per row; a feature is considered
# active when its cell is not equal to 0.
item_features = [
    [row[0], [name for name, value in zip(item_feature_names, row[1:]) if value != 0]]
    for row in items_features_data.itertuples(index=False, name=None)
]
lightfm_item_features = dataset.build_item_features(item_features)


In [58]:
# Feed (user_id, item_id, rating) triples to LightFM; `interactions` marks which
# pairs were observed and `weights` carries the rating of each pair.
# itertuples over the three named columns avoids materialising a Series per row,
# which is what iterating `events.iloc` did.
(interactions, weights) = dataset.build_interactions(
    events[["user_id", "item_id", "rating"]].itertuples(index=False, name=None)
)


In [60]:
from lightfm import LightFM

# LightFM model with 20 latent components, trained with the WARP loss.
model = LightFM(
    loss="warp",
    no_components=20,
    learning_schedule='adagrad',
    learning_rate=2e-2,
    random_state=42,
    # k=15
)

# Train on the interaction matrix together with the user and item feature matrices.
model.fit(
    interactions=interactions,
    user_features=lightfm_user_features,
    item_features=lightfm_item_features,
    sample_weight=weights,
    epochs=100,  # NOTE: the epoch count is a guess (author unsure); not tuned
    num_threads=20,
    verbose=True,
)


Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 100%|██████████| 100/100 [02:09<00:00,  1.29s/it]


<lightfm.lightfm.LightFM at 0x79639821c880>

array([   0,    1,    2, ..., 3703, 3704, 3705])

In [73]:
def _prepare_user_item_pairs(
    cnum_ids: list[int],
    product_ids: list[int],
) -> (list[int], list[int]):
    num_products = len(product_ids)

    users, items = [], []
    for cnum_id in cnum_ids:
        users.extend([cnum_id] * num_products)
        items.extend(product_ids)

    return users, items


In [83]:
# Set of (user_id, item_id) pairs that already occur in `events`.
# Columns are selected by name (as in the build_interactions cell) rather than by
# position, so the result does not depend on the column order of events.csv.
is_in_interaction = set(zip(events["user_id"], events["item_id"]))

In [85]:
# Parallel lists holding every (user, item) combination that does NOT already
# appear in `is_in_interaction`; position k of both lists forms one candidate pair.
user_id_for_predict, item_id_for_predict = [], []
candidate_item_ids = items_features_data.item_id.values
for user_id in user_features_data.user_id.values:
    for item_id in candidate_item_ids:
        if (user_id, item_id) not in is_in_interaction:
            user_id_for_predict.append(user_id)
            item_id_for_predict.append(item_id)

In [89]:
# LightFM.predict works on the *internal* indices assigned by `dataset.fit`, passed
# as np.int32 arrays — not on raw ids in Python lists. Translate the raw ids through
# the dataset's mappings first. `predictions[k]` still corresponds to
# (user_id_for_predict[k], item_id_for_predict[k]).
user_id_map, _user_feature_map, item_id_map, _item_feature_map = dataset.mapping()

internal_user_ids = np.fromiter(
    (user_id_map[raw_id] for raw_id in user_id_for_predict),
    dtype=np.int32,
    count=len(user_id_for_predict),
)
internal_item_ids = np.fromiter(
    (item_id_map[raw_id] for raw_id in item_id_for_predict),
    dtype=np.int32,
    count=len(item_id_for_predict),
)

predictions = model.predict(
    internal_user_ids,
    internal_item_ids,
    user_features=lightfm_user_features,
    item_features=lightfm_item_features,
)


In [136]:
# The import was commented out, so this cell raised NameError on a fresh kernel.
from collections import defaultdict

# Number of recommended items kept per user.
TOP_K = 10

# Group the scored candidates by user: user_id -> [(item_id, score), ...].
values = defaultdict(list)
for rating, us, it in zip(predictions, user_id_for_predict, item_id_for_predict):
    values[us].append((it, rating))

# Keep the TOP_K highest-scored items per user and render them as a
# space-separated string of item ids. Iterating the users once (instead of once
# per candidate pair) yields the same result without redundant re-sorting.
new_values = dict()
for user_id in list(values):
    top_items = sorted(values[user_id], key=lambda pair: -pair[1])[:TOP_K]
    values[user_id] = top_items
    new_values[user_id] = " ".join(str(item_id) for item_id, _score in top_items)


In [139]:
# One row per user: `user_id` and, in `item_id`, the space-separated string of
# recommended item ids taken from `new_values`.
df_ans = pd.DataFrame(
    {
        'user_id': list(new_values.keys()),
        'item_id': list(new_values.values()),
    }
)

# `user_id` becomes the index so it is written as the first CSV column.
df_ans.set_index("user_id").to_csv("x.csv")