In [3]:
!pip install Lightfm



In [4]:
import pandas as pd
import numpy as np
from lightfm.data import Dataset
from lightfm import LightFM
from google.colab import drive
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack, csr_matrix
import json

drive.mount('/content/drive')


Mounted at /content/drive


In [5]:

# ----------------------
# 1) Wczytanie i filtrowanie (jak wcześniej)
# ----------------------

# Wczytujemy dane i usuwamy puste pola (dropna) i duplikaty, żeby nie trenować na powtórkach tej samej interakcji.

df = pd.read_csv(
    "/content/drive/MyDrive/product-recommendation-challenge/train.csv",
    parse_dates=["timestamp"]
).dropna().drop_duplicates()
df_test = pd.read_csv(
    "/content/drive/MyDrive/product-recommendation-challenge/test.csv"
).dropna().drop_duplicates()


# Liczymy, ile razy każdy użytkownik i każdy przedmiot pojawił się w danych.
#uc = df["user_id"].value_counts()
#ic = df["item_id"].value_counts()

# Zostawiamy tylko tych użytkowników i tylko te produkty, które występują co najmniej 2-krotnie – to gwarantuje,
#  że każdy user i każdy item ma szansę pojawić się i w treningu, i (potencjalnie) w teście.
#mask = df["user_id"].isin(uc[uc>=2].index) & df["item_id"].isin(ic[ic>=2].index)
#df_filtered = df[mask].reset_index(drop=True)


# ----------------------
# 2) Temporalny split per-user (jak miałeś)
# ----------------------

# Konwertujemy timestamp na obiekt daty i czasu, sortujemy wszystkie rekordy rosnąco po czasie
#df_filtered["timestamp"] = pd.to_datetime(df_filtered["timestamp"], unit="ms")
#df_filtered = df_filtered.sort_values("timestamp").reset_index(drop=True)


df_items = pd.read_csv("/content/drive/MyDrive/product-recommendation-challenge/item_metadata_filtered.csv", on_bad_lines='warn')
with open("/content/drive/MyDrive/product-recommendation-challenge/id_mappings.json", "r") as f:
    id_mappings = json.load(f)

item_map = id_mappings['item_mapping']              # asin → item_id (int)
item_reverse_map = id_mappings['item_reverse_mapping']  # index → asin (str)
df_items['item_id'] = df_items['parent_asin'].map(item_map)

df = df.merge(df_items, how='inner', on='item_id')
#df_test = df_test.merge(df_items, how='inner', on='item_id')


  df = pd.read_csv(


In [6]:
print(df_test)

        user_id  predictions
0             0            0
1             1            0
2             3            0
3             5            0
4             6            0
...         ...          ...
412456   868205            0
412457   868208            0
412458   868209            0
412459   868210            0
412460   868217            0

[412461 rows x 2 columns]


In [7]:
print(item_reverse_map)
print(item_map)
print(df_items['parent_asin'].head(3))


{'0': '0007931727', '1': '0321700945', '2': '0321719816', '3': '0321719824', '4': '0321898338', '5': '0321898362', '6': '0395969271', '7': '043945669X', '8': '0439801494', '9': '0545090725', '10': '0615179088', '11': '069267599X', '12': '0740328980', '13': '0801002699', '14': '0816091846', '15': '0899335969', '16': '095545994X', '17': '0955909619', '18': '0971275238', '19': '0974708038', '20': '0976963027', '21': '0981766005', '22': '0982583885', '23': '0982697813', '24': '0989614026', '25': '1259319636', '26': '140417429X', '27': '1413309674', '28': '1426296355', '29': '1453085815', '30': '1465874399', '31': '1564270491', '32': '1572644672', '33': '158298302X', '34': '1592970362', '35': '1607178990', '36': '1607179202', '37': '1607179245', '38': '160717930X', '39': '1607179393', '40': '160717944X', '41': '1608292983', '42': '1608293017', '43': '1608297128', '44': '1608297160', '45': '1608297187', '46': '1608299627', '47': '1608299740', '48': '1608299953', '49': '161535316X', '50': '16

In [8]:
df.drop(columns=[ 'parent_asin'], inplace=True)

In [9]:
df.columns

Index(['user_id', 'item_id', 'rating', 'timestamp', 'main_category', 'title',
       'average_rating', 'rating_number', 'price', 'store', 'features',
       'description', 'images', 'categories', 'image_count', 'has_images',
       'image_urls', 'category'],
      dtype='object')

In [10]:

cat_cols = ['main_category', 'store', 'category']
for col in cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))
cat_feats = df[cat_cols]

In [11]:
df['rating_number'] = np.log1p(df['rating_number'].fillna(0))
df['price'] = np.log1p(df['price'].fillna(0))

In [12]:
from sklearn.preprocessing import StandardScaler

num_cols = ['average_rating', 'rating_number', 'price']
scaler = StandardScaler()
num_feats = scaler.fit_transform(df[num_cols])

In [13]:
df["has_images"] = df["has_images"].astype(int)
bool_feats = df[["has_images"]].values

In [14]:
# ----------------------
# 4) Budowa sparse–macierzy
# ----------------------

dataset = Dataset()
dataset.fit(df["user_id"], df["item_id"])

train_tuples = list(zip(df["user_id"], df["item_id"], df["rating"]))
train_interactions, train_weights = dataset.build_interactions(train_tuples)

test_tuples = list(df_test["user_id"])
test_interactions, _ = dataset.build_interactions(((u, 0, 1.0) for u in test_tuples))

print("Train matrix:", train_interactions.shape, " nnz =", train_interactions.getnnz())
print("Test  matrix:", test_interactions.shape,  " nnz =", test_interactions.getnnz())


Train matrix: (868218, 76747)  nnz = 2543146
Test  matrix: (868218, 76747)  nnz = 412461


In [15]:
print("Shape:", train_interactions.shape)
print("Non-zeros:", train_interactions.nnz)
print("Row indices:", train_interactions.row)
print("Col indices:", train_interactions.col)
print("Data (ratings):", train_interactions.data)


Shape: (868218, 76747)
Non-zeros: 2543146
Row indices: [     0      0      0 ... 868217 868217 868217]
Col indices: [  0   1   2 ... 689 137 259]
Data (ratings): [1 1 1 ... 1 1 1]


In [16]:
#df['iidx'] = df['item_id'].map(item2idx).astype(int)

#df = df.sort_values("iidx").reset_index(drop=True)
df.drop(columns=['user_id', 'item_id'], inplace=True)

item_features_sparse = hstack([
    csr_matrix(num_feats),
    csr_matrix(bool_feats),
    cat_feats
]).tocsr()

In [17]:
print("Shape:", train_interactions.shape)
print("Non-zeros:", train_interactions.nnz)
print("Row indices:", train_interactions.row)
print("Col indices:", train_interactions.col)
print("Data (ratings):", train_interactions.data)


Shape: (868218, 76747)
Non-zeros: 2543146
Row indices: [     0      0      0 ... 868217 868217 868217]
Col indices: [  0   1   2 ... 689 137 259]
Data (ratings): [1 1 1 ... 1 1 1]


In [None]:
import numpy as np
import pandas as pd
from lightfm import LightFM

# --- 1) Trenujemy LightFM na binarnej macierzy interakcji ---
model = LightFM(
    loss="warp",
    no_components=128,       # zwiększona liczba komponentów
    learning_rate=0.01,     # obniżony learning rate
    item_alpha=1e-4,         # silniejsza regularizacja cech
    user_alpha=1e-5,         # lekka regularyzacja userów
    random_state=42
)

# === 3. Trening z ręcznym śledzeniem epok ===
n_epochs = 50
model.fit(
    train_interactions,
    item_features=item_features_sparse,
    epochs=n_epochs,
    num_threads=8,
    verbose=True
)

Epoch:  52%|█████▏    | 26/50 [21:31<19:51, 49.63s/it]

Ewaluacja

In [None]:
# --- Parametry ---
n_eval_users = 1000
K            = 10


# --- 2) Precompute embeddingi ---
item_emb = item_features_sparse.dot(model.item_embeddings)
user_emb = model.user_embeddings

# --- 3)  ground‐truth z hold‐outu ---
coo       = test_interactions.tocoo()
val       = pd.DataFrame({"u": coo.row, "i": coo.col})
true_dict = val.groupby("u")["i"].apply(set).to_dict()


# --- 4) Funkcja AP@K ---
def apk(actual, predicted, k=K):
    if len(predicted) > k:
        predicted = predicted[:k]
    score, hits = 0.0, 0
    for idx, p in enumerate(predicted, start=1):
        if p in actual:
            hits += 1
            score += hits/idx
    return score / min(len(actual), k) if actual else 0.0

# --- 5) MAP@10 na pierwszych n_eval_users ---
all_users  = list(true_dict.keys())
eval_users = all_users[:n_eval_users]

map_scores = []
seen_csr = train_interactions.tocsr()

for u in eval_users:
    # retrieval: dot-product
    scores = item_emb.dot(user_emb[u])
    # maskowanie trainowych
    seen = seen_csr[u].indices
    scores[seen] = -np.inf
    # wybór top-K
    topk = np.argpartition(-scores, K)[:K]
    topk = topk[np.argsort(-scores[topk])]
    # AP@K
    map_scores.append(apk(true_dict[u], topk, k=K))

print(f"MAP@{K} na pierwszych {n_eval_users} użytkownikach: {np.mean(map_scores):.4f}")
