In [3]:
pip install lightfm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightfm
  Downloading lightfm-1.16.tar.gz (310 kB)
[K     |████████████████████████████████| 310 kB 26.9 MB/s 
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.16-cp37-cp37m-linux_x86_64.whl size=705371 sha256=6c133f48646319c469c5aa02f1f3c18828b96ce0c43d992fb7154662fabb2bcb
  Stored in directory: /root/.cache/pip/wheels/f8/56/28/5772a3bd3413d65f03aa452190b00898b680b10028a1021914
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.16


In [4]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from lightfm import LightFM
import multiprocessing as mp

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.nn import functional as F
import random
from pathlib import Path
import os

# Datasets

In [6]:
datasets_path = '/content/drive/MyDrive/'
datasets_path = Path(datasets_path)

!ls '/content/drive/MyDrive/'

 beer_dataset	        beer.txt	   data		        github	 tmp
 beer_reviews.csv.zip  'Colab Notebooks'   DeepLearn-main.zip   ssh


In [7]:
def evaluate(model, dataset, args):
    [train, valid, test, usernum, itemnum] = copy.deepcopy(dataset)

    NDCG = 0.0
    HT = 0.0
    valid_user = 0.0

    if usernum>10000:
        users = random.sample(range(1, usernum + 1), 10000)
    else:
        users = range(1, usernum + 1)
    for u in users:

        if len(train[u]) < 1 or len(test[u]) < 1: continue

        seq = np.zeros([args.maxlen], dtype=np.int32)
        idx = args.maxlen - 1
        seq[idx] = valid[u][0]
        idx -= 1
        for i in reversed(train[u]):
            seq[idx] = i
            idx -= 1
            if idx == -1: break
        rated = set(train[u])
        rated.add(0)
        item_idx = [test[u][0]]
        for _ in range(100):
            t = np.random.randint(1, itemnum + 1)
            while t in rated: t = np.random.randint(1, itemnum + 1)
            item_idx.append(t)

        predictions = -model.predict(*[np.array(l) for l in [[u], [seq], item_idx]])
        predictions = predictions[0] # - for 1st argsort DESC

        rank = predictions.argsort().argsort()[0].item()

        valid_user += 1

        if rank < 10:
            NDCG += 1 / np.log2(rank + 2)
            HT += 1
        if valid_user % 100 == 0:
            print('.', end="")
            sys.stdout.flush()

    return NDCG / valid_user, HT / valid_user


# evaluate on val set
def evaluate_valid(model, dataset, args):
    [train, valid, test, usernum, itemnum] = copy.deepcopy(dataset)

    NDCG = 0.0
    valid_user = 0.0
    HT = 0.0
    if usernum>10000:
        users = random.sample(range(1, usernum + 1), 10000)
    else:
        users = range(1, usernum + 1)
    for u in users:
        if len(train[u]) < 1 or len(valid[u]) < 1: continue

        seq = np.zeros([args.maxlen], dtype=np.int32)
        idx = args.maxlen - 1
        for i in reversed(train[u]):
            seq[idx] = i
            idx -= 1
            if idx == -1: break

        rated = set(train[u])
        rated.add(0)
        item_idx = [valid[u][0]]
        for _ in range(100):
            t = np.random.randint(1, itemnum + 1)
            while t in rated: t = np.random.randint(1, itemnum + 1)
            item_idx.append(t)

        predictions = -model.predict(*[np.array(l) for l in [[u], [seq], item_idx]])
        predictions = predictions[0]

        rank = predictions.argsort().argsort()[0].item()

        valid_user += 1

        if rank < 10:
            NDCG += 1 / np.log2(rank + 2)
            HT += 1
        if valid_user % 100 == 0:
            print('.', end="")
            sys.stdout.flush()

    return NDCG / valid_user, HT / valid_user

In [8]:
# ratings = pd.read_csv('ml-1m/ratings.dat', delimiter='::', header=None, 
#         names=['user_id', 'beer_id', 'rating', 'timestamp'], 
#         usecols=['user_id', 'beer_id', 'rating'], engine='python')

# beer_info = pd.read_csv('ml-1m/beers.dat', delimiter='::', header=None, 
#         names=['beer_id', 'name', 'category'], engine='python')
file = 'beer.txt'#'Beauty.txt'#   Steam.txt  Video.txt  ml-1m.txt
        
print('dataset', file)
users = []
beers = []
with open(datasets_path / file, 'r') as data:
    lines = data.readlines()

    for line in lines:
        user_beer = line.split(' ')
        user = int(user_beer[0])
        beer = int(user_beer[1])

        users.append(user)
        beers.append(beer)

df_data = {
    'user_id': users,
    'beer_id': beers
}
interactions = pd.DataFrame.from_dict(df_data)
ratings = interactions

dataset beer.txt


In [9]:
#ratings = ratings.loc[(ratings['rating'] >= 4)]
users = ratings["user_id"]
beers = ratings["beer_id"]
user_item = sp.coo_matrix((np.ones_like(users), (users, beers)))
user_item_csr = user_item.tocsr()

In [10]:
unique_users = np.unique(users)
unique_beers = set(np.unique(beers))

grouped_interactions = ratings.groupby('user_id')['beer_id'].apply(list)

train_dataset = {}
test_dataset = {}
negative_dataset = {}

for user_id, user_beers in grouped_interactions.iteritems():
    if len(user_beers) < 2:
        continue

    train_dataset[user_id] = user_beers[:-1]
    test_dataset[user_id] = user_beers[-1]
    negative_dataset[user_id] =  list(unique_beers - set(user_beers))
    
total_users = list(test_dataset.keys())

In [11]:
len(train_dataset), len(total_users)

(32908, 32908)

In [12]:
def extract_csr_data(interactions):
    coo_users = []
    for user_id in interactions:
        coo_users.append(np.full(len(interactions[user_id]), user_id))
    
    coo_users = np.hstack(coo_users)
    coo_beers = []
    for user_id in interactions:
        coo_beers.append(np.array(interactions[user_id]))
    
    coo_beers = np.hstack(coo_beers)
    
    user_item = sp.coo_matrix((np.ones_like(coo_users), (coo_users, coo_beers)))
    return user_item.tocsr()

train_data = extract_csr_data(train_dataset)

# Evaluation 

Аналогично статье из NCF будем сравнивать все модели по метрикам Hit rate(HR@K) и NDCG@K. K = 10
Помимо одного позитива, также добавим 99 случайных негативных фильмов для пользователя, тем самым будем оценивать эти метрики относительно ранжирования этих 1 + 99 фильмов.

In [13]:
def evaluate_metrics_for_user(args):
    k = 10
    model, user_id = args
    if user_id not in negative_dataset[user_id]:
        return None
    
    last_user_beer = test_dataset[user_id]
    np.random.shuffle(negative_dataset[user_id])
    random_negative_beers = negative_dataset[user_id][:99]

    input_beers = np.array([last_user_beer] + list(random_negative_beers))
    input_user = np.full(len(input_beers), user_id)
    
    pred = model.predict(input_user, input_beers)

    top_beers = input_beers[np.argsort(pred)[-k:]]

    hit_rate = 1 if last_user_beer in top_beers else 0

    ndcg = 0 # TODO? 1 / np.log2(rank + 2)
    for position, beer in enumerate(top_beers):
        if beer == last_user_beer:
            ndcg = 1 / np.log2(position + 2)
            break
    return hit_rate, ndcg

def evaluate_model(model):
    with mp.Pool(mp.cpu_count()) as pool:
        users_len = len(total_users)
        metrics = pool.map(evaluate_metrics_for_user, zip([model] * users_len, total_users))
        hrs = [metric[0] for metric in metrics if metric is not None]
        ndcgs = [metric[1] for metric in metrics if metric is not None]

    print('Mean HR', np.mean(hrs))
    print('Mean NDCG', np.mean(ndcgs))

In [14]:
baseline_warp = LightFM(
    no_components=64, 
    learning_rate=0.01,
    loss='warp',
    max_sampled=200
)

baseline_warp.fit(train_data, epochs=40, verbose=True, num_threads=mp.cpu_count())

Epoch: 100%|██████████| 40/40 [10:31<00:00, 15.79s/it]


<lightfm.lightfm.LightFM at 0x7f56d7dbd490>

In [15]:
evaluate_model(baseline_warp)

Mean HR 0.7635184508586043
Mean NDCG 0.26705864383733013


In [16]:
baseline_bpr = LightFM(
    no_components=64, 
    learning_rate=0.01,
    loss='bpr',
    max_sampled=200
)

baseline_bpr.fit(train_data, epochs=40, verbose=True, num_threads=mp.cpu_count())

Epoch: 100%|██████████| 40/40 [01:49<00:00,  2.73s/it]


<lightfm.lightfm.LightFM at 0x7f56d7dc4810>

In [17]:
evaluate_model(baseline_bpr)

Mean HR 0.7637924735111435
Mean NDCG 0.25274897526772244


In [19]:
beer_info = pd.read_csv('/content/drive/MyDrive/beer_dataset/beer_reviews.csv')

In [20]:
beer_info.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [25]:
class Recommender:
    
    def __init__(self, model, user_emb, item_emb, bias_u=None, bias_i=None):
        self.model = model
        self.user_emb = user_emb
        self.user_bias = bias_u
        self.item_emb = item_emb
        self.item_bias = bias_i
        
    def predict(self, users, beers):
        return self.model.predict(users, beers)
    
    def similars(self, toy_beer_id=1, top=10):
        input_vector = self.item_emb[toy_beer_id]

        data = []
        for item_idx, column in enumerate(self.item_emb):
            dst = np.linalg.norm(column - input_vector)
            data.append((item_idx, dst))

        sorted_by_dst = list(sorted(data, key=lambda val: val[1]))

        similars = []
        for item in sorted_by_dst:
            search = beer_info[beer_info["beer_name"] == item[0]]
            beer_name = search["beer_name"].to_string()
            if len(search) > 0:
                similars.append((item[0], beer_name))

        return similars[:top]

    def recommend(self, user_id=4, top=10):
        new_beer_ids = negative_dataset[user_id]

        data = []
        for beer_id in new_beer_ids:
            bias_w = self.user_bias[user_id] if self.user_bias is not None else 0
            bias_h = self.item_bias[beer_id] if self.item_bias is not None else 0

            dot = np.dot(self.user_emb[user_id], self.item_emb[beer_id])
            data.append((beer_id, dot + bias_w + bias_h))

        data = list(sorted(data, key=lambda val: val[1], reverse=True))
        recommendations = [beer_info[beer_info["beer_beerid"] == x[0]]["beer_name"].to_string() for x in data]
        return recommendations[:top]

# Baseline MF model: LightFM warp

In [26]:
baseline_recommender = Recommender(
    baseline_warp, 
    baseline_warp.user_embeddings, 
    baseline_warp.item_embeddings,
    baseline_warp.user_biases,
    baseline_warp.item_biases
)