In [1]:
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
items = pd.read_parquet("items.par")
events = pd.read_parquet("events.par")

In [3]:
# зададим точку разбиения
train_test_global_time_split_date = pd.to_datetime("2017-08-01").date()

train_test_global_time_split_idx = events["started_at"] < train_test_global_time_split_date
events_train = events[train_test_global_time_split_idx]
events_test = events[~train_test_global_time_split_idx]

In [4]:
# количество пользователей в train и test
users_train = events_train["user_id"].drop_duplicates()
users_test = events_test["user_id"].drop_duplicates()

In [5]:
import scipy
import sklearn.preprocessing

# перекодируем идентификаторы пользователей: 
# из имеющихся в последовательность 0, 1, 2, ...
user_encoder = sklearn.preprocessing.LabelEncoder()
user_encoder.fit(events["user_id"])
events_train["user_id_enc"] = user_encoder.transform(events_train["user_id"])
events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])

# перекодируем идентификаторы объектов: 
# из имеющихся в последовательность 0, 1, 2, ...
item_encoder = sklearn.preprocessing.LabelEncoder()
item_encoder.fit(items["item_id"])
items["item_id_enc"] = item_encoder.transform(items["item_id"])
events_train["item_id_enc"] = item_encoder.transform(events_train["item_id"])
events_test["item_id_enc"] = item_encoder.transform(events_test["item_id"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["user_id_enc"] = user_encoder.transform(events_train["user_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["item_id_enc"] = item_encoder.transfor

In [6]:
user_item_matrix_train = scipy.sparse.csr_matrix((
    events_train["rating"],
    (events_train['user_id_enc'], events_train['item_id_enc'])),
    dtype=np.int8)

In [11]:
from implicit.als import AlternatingLeastSquares

als_model = AlternatingLeastSquares(factors=50, iterations=50, regularization=0.05, random_state=0)
als_model.fit(user_item_matrix_train)

  check_blas_config()


  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
def get_recommendations_als(user_item_matrix, model, user_id, user_encoder, item_encoder, include_seen=True, n=5):
    """
    Возвращает отранжированные рекомендации для заданного пользователя
    """
    user_id_enc = user_encoder.transform([user_id])[0]
    recommendations = model.recommend(
         user_id_enc, 
         user_item_matrix[user_id_enc], 
         filter_already_liked_items=not include_seen,
         N=n)
    recommendations = pd.DataFrame({"item_id_enc": recommendations[0], "score": recommendations[1]})
    recommendations["item_id"] = item_encoder.inverse_transform(recommendations["item_id_enc"])
    
    return recommendations

In [None]:
# # преобразуем полученные рекомендации в табличный формат
# item_ids_enc = als_recommendations[0]
# als_scores = als_recommendations[1]

# als_recommendations = pd.DataFrame({
#     "user_id_enc": user_ids_encoded,
#     "item_id_enc": item_ids_enc.tolist(), 
#     "score": als_scores.tolist()})
# als_recommendations = als_recommendations.explode(["item_id_enc", "score"], ignore_index=True)

# # приводим типы данных
# als_recommendations["item_id_enc"] = als_recommendations["item_id_enc"].astype("int")
# als_recommendations["score"] = als_recommendations["score"].astype("float")

# # получаем изначальные идентификаторы
# als_recommendations["user_id"] = user_encoder.inverse_transform(als_recommendations["user_id_enc"])
# als_recommendations["item_id"] = item_encoder.inverse_transform(als_recommendations["item_id_enc"])
# als_recommendations = als_recommendations.drop(columns=["user_id_enc", "item_id_enc"])

In [12]:
# получим энкодированные идентификаторы всех объектов, известных нам из events_train
train_item_ids_enc = events_train['item_id_enc'].unique()

max_similar_items = 10

# получаем списки похожих объектов, используя ранее полученную ALS-модель
# метод similar_items возвращает и сам объект, как наиболее похожий
# этот объект мы позже отфильтруем, но сейчас запросим на 1 больше
similar_items = als_model.similar_items(train_item_ids_enc, N=max_similar_items+1)

# преобразуем полученные списки в табличный формат
sim_item_item_ids_enc = similar_items[0]
sim_item_scores = similar_items[1]

similar_items = pd.DataFrame({
    "item_id_enc": train_item_ids_enc,
    "sim_item_id_enc": sim_item_item_ids_enc.tolist(), 
    "score": sim_item_scores.tolist()}
    )
similar_items = similar_items.explode(["sim_item_id_enc", "score"], ignore_index=True)

# приводим типы данных
similar_items["sim_item_id_enc"] = similar_items["sim_item_id_enc"].astype("int")
similar_items["score"] = similar_items["score"].astype("float")

# получаем изначальные идентификаторы
similar_items["item_id_1"] = item_encoder.inverse_transform(similar_items["item_id_enc"])
similar_items["item_id_2"] = item_encoder.inverse_transform(similar_items["sim_item_id_enc"])
similar_items = similar_items.drop(columns=["item_id_enc", "sim_item_id_enc"])

# убираем пары с одинаковыми объектами
similar_items = similar_items.query("item_id_1 != item_id_2")

In [15]:
similar_items[similar_items['item_id_1'] == 7126].sort_values('score', ascending=False)

Unnamed: 0,score,item_id_1,item_id_2
25873,0.948722,7126,7190
25874,0.940994,7126,24280
25875,0.930142,7126,1953
25876,0.925065,7126,58696
25877,0.916334,7126,38296
25878,0.91601,7126,2932
25879,0.913948,7126,7184
25880,0.911431,7126,387749
25881,0.909866,7126,7733
25882,0.909448,7126,30597


In [22]:
items[items['item_id'] == 24280]

Unnamed: 0,item_id,author,title,description,genre_and_votes,num_pages,average_rating,ratings_count,text_reviews_count,publisher,publication_year,country_code,language_code,format,is_ebook,isbn,isbn13,genre_and_votes_dict,genre_and_votes_str,item_id_enc
395155,7190,Alexandre Dumas,"The Three Musketeers (The D'Artagnan Romances,...",One of the most celebrated and popular histori...,"{'Classics': 9823, 'Fiction': 3256, 'Historica...",786,4.06,198892,3195,Modern Library,2001,US,,Paperback,False,,,"{'Academic': None, 'Academic-Academia': None, ...","Classics 9823, Fiction 3256, Historical-Histor...",844


In [23]:
similar_items.to_parquet("similar_items.parquet") 

In [24]:
def print_sim_items(item_id, similar_items):

    item_columns_to_use = ["item_id", "author", "title", "genre_and_votes", "average_rating", "ratings_count"]
    
    item_id_1 = items.query("item_id == @item_id")[item_columns_to_use]
    display(item_id_1)
    
    si = similar_items.query("item_id_1 == @item_id")
    si = si.merge(items[item_columns_to_use].set_index("item_id"), left_on="item_id_2", right_index=True)
    display(si)

In [27]:
similar_items

Unnamed: 0,score,item_id_1,item_id_2
1,0.922486,22034,22026
2,0.874760,22034,6882
3,0.873759,22034,22028
4,0.850648,22034,364089
5,0.835727,22034,9827
...,...,...,...
456209,0.534905,21847032,19904043
456210,0.515327,21847032,6167746
456211,0.507710,21847032,17908487
456212,0.496354,21847032,6349976


In [26]:
print_sim_items(17245, similar_items)

Unnamed: 0,item_id,author,title,genre_and_votes,average_rating,ratings_count
1058909,17245,"Bram Stoker, Nina Auerbach, David J. Skal",Dracula,"{'Classics': 19603, 'Horror': 10601, 'Fiction'...",3.98,636895


Unnamed: 0,score,item_id_1,item_id_2,author,title,genre_and_votes,average_rating,ratings_count
23937,0.928822,17245,480204,"Gaston Leroux, Alexander Teixeira de Mattos",The Phantom of the Opera,"{'Classics': 7010, 'Fiction': 2103, 'Horror': ...",3.97,144859
23938,0.900337,17245,51496,"Robert Louis Stevenson, Vladimir Nabokov, Merv...",The Strange Case of Dr. Jekyll and Mr. Hyde,"{'Classics': 12342, 'Fiction': 4037, 'Horror':...",3.79,229898
23939,0.898939,17245,93261,Washington Irving,The Legend of Sleepy Hollow,"{'Classics': 2594, 'Horror': 1182, 'Fiction': ...",3.74,26776
23940,0.89771,17245,295,Robert Louis Stevenson,Treasure Island,"{'Classics': 11249, 'Fiction': 4405, 'Adventur...",3.82,274424
23941,0.896468,17245,2623,"Charles Dickens, Marisa Sestino",Great Expectations,"{'Classics': 19645, 'Fiction': 6662, 'Literatu...",3.75,468462
23942,0.895995,17245,18254,"Charles Dickens, Philip Horne, Gerald Dickens",Oliver Twist,"{'Classics': 11450, 'Fiction': 3656, 'Historic...",3.85,235560
23943,0.886901,17245,7190,Alexandre Dumas,"The Three Musketeers (The D'Artagnan Romances,...","{'Classics': 9823, 'Fiction': 3256, 'Historica...",4.06,198892
23944,0.881915,17245,24213,"Lewis Carroll, John Tenniel, Martin Gardner",Alice's Adventures in Wonderland & Through the...,"{'Classics': 11568, 'Fantasy': 6184, 'Fiction'...",4.06,344482
23945,0.878392,17245,2932,"Daniel Defoe, Virginia Woolf",Robinson Crusoe,"{'Classics': 7725, 'Fiction': 3305, 'Adventure...",3.66,181415
23946,0.870233,17245,1953,"Charles Dickens, Richard Maxwell",A Tale of Two Cities,"{'Classics': 20021, 'Fiction': 6969, 'Historic...",3.82,646983


In [None]:
# import requests

# recommendations_url = "http://127.0.0.1:8000"
# features_store_url = "http://127.0.0.1:8010"
# events_store_url = "http://127.0.0.1:8020" 

# headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
# params = {"user_id": 1291250, 'k': 10}

# resp_offline = requests.post(recommendations_url + "/recommendations_offline", headers=headers, params=params)
# resp_online = requests.post(recommendations_url + "/recommendations_online", headers=headers, params=params)
# resp_blended = requests.post(recommendations_url + "/recommendations", headers=headers, params=params)

# recs_offline = resp_offline.json()["recs"]
# recs_online = resp_online.json()["recs"]
# recs_blended = resp_blended.json()["recs"]

# print(recs_offline)
# print(recs_online)
# print(recs_blended)

In [None]:
# def display_items(item_ids):

#     item_columns_to_use = ["item_id", "author", "title", "genre_and_votes", "average_rating", "ratings_count"]
    
#     items_selected = items.query("item_id in @item_ids")[item_columns_to_use]
#     items_selected = items_selected.set_index("item_id").reindex(item_ids)
#     items_selected = items_selected.reset_index()
    
#     display(items_selected)
    
# print("Онлайн-события")
# display_items(event_item_ids)
# print("Офлайн-рекомендации")
# display_items(recs_offline)
# print("Онлайн-рекомендации")
# display_items(recs_online)
# print("Рекомендации")
# display_items(recs_blended)