In [1]:
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from chrono_split import chrono_split

In [2]:
RATINGS_PATH = Path("../data/goodreads_fantasy/goodreads_reviews_fantasy_paranormal.csv")
FEATURES_PATH=Path("../data/goodreads/goodreads_book_genres_initial.csv")
DATA_LABEL="goodreads"

In [3]:
if DATA_LABEL=="goodreads":
    sep=","
    usecols_ratings=["book_id", "date_updated", "rating", "user_id"]
    usecols_features=None
elif DATA_LABEL=="kinopoisk":
    sep="|"
    usecols_ratings=None
    usecols_features=["id", "genre"]

In [4]:
ratings=pd.read_csv(
    filepath_or_buffer=RATINGS_PATH,
    sep=sep,
    usecols=usecols_ratings
)
features=pd.read_csv(
    filepath_or_buffer=FEATURES_PATH,
    sep=sep,
    usecols=usecols_features
)

In [5]:
ratings.head()

Unnamed: 0,book_id,date_updated,rating,user_id
0,18245960,Wed Aug 30 00:00:26 -0700 2017,5,8842281e1d1347389f2ab93d60773d4d
1,5577844,Wed Oct 01 00:31:56 -0700 2014,5,8842281e1d1347389f2ab93d60773d4d
2,17315048,Wed Mar 22 11:33:10 -0700 2017,5,8842281e1d1347389f2ab93d60773d4d
3,13453029,Sat Jul 26 11:43:28 -0700 2014,4,8842281e1d1347389f2ab93d60773d4d
4,13239822,Wed Mar 22 11:32:20 -0700 2017,3,8842281e1d1347389f2ab93d60773d4d


In [6]:
features.head()

Unnamed: 0,item_id,genres
0,5333265,"history, historical fiction, biography"
1,1333909,"fiction, history, historical fiction, biography"
2,7327624,"fantasy, paranormal, fiction, mystery, thrille..."
3,6066819,"fiction, romance, mystery, thriller, crime"
4,287140,non-fiction


Для обучения рекомендательных алгоритмов нужны только айдишники пользователя и товара, оценка и дата. Приведу их названия к унифицированному виду для дальнейшего удобства.

In [9]:
if DATA_LABEL=="goodreads":
    colnames_ratings_map={"book_id": "item_id", "date_updated": "timestamp"}
    colnames_features_map={"genres": "features"}
elif DATA_LABEL=="kinopoisk":
    colnames_ratings_map={"movie_id": "item_id", "time": "timestamp", "score": "rating"}
    colnames_features_map={"id": "item_id", "genre": "features"}

In [10]:
ratings.rename(columns=colnames_ratings_map, inplace=True)
features.rename(columns=colnames_features_map, inplace=True)

Удостоверюсь, что в данных будут только айтемы с признаками. Уберу из датасета с признаками айтемы без признаков.

In [11]:
features=features[~features.features.isna()]

In [12]:
if DATA_LABEL=="kinopoisk":
    features.loc[:, "features"]=features.features.apply(lambda x: "|".join(eval(x)))
else:
    features.loc[:, "features"]=features.features.apply(lambda x: "|".join(x.split(", ")))

In [13]:
features.head()

Unnamed: 0,item_id,features
0,5333265,history|historical fiction|biography
1,1333909,fiction|history|historical fiction|biography
2,7327624,fantasy|paranormal|fiction|mystery|thriller|cr...
3,6066819,fiction|romance|mystery|thriller|crime
4,287140,non-fiction


In [14]:
features=features[features.features!=""]

Уберу интеракции, где признаков нет.

In [15]:
ratings=ratings[ratings.item_id.isin(features.item_id)&ratings.rating!=0]

Трансформирую дату к корректному виду.

In [19]:
if DATA_LABEL=="goodreads":
    utc=True
    unit=None
elif DATA_LABEL=="kinopoisk":
    utc=None
    unit="s"

In [20]:
ratings.loc[:, "timestamp"]=pd.to_datetime(ratings.loc[:, "timestamp"], unit=unit, utc=utc)

In [21]:
ratings.head()

Unnamed: 0,item_id,timestamp,rating,user_id
0,18245960,2017-08-30 07:00:26+00:00,5,8842281e1d1347389f2ab93d60773d4d
1,5577844,2014-10-01 07:31:56+00:00,5,8842281e1d1347389f2ab93d60773d4d
2,17315048,2017-03-22 18:33:10+00:00,5,8842281e1d1347389f2ab93d60773d4d
4,13239822,2017-03-22 18:32:20+00:00,3,8842281e1d1347389f2ab93d60773d4d
5,62291,2017-03-22 18:47:02+00:00,5,8842281e1d1347389f2ab93d60773d4d


In [22]:
if DATA_LABEL=="kinopoisk":
    ratings=ratings[ratings.timestamp.dt.year >= 2020]

Проверю на наличие дубликатов отзывов.

In [23]:
ratings[ratings.duplicated(["user_id", "item_id"], keep = False)]

Unnamed: 0,item_id,timestamp,rating,user_id


Удалю дубликаты, оставив самую актуальную оценку.

In [24]:
ratings.sort_values("timestamp", inplace = True)
ratings = ratings[~ratings.duplicated(["user_id", "item_id"], keep = "last")]
ratings.reset_index(drop = True, inplace = True)

In [25]:
ratings.head()

Unnamed: 0,item_id,timestamp,rating,user_id
0,2,2006-12-06 11:12:49+00:00,5,7b00e728d1ce3c95a16b90e389dfbb90
1,5907,2007-02-06 18:05:31+00:00,5,6a22bb07026c9bb6f17922d2d61116ae
2,34493,2007-02-15 03:26:51+00:00,3,39fbda7c84be941c0b616df0413be729
3,104359,2007-02-20 06:05:09+00:00,5,39fbda7c84be941c0b616df0413be729
4,49627,2007-02-23 01:45:53+00:00,5,416ca9250de82ccdba40d355a369c8b4


Статистика по датасету:

In [26]:
# Goodreads Fantasy
n_users = len(ratings["user_id"].unique())
n_items = len(ratings["item_id"].unique())
n_reviews = len(ratings)

print(f"Users: {n_users}")
print(f"Items: {n_items}")
print(f"Reviews: {n_reviews}")
print(f"Reviews/Users: {round(n_reviews/n_users, 2)}")
print(f"Density: {round(n_reviews/(n_users * n_items) * 100, 3)}%")

Users: 217937
Items: 203227
Reviews: 1948378
Reviews/Users: 8.94
Density: 0.004%


In [36]:
# Kinopoisk
n_users = len(ratings["user_id"].unique())
n_items = len(ratings["item_id"].unique())
n_reviews = len(ratings)

print(f"Users: {n_users}")
print(f"Items: {n_items}")
print(f"Reviews: {n_reviews}")
print(f"Reviews/Users: {round(n_reviews/n_users, 2)}")
print(f"Density: {round(n_reviews/(n_users * n_items) * 100, 3)}%")

Users: 34860
Items: 57838
Reviews: 3079710
Reviews/Users: 88.35
Density: 0.153%


Разделю данные на тренировочную и тестовую выборки по хронологическому принципу. Для этого для начала посмотрю на распределение интеракций по годам.

In [27]:
ratings.timestamp.dt.year.value_counts().sort_index()

2006         1
2007      5156
2008     24751
2009     41267
2010     68846
2011    127374
2012    192538
2013    243305
2014    257349
2015    304819
2016    360976
2017    321996
Name: timestamp, dtype: int64

In [70]:
if DATA_LABEL=="goodreads":
    val_threshold, test_threshold="01-01-2015", "01-01-2016"
elif DATA_LABEL=="kinopoisk":
    val_threshold, test_threshold="01-01-2021", "02-01-2021"

In [71]:
train, val, test = chrono_split(
    data=ratings,
    user_col="user_id",
    item_col="item_id",
    timestamp_col="timestamp",
    val_threshold=val_threshold,
    test_threshold=test_threshold,
    min_train_ratings=2,
    min_test_ratings=10
)

In [76]:
train.reset_index(drop=True, inplace=True)
val.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

train.drop(columns="timestamp", inplace=True)
val.drop(columns="timestamp", inplace=True)
test.drop(columns="timestamp", inplace=True)

In [77]:
train=train.loc[:,["user_id","item_id","rating"]]
val=val.loc[:,["user_id","item_id","rating"]]
test=test.loc[:,["user_id","item_id","rating"]]

Уберу из датасета с признаками айтемы, которых нет в трейне.

In [78]:
features=features[features.item_id.isin(train.item_id)]

In [79]:
features.reset_index(drop=True, inplace=True)

Трансформирую оригинальные айдишники пользователей и товаров в целые числа.

In [80]:
user_id_encoder, item_id_encoder=LabelEncoder(), LabelEncoder()

In [81]:
train.loc[:, "user_id"]=user_id_encoder.fit_transform(train.user_id)
train.loc[:, "item_id"]=item_id_encoder.fit_transform(train.item_id)

val.loc[:, "user_id"]=user_id_encoder.transform(val.user_id)
val.loc[:, "item_id"]=item_id_encoder.transform(val.item_id)

test.loc[:, "user_id"]=user_id_encoder.transform(test.user_id)
test.loc[:, "item_id"]=item_id_encoder.transform(test.item_id)

features.loc[:, "item_id"]=item_id_encoder.transform(features.item_id)

In [82]:
features

Unnamed: 0,item_id,features
0,19330,fantasy|paranormal|fiction|mystery|thriller|cr...
1,8024,fiction|history|historical fiction|biography|f...
2,8023,fantasy|paranormal|fiction|romance|history|his...
3,10656,fiction|fantasy|paranormal|children|non-fiction
4,2410,fiction|fantasy|paranormal|mystery|thriller|crime
...,...,...
60076,4482,fantasy|paranormal|young-adult|fiction|childre...
60077,45919,fiction|fantasy|paranormal|mystery|thriller|cr...
60078,20760,fantasy|paranormal|children|poetry|fiction|you...
60079,50914,fantasy|paranormal|romance


Сохраню оригинальные айдишники.

In [83]:
np.savetxt(RATINGS_PATH.parent.joinpath("user_ids.txt"), user_id_encoder.classes_, fmt="%s", delimiter="\n")
np.savetxt(RATINGS_PATH.parent.joinpath("item_ids.txt"), item_id_encoder.classes_, fmt="%s", delimiter="\n")

Сохраню получившиеся датасеты на диск.

In [84]:
train.to_csv(RATINGS_PATH.parent.joinpath(DATA_LABEL+"_train.csv"), index=False)
val.to_csv(RATINGS_PATH.parent.joinpath(DATA_LABEL+"_dev.csv"), index=False)
test.to_csv(RATINGS_PATH.parent.joinpath(DATA_LABEL+"_test.csv"), index=False)
features.to_csv(RATINGS_PATH.parent.joinpath(DATA_LABEL+"_features.csv"), index=False)