In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from tqdm import tqdm

### v1

In [None]:
class ReorderRateClassificationModel:
    """Rule-based reorder classifier (baseline, no trained model involved).

    ``predict`` answers True for a product when either
    * the product was part of the user's most recent ``prior`` order, or
    * the product is among the ``top_reodered_items_n`` products with the
      highest mean ``reordered`` value over ``order_products_df``.

    Args:
        user_id: User the user-level features are built for. May be changed
            later through ``predict(..., user_id=...)``.
        orders_df: Order headers; the columns ``order_id``, ``user_id``,
            ``eval_set`` and ``order_number`` are read.
        order_products_df: Order lines; the columns ``order_id``,
            ``product_id`` and ``reordered`` are read.
        top_reodered_items_n: Number of globally most-reordered products that
            are always predicted as positive.

    Raises:
        ValueError: If ``orders_df`` or ``order_products_df`` is not given.
    """

    def __init__(
        self,
        user_id: "int | None" = None,
        orders_df: "pd.DataFrame | None" = None,
        order_products_df: "pd.DataFrame | None" = None,
        top_reodered_items_n: int = 10,
    ):
        # The previous defaults were the type objects themselves (``int``,
        # ``pd.DataFrame``), which only failed later with an obscure error.
        if orders_df is None or order_products_df is None:
            raise ValueError("orders_df and order_products_df are required")

        self.top_reordered_items_n = top_reodered_items_n

        self._get_orders(orders_df)
        self._get_order_products(order_products_df)

        self.user_id = user_id
        self._get_user_features(self.user_id)

        self._get_products_features()

    def _get_orders(self, orders_df: pd.DataFrame):
        """Store the order headers (to be replaced by a database query)."""
        self.orders = orders_df

    def _get_order_products(self, order_products_df: pd.DataFrame):
        """Store the order lines (to be replaced by a database query)."""
        self.orders_products = order_products_df

    def _get_last_order_products(self, user_id: int) -> pd.Series:
        """Return the product ids of the user's latest ``prior`` order.

        The latest order is the one with the highest ``order_number``. An
        empty integer Series is returned when the user has no prior orders.
        """
        # Build both conditions on the same frame so the boolean mask is
        # aligned with the rows it filters (no implicit reindexing).
        is_prior = self.orders["eval_set"] == "prior"
        is_user = self.orders["user_id"] == user_id
        prior_user_orders = self.orders[is_prior & is_user]

        if prior_user_orders.empty:
            return pd.Series([], dtype=int)

        last_prior_user_order_id = prior_user_orders.loc[
            prior_user_orders["order_number"].idxmax(), "order_id"
        ]
        return self.orders_products.loc[
            self.orders_products["order_id"] == last_prior_user_order_id,
            "product_id",
        ]

    def _get_top_reordered_items(self) -> pd.Series:
        """Return the ids of the products with the highest mean ``reordered``."""
        products_agg = (
            self.orders_products.groupby("product_id")["reordered"]
            .mean()
            .rename("product_reorder_rate")
            .reset_index()
        )
        top_reordered_items = products_agg.sort_values(
            by="product_reorder_rate", ascending=False
        ).head(self.top_reordered_items_n)

        return top_reordered_items["product_id"]

    def _get_products_features(self):
        """Build ``df_products_features`` (column ``top_reordered``)."""
        self.df_products_features = pd.DataFrame(
            {"top_reordered": self._get_top_reordered_items().values}
        )

    def _get_user_features(self, user_id: int):
        """Build ``df_user_features`` (column ``last_user_order``) for a user."""
        self.df_user_features = pd.DataFrame(
            {"last_user_order": self._get_last_order_products(user_id).values}
        )

    def predict(self, product_id: int, user_id: int = None):
        """Return True if the product is predicted to be reordered.

        Args:
            product_id: Product to classify.
            user_id: When given, the model switches to this user and rebuilds
                the user features before predicting.

        Returns:
            bool: True when the product is in the user's last prior order or
            among the globally most-reordered products, False otherwise.
        """
        # To be replaced by:
        #     features = self.get_features(user_id, product_id)
        #     return lgb_model.predict(features)

        if user_id is not None:
            self.user_id = user_id
            self._get_user_features(self.user_id)

        if product_id in self.df_user_features["last_user_order"].values:
            return True

        if product_id in self.df_products_features["top_reordered"].values:
            return True

        return False

In [None]:
# Location of the CSV files used by the v1 baseline.
data_path = "../data/"

# Order headers and the order lines of the "prior" orders.
orders_df = pd.read_csv(f"{data_path}orders.csv")
order_products_df = pd.read_csv(f"{data_path}order_products__prior.csv")

In [None]:
# Build the rule-based classifier for user 1 from the tables loaded above.
clf_model = ReorderRateClassificationModel(
    user_id=1, orders_df=orders_df, order_products_df=order_products_df
)

In [None]:
# Classify product 6433 for the user the model was built with (user_id=1).
clf_model.predict(6433)

In [None]:
class ReorderRateRecommendationModel:
    """Recommend products by sampling from the classifier's positives.

    Wraps a ``ReorderRateClassificationModel`` built from the same arguments
    and recommends a random subset of the products it classifies as positive.

    Args:
        user_id: User to recommend for.
        orders_df: Order headers, forwarded to the classification model.
        order_products_df: Order lines, forwarded to the classification model.
        top_reodered_items_n: Forwarded to the classification model; also the
            maximum number of recommendations returned by ``predict``.
    """

    def __init__(
        self,
        user_id: "int | None" = None,
        orders_df: "pd.DataFrame | None" = None,
        order_products_df: "pd.DataFrame | None" = None,
        top_reodered_items_n: int = 10,
    ):
        self.user_id = user_id

        self.clf_model = ReorderRateClassificationModel(
            user_id, orders_df, order_products_df, top_reodered_items_n
        )

    def predict(self):
        """Return up to ``top_reordered_items_n`` recommended product ids.

        Every distinct product id in the order lines is scored with the
        classifier; the result is a random sample (without replacement) of the
        positively classified products. The sample is unseeded, so repeated
        calls may return different products.

        Returns:
            numpy.ndarray: Recommended product ids; empty when the classifier
            flags no product.
        """
        # Candidates are product ids. The previous version iterated over
        # orders["order_id"], i.e. it scored order ids as if they were products.
        product_ids = self.clf_model.orders_products["product_id"].unique()

        recommendations_list = [
            product_id
            for product_id in tqdm(product_ids)
            if self.clf_model.predict(product_id)
        ]

        # Use this instance's classifier (not a notebook-level global) and never
        # request more items than are available: replace=False would raise.
        size = min(self.clf_model.top_reordered_items_n, len(recommendations_list))
        if size <= 0:
            return np.array([], dtype=int)

        return np.random.choice(recommendations_list, size=size, replace=False)

In [None]:
# Recommendation wrapper around the rule-based classifier, again for user 1.
rec_model = ReorderRateRecommendationModel(
    user_id=1, orders_df=orders_df, order_products_df=order_products_df
)

In [None]:
# The result is drawn with np.random.choice without a seed, so it can differ between runs.
rec_model.predict()


### v2

In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd

# Directory (with trailing slash) that get_data() prepends to the CSV file names.
IDIR = "../data/"

Загрузка таблиц (меняется на запросы к БД)

In [None]:
def get_data(data_dir=None):
    """Load the order/product CSV tables with compact dtypes.

    Args:
        data_dir: Directory that contains ``order_products__prior.csv``,
            ``order_products__train.csv``, ``orders.csv`` and ``products.csv``.
            When omitted, the module-level ``IDIR`` is used.

    Returns:
        tuple: ``(priors, train, orders, products)`` DataFrames. ``products``
        only keeps ``product_id``, ``aisle_id`` and ``department_id``.
    """
    import os

    base_dir = IDIR if data_dir is None else data_dir

    # Both order_products__*.csv files are read with the same dtypes.
    order_products_dtypes = {
        "order_id": np.int32,
        "product_id": np.uint16,
        "add_to_cart_order": np.int16,
        "reordered": np.int8,
    }

    print("loading prior")
    priors = pd.read_csv(
        os.path.join(base_dir, "order_products__prior.csv"),
        dtype=order_products_dtypes,
    )

    print("loading train")
    train = pd.read_csv(
        os.path.join(base_dir, "order_products__train.csv"),
        dtype=order_products_dtypes,
    )

    print("loading orders")
    orders = pd.read_csv(
        os.path.join(base_dir, "orders.csv"),
        dtype={
            "order_id": np.int32,
            "user_id": np.int32,
            "eval_set": "category",
            "order_number": np.int16,
            "order_dow": np.int8,
            "order_hour_of_day": np.int8,
            "days_since_prior_order": np.float32,
        },
    )

    print("loading products")
    products = pd.read_csv(
        os.path.join(base_dir, "products.csv"),
        dtype={
            "product_id": np.uint16,
            "aisle_id": np.uint8,
            "department_id": np.uint8,
        },
        usecols=["product_id", "aisle_id", "department_id"],
    )

    print("priors {}: {}".format(priors.shape, ", ".join(priors.columns)))
    print("orders {}: {}".format(orders.shape, ", ".join(orders.columns)))
    print("train {}: {}".format(train.shape, ", ".join(train.columns)))

    return priors, train, orders, products

In [None]:
# Load the raw tables; get_data() prints loading progress and the table shapes.
priors, train, orders, products = get_data()

Добавление информации об orders в prior

In [None]:
def extend_priors(priors, orders):
    """Attach the order-level columns of ``orders`` to every prior order line.

    Side effect: ``orders`` is re-indexed by ``order_id`` IN PLACE (the column
    itself is kept), so the caller's frame is indexed by order id afterwards.

    Args:
        priors: Order lines with an ``order_id`` column.
        orders: Order headers with an ``order_id`` column.

    Returns:
        pandas.DataFrame: New frame with the columns of ``priors`` followed by
        the columns of ``orders`` (the duplicated order id is removed).
    """
    orders.set_index("order_id", drop=False, inplace=True)

    # The right-hand copy of order_id gets the "_" suffix and is redundant.
    enriched = priors.join(orders, on="order_id", rsuffix="_")
    return enriched.drop(columns=["order_id_"])

In [None]:
# Rebinds `priors` to the joined table; extend_priors also re-indexes `orders`
# by order_id in place. Re-running this cell on the already-extended `priors`
# would join the order columns a second time (with a "_" suffix).
priors = extend_priors(priors, orders)

In [None]:
# Preview only the first rows instead of rendering the whole table.
priors.head()

Фичи продуктов

In [None]:
def get_product_features(priors, products):
    """Add per-product order statistics to the product table.

    Args:
        priors: Order lines with ``product_id`` and ``reordered`` columns.
        products: Product table with a ``product_id`` column.

    Returns:
        pandas.DataFrame: New frame indexed by ``product_id`` (column kept)
        with three extra columns: ``orders`` (number of order lines),
        ``reorders`` (sum of ``reordered``) and ``reorder_rate``
        (``reorders / orders``). Products absent from ``priors`` get NaN.
    """
    reordered_by_product = priors["reordered"].groupby(priors.product_id)

    stats = pd.DataFrame()
    stats["orders"] = reordered_by_product.size().astype(np.int32)
    stats["reorders"] = reordered_by_product.sum().astype(np.float32)
    stats["reorder_rate"] = (stats["reorders"] / stats["orders"]).astype(np.float32)

    # join() returns a new frame, so the caller's `products` is left untouched.
    return products.join(stats, on="product_id").set_index("product_id", drop=False)

In [None]:
# Rebinds `products` to the feature-augmented table indexed by product_id.
products = get_product_features(priors, products)

In [None]:
# Preview only the first rows instead of rendering the whole table.
products.head()

Фичи пользователей

In [None]:
def get_user_features(priors, orders):
    """Aggregate per-user statistics from the orders and the prior order lines.

    Args:
        priors: Order lines with ``user_id`` and ``product_id`` columns.
        orders: Order headers with ``user_id`` and ``days_since_prior_order``.

    Returns:
        pandas.DataFrame: Indexed by ``user_id`` (users present in ``priors``)
        with the columns ``total_items``, ``all_products`` (set of product
        ids), ``total_distinct_items``, ``average_days_between_orders``,
        ``nb_orders`` and ``average_basket`` (``total_items / nb_orders``).
    """
    orders_by_user = orders.groupby("user_id")
    lines_by_user = priors.groupby("user_id")

    # Order-level statistics, computed over every row of `orders`.
    order_stats = pd.DataFrame()
    order_stats["average_days_between_orders"] = (
        orders_by_user["days_since_prior_order"].mean().astype(np.float32)
    )
    order_stats["nb_orders"] = orders_by_user.size().astype(np.int16)

    # Basket statistics, computed over the prior order lines.
    users = pd.DataFrame()
    users["total_items"] = lines_by_user.size().astype(np.int16)
    users["all_products"] = lines_by_user["product_id"].apply(set)
    users["total_distinct_items"] = users["all_products"].map(len).astype(np.int16)

    users = users.join(order_stats)
    users["average_basket"] = (users["total_items"] / users["nb_orders"]).astype(
        np.float32
    )

    return users

In [None]:
# Per-user aggregates, indexed by user_id (the groupby key in get_user_features).
users = get_user_features(priors, orders)

In [None]:
# Preview only the first rows: `all_products` holds a whole set of ids per user.
users.head()

Фичи пользователь-продукт

In [None]:
def get_userXproduct_features(priors):
    """Aggregate the prior order lines per (user, product) pair.

    Side effect: a ``user_product`` key column
    (``product_id + user_id * 100000``) is added to ``priors`` in place.

    NOTE(review): if ``user_id`` is int32 (get_data loads it that way), the
    multiplication by 100000 may exceed the int32 range and wrap. The key is
    rebuilt with the same formula in get_all_features, so any widening has to
    be done in both places together - TODO confirm.

    Args:
        priors: Order lines with ``product_id``, ``user_id``, ``order_number``,
            ``order_id`` and ``add_to_cart_order`` columns.

    Returns:
        pandas.DataFrame: Indexed by the ``user_product`` key with the columns
        ``nb_orders`` (number of lines for the pair), ``last_order_id`` (order
        id of the line with the greatest ``(order_number, order_id)``) and
        ``sum_pos_in_cart`` (sum of ``add_to_cart_order``).
    """
    priors["user_product"] = priors.product_id + priors.user_id * 100000

    # key -> (line count, latest (order_number, order_id), summed cart position)
    stats_by_pair = {}
    for line in priors.itertuples():
        pair_key = line.user_product
        order_stamp = (line.order_number, line.order_id)
        known = stats_by_pair.get(pair_key)
        if known is None:
            stats_by_pair[pair_key] = (1, order_stamp, line.add_to_cart_order)
            continue
        count, latest_stamp, position_sum = known
        stats_by_pair[pair_key] = (
            count + 1,
            max(latest_stamp, order_stamp),
            position_sum + line.add_to_cart_order,
        )

    userXproduct = pd.DataFrame.from_dict(stats_by_pair, orient="index")
    del stats_by_pair

    userXproduct.columns = ["nb_orders", "last_order_id", "sum_pos_in_cart"]
    userXproduct["nb_orders"] = userXproduct["nb_orders"].astype(np.int16)
    # Keep only the order id of the stored (order_number, order_id) pair.
    userXproduct["last_order_id"] = (
        userXproduct["last_order_id"].map(lambda stamp: stamp[1]).astype(np.int32)
    )
    userXproduct["sum_pos_in_cart"] = userXproduct["sum_pos_in_cart"].astype(np.int16)
    print("user X product f", len(userXproduct))

    return userXproduct

In [None]:
# Iterates over every row of `priors` in Python and also adds a `user_product`
# column to `priors` as a side effect.
userXproduct = get_userXproduct_features(priors)

In [None]:
# Preview only the first rows instead of rendering the whole table.
userXproduct.head()

Формирование датасета

In [1]:
def get_all_features(
    selected_orders, orders, userXproduct, users, products, train, labels_given=False
):
    """Build the model feature table for a set of orders.

    For every row of ``selected_orders`` one candidate row is produced per
    product in ``users.all_products`` of that order's user. User, order,
    product and user-product features are then attached to each candidate.

    The lookup tables are used through ``Series.map``, so they are expected to
    be indexed as follows:
        * ``orders`` by order id (extend_priors sets this index in place),
        * ``users`` by user id,
        * ``products`` by product id,
        * ``userXproduct`` by ``user_id * 100000 + product_id``.

    Args:
        selected_orders: Orders to build candidates for; ``order_id`` and
            ``user_id`` are read from each row.
        orders: Order lookup table (see above).
        userXproduct: User-product aggregates (see above).
        users: Per-user aggregates including ``all_products``.
        products: Per-product aggregates.
        train: Only its index is used, and only when ``labels_given`` is True:
            a candidate is labelled 1 if ``(order_id, product)`` is in
            ``train.index``.
        labels_given: Whether to compute labels.

    Returns:
        tuple: ``(features, labels)`` where ``features`` contains only the
        columns listed in ``f_to_use`` (no ``order_id`` / ``product_id``) and
        ``labels`` is an int8 array that is empty when ``labels_given`` is
        False.
    """
    order_list = []
    product_list = []
    labels = []
    i = 0

    for row in selected_orders.itertuples():
        i += 1
        if i % 10000 == 0:
            print("order row", i)
        order_id = row.order_id
        user_id = row.user_id
        user_products = users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)

        # Target: 1 if the (order_id, product) pair is present in train.index,
        # 0 otherwise. The same `user_products` object is iterated as above, so
        # the labels line up with product_list.

        if labels_given:
            labels += [(order_id, product) in train.index for product in user_products]

    # One row per (order, product the order's user has in users.all_products)

    df = pd.DataFrame(
        {"order_id": order_list, "product_id": product_list}, dtype=np.int32
    )
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list

    print("Добавление фичей пользователя (из users)")

    # Relies on `orders` being indexed by order id.
    df["user_id"] = df.order_id.map(orders.user_id)

    df["user_total_orders"] = df.user_id.map(users.nb_orders)
    df["user_total_items"] = df.user_id.map(users.total_items)
    df["total_distinct_items"] = df.user_id.map(users.total_distinct_items)
    df["user_average_days_between_orders"] = df.user_id.map(
        users.average_days_between_orders
    )
    df["user_average_basket"] = df.user_id.map(users.average_basket)

    print("Добавления фичей заказа (из orders)")

    # df['dow'] = df.order_id.map(orders.order_dow)
    df["order_hour_of_day"] = df.order_id.map(orders.order_hour_of_day)
    df["days_since_prior_order"] = df.order_id.map(orders.days_since_prior_order)
    df["days_since_ratio"] = (
        df.days_since_prior_order / df.user_average_days_between_orders
    )

    print("Добавление фичей продукта (из products)")

    df["aisle_id"] = df.product_id.map(products.aisle_id)
    df["department_id"] = df.product_id.map(products.department_id)
    df["product_orders"] = df.product_id.map(products.orders).astype(np.int32)
    df["product_reorders"] = df.product_id.map(products.reorders)
    df["product_reorder_rate"] = df.product_id.map(products.reorder_rate)

    print("Добавление фичей пользователь-продукт (из userXproduct)")

    # Same key formula as in get_userXproduct_features; the two must stay in sync.
    # NOTE(review): with int32 inputs the product user_id * 100000 may wrap - TODO confirm.
    df["z"] = df.user_id * 100000 + df.product_id
    df.drop(["user_id"], axis=1, inplace=True)

    df["UP_orders"] = df.z.map(userXproduct.nb_orders)
    df["UP_orders_ratio"] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df["UP_last_order_id"] = df.z.map(userXproduct.last_order_id)
    df["UP_average_pos_in_cart"] = (
        df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders
    ).astype(np.float32)
    # NOTE(review): identical expression to UP_orders_ratio above, so the two
    # features carry the same values.
    df["UP_reorder_rate"] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df["UP_orders_since_last"] = df.user_total_orders - df.UP_last_order_id.map(
        orders.order_number
    )
    # Circular distance between the two hours of day (at most 12).
    df["UP_delta_hour_vs_last"] = (
        abs(df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day))
        .map(lambda x: min(x, 24 - x))
        .astype(np.int8)
    )

    # Drop the helper columns that are definitely not needed

    df.drop(["UP_last_order_id"], axis=1, inplace=True)
    df.drop(["z"], axis=1, inplace=True)

    # Selected features

    f_to_use = [
        "user_total_orders",
        "user_total_items",
        "total_distinct_items",
        "user_average_days_between_orders",
        "user_average_basket",
        "order_hour_of_day",
        "days_since_prior_order",
        "days_since_ratio",
        "aisle_id",
        "department_id",
        "product_orders",
        "product_reorders",
        "product_reorder_rate",
        "UP_orders",
        "UP_orders_ratio",
        "UP_average_pos_in_cart",
        "UP_reorder_rate",
        "UP_orders_since_last",
        "UP_delta_hour_vs_last",
    ]

    return (df[f_to_use], labels)

In [None]:
# Order headers of the two evaluation splits.
test_orders = orders.loc[orders["eval_set"] == "test"]
train_orders = orders.loc[orders["eval_set"] == "train"]

# Index order_products__train by (order_id, product_id), keeping both columns,
# so that pair membership can be tested against train.index.
train.set_index(keys=["order_id", "product_id"], drop=False, inplace=True)

In [None]:
# Feature matrix and 0/1 labels for the candidates of the "train" orders.
df_train, labels = get_all_features(
    train_orders, orders, userXproduct, users, products, train, labels_given=True
)

# aisle_id and department_id are declared as categorical features for LightGBM.
train_dataset = lgb.Dataset(
    df_train, label=labels, categorical_feature=["aisle_id", "department_id"]
)

# Drop the notebook-level reference to the feature frame.
del df_train

Обучение модели

In [None]:
def train_model(train_dataset, params=None, rounds=10):
    """Train a LightGBM model and plot its feature importances.

    Args:
        train_dataset: Training data passed to ``lgb.train``.
        params: LightGBM parameter dict. ``None`` (the default) means an empty
            dict; a fresh dict is created on every call so that nothing is
            shared between calls.
        rounds: Number of boosting rounds.

    Returns:
        The model returned by ``lgb.train``.
    """
    if params is None:
        params = {}

    model = lgb.train(params, train_dataset, rounds)
    lgb.plot_importance(model, figsize=(6, 8))

    return model

In [None]:
# LightGBM parameters for a binary classifier evaluated with log loss.
params = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": {"binary_logloss"},
    "num_leaves": 96,
    "max_depth": 10,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.95,
    "bagging_freq": 5,
}
# Number of boosting rounds.
rounds = 10

model = train_model(train_dataset, params, rounds)
# NOTE(review): presumably the ./models directory has to exist already - TODO confirm.
model.save_model("./models/baseline")

Тестирование

In [None]:
def make_submission(
    df_test, preds, threshold=0.22, test_order_ids=None, path="sub.csv"
):
    """Write the submission file: one row per order with its predicted products.

    A product is listed for an order when its prediction is strictly greater
    than ``threshold``. Orders from ``test_order_ids`` without any selected
    product get the literal string ``"None"``.

    Args:
        df_test: Frame with ``order_id`` and ``product_id`` columns, one row
            per candidate. It is not modified.
        preds: Predictions aligned row by row with ``df_test``.
        threshold: Decision threshold. The default 0.22 is a guess and should
            be tuned with cross-validation on a subset of the train data.
        test_order_ids: Iterable of all order ids that must appear in the
            output. Defaults to the notebook-level ``test_orders.order_id``.
        path: Destination CSV file.

    Returns:
        pandas.DataFrame: The written table with the columns ``order_id`` and
        ``products`` (space-separated product ids).
    """
    if test_order_ids is None:
        # Backward-compatible fallback to the notebook global.
        test_order_ids = test_orders.order_id

    # order_id -> product ids above the threshold, in row order.
    selected = {}
    for order_id, product_id, pred in zip(
        df_test["order_id"], df_test["product_id"], preds
    ):
        if pred > threshold:
            selected.setdefault(order_id, []).append(str(product_id))

    products_by_order = {
        order_id: " ".join(product_ids) for order_id, product_ids in selected.items()
    }

    # Orders without any selected product still need a row.
    for order_id in test_order_ids:
        products_by_order.setdefault(order_id, "None")

    sub = pd.DataFrame(
        list(products_by_order.items()), columns=["order_id", "products"]
    )
    sub.to_csv(path, index=False)

    return sub

In [None]:
# get_all_features needs every lookup table, not only the selected orders.
df_test, _ = get_all_features(
    test_orders, orders, userXproduct, users, products, train
)
preds = model.predict(df_test)  # predicted "probabilities"

# df_test only holds the model features, so the (order_id, product_id) pairs
# required by make_submission are rebuilt here. The same orders and the same
# per-user product sets are iterated in the same order as in get_all_features,
# so row i of test_pairs corresponds to preds[i].
test_pairs = pd.DataFrame(
    [
        (row.order_id, product_id)
        for row in test_orders.itertuples()
        for product_id in users.all_products[row.user_id]
    ],
    columns=["order_id", "product_id"],
)
assert len(test_pairs) == len(preds), "candidate pairs and predictions are misaligned"

make_submission(test_pairs, preds)

Инференс

In [None]:
class RecommendationModel:
    """Candidate generation and ranking skeleton for per-user recommendations.

    Candidates for a user are all products from the user's ``prior`` orders
    plus the ``top_k_products`` products with the highest mean ``reordered``.
    Ranking currently uses random scores as a placeholder for a trained model.

    Args:
        orders_df: Order headers; ``order_id``, ``user_id`` and ``eval_set``
            are read.
        order_products_df: Order lines; ``order_id``, ``product_id`` and
            ``reordered`` are read.
        top_k_products: Number of globally most-reordered products added to
            every user's candidates.
        recommendations_n: Maximum number of products returned by ``predict``.
    """

    def __init__(
        self,
        orders_df: pd.DataFrame,
        order_products_df: pd.DataFrame,
        top_k_products: int = 100,
        recommendations_n: int = 10,
    ):
        self.orders = orders_df
        self.order_products = order_products_df
        self.top_k_products = top_k_products
        self.recommendations_n = recommendations_n

        # Mean `reordered` per product, as a two-column frame.
        self.products_agg = (
            self.order_products.groupby("product_id")["reordered"]
            .mean()
            .rename("product_reorder_rate")
            .reset_index()
        )

        # product_id -> reorder rate lookup, built once instead of on every
        # _generate_features call.
        self._reorder_rate_by_product = self.products_agg.set_index("product_id")[
            "product_reorder_rate"
        ]

        self.top_reordered_products = (
            self.products_agg.sort_values(by="product_reorder_rate", ascending=False)
            .head(top_k_products)["product_id"]
            .tolist()
        )

    def _get_user_candidates(self, user_id):
        """Return the candidate product ids for a user.

        The result lists the products of ALL prior orders of the user first
        (in order of first appearance), followed by the globally top reordered
        products, without duplicates. Users without prior orders get a copy of
        the top reordered products.
        """
        is_user_prior = (self.orders["user_id"] == user_id) & (
            self.orders["eval_set"] == "prior"
        )
        prior_order_ids = self.orders.loc[is_user_prior, "order_id"]

        if prior_order_ids.empty:
            # Return a copy so callers cannot mutate the model's own list.
            return list(self.top_reordered_products)

        user_products = self.order_products.loc[
            self.order_products["order_id"].isin(prior_order_ids), "product_id"
        ].tolist()

        # dict.fromkeys de-duplicates while keeping a deterministic order
        # (a plain set would not guarantee one).
        return list(dict.fromkeys(user_products + self.top_reordered_products))

    def predict(self, user_id):
        """Return up to ``recommendations_n`` recommended product ids.

        Scores are random placeholders (unseeded), so the result can differ
        between calls; replace them with ``self.model.predict(features)`` once
        a trained model is available.
        """
        candidates = self._get_user_candidates(user_id)
        features = self._generate_features(user_id, candidates)

        # Placeholder for the trained model's predictions.
        predictions = np.random.rand(len(features))

        # Best score first. Slicing after the reversal keeps n == 0 meaning
        # "no recommendations" (`[-0:]` would select every candidate).
        limit = max(int(self.recommendations_n), 0)
        top_indices = predictions.argsort()[::-1][:limit]
        return [candidates[i] for i in top_indices]

    def _generate_features(self, user_id, candidates):
        """Build the feature frame for the candidates (example features only).

        Returns a frame with the columns ``product_id`` and
        ``product_reorder_rate``; ``user_id`` is not used yet.
        """
        features = pd.DataFrame({"product_id": candidates})
        features["product_reorder_rate"] = features["product_id"].map(
            self._reorder_rate_by_product
        )
        # Add more features here if needed.
        return features

In [None]:
# Uses orders_df / order_products_df loaded in the v1 section and rebinds
# `rec_model`, which previously held a ReorderRateRecommendationModel.
rec_model = RecommendationModel(orders_df, order_products_df)