In [3]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from tqdm import tqdm

### v1

In [None]:
class ReorderRateClassificationModel:
    def __init__(
        self,
        user_id=int,
        orders_df=pd.DataFrame,
        order_products_df=pd.DataFrame,
        top_reodered_items_n: int = 10,
    ):
        self.top_reordered_items_n = top_reodered_items_n

        self._get_orders(orders_df)
        self._get_order_products(order_products_df)

        self.user_id = user_id
        self._get_user_features(self.user_id)

        self._get_products_features()

    def _get_orders(self, orders_df: pd.DataFrame):
        self.orders = orders_df  # меняется на запрос

    def _get_order_products(self, order_products_df: pd.DataFrame):
        self.orders_products = order_products_df  # меняется на запрос

    def _get_last_order_products(self, user_id: int) -> pd.Series:
        prior_orders = self.orders[self.orders["eval_set"] == "prior"]
        prior_user_orders = prior_orders[self.orders["user_id"] == user_id]

        if prior_user_orders.empty:
            return pd.Series([], dtype=int)

        last_prior_user_order_id = prior_user_orders.loc[
            prior_user_orders["order_number"].idxmax(), "order_id"
        ]
        last_order_products = self.orders_products[
            self.orders_products["order_id"] == last_prior_user_order_id
        ]["product_id"]

        return last_order_products

    def _get_top_reordered_items(self) -> pd.Series:
        products_agg = (
            self.orders_products.groupby("product_id")
            .agg(
                {
                    "reordered": "mean",
                }
            )
            .rename(
                columns={
                    "reordered": "product_reorder_rate",
                }
            )
        )
        products_agg.reset_index(inplace=True)

        top_reordered_items = products_agg.sort_values(
            by="product_reorder_rate", ascending=False
        )[: self.top_reordered_items_n]

        return top_reordered_items["product_id"]

    def _get_products_features(self):
        self.df_products_features = pd.DataFrame()

        top_reordered_items = self._get_top_reordered_items().values
        self.df_products_features["top_reordered"] = top_reordered_items

    def _get_user_features(self, user_id: int):
        self.df_user_features = pd.DataFrame()

        last_order_products = self._get_last_order_products(user_id).values
        self.df_user_features["last_user_order"] = last_order_products

    def predict(self, product_id: int, user_id: int = None):
        # Меняется на
        #     features = self.get_features(user_id. product_id)
        #     return lgb_model.predict(features)

        if user_id is not None:
            self.user_id = user_id
            self._get_user_features(self.user_id)

        if product_id in self.df_user_features["last_user_order"].values:
            return True

        if product_id in self.df_products_features["top_reordered"].values:
            return True

        return False

In [13]:
data_path = "../data/"

orders_df = pd.read_csv(data_path + "orders.csv")
order_products_df = pd.read_csv(data_path + "order_products__prior.csv")

In [6]:
clf_model = ReorderRateClassificationModel(
    user_id=1, orders_df=orders_df, order_products_df=order_products_df
)

  prior_user_orders = prior_orders[self.orders["user_id"] == user_id]


In [14]:
clf_model.predict(6433)

True

In [15]:
class ReorderRateRecommendationModel:
    def __init__(
        self,
        user_id=int,
        orders_df=pd.DataFrame,
        order_products_df=pd.DataFrame,
        top_reodered_items_n: int = 10,
    ):
        self.user_id = user_id

        self.clf_model = ReorderRateClassificationModel(
            user_id, orders_df, order_products_df, top_reodered_items_n
        )

    def predict(self):
        recommendations_list = []
        products_id = self.clf_model.orders["order_id"].values
        for product_id in tqdm(products_id):
            pred = self.clf_model.predict(product_id)
            if pred:
                recommendations_list.append(product_id)

        return np.random.choice(
            recommendations_list, size=clf_model.top_reordered_items_n, replace=False
        )

In [16]:
rec_model = ReorderRateRecommendationModel(
    user_id=1, orders_df=orders_df, order_products_df=order_products_df
)

  prior_user_orders = prior_orders[self.orders["user_id"] == user_id]


In [17]:
rec_model.predict()


100%|██████████| 3421083/3421083 [00:29<00:00, 115893.57it/s]


array([35604, 35951,  5868,   196, 25133, 13032, 39992, 14609, 39657,
       38928])

### v2

In [46]:
data_path = "../data/"

orders_df = pd.read_csv(data_path + "orders.csv")
order_products_prior_df = pd.read_csv(data_path + "order_products__prior.csv")

In [48]:
print("add order info to priors")
orders_df.set_index("order_id", inplace=True, drop=False)
order_products_prior_df = order_products_prior_df.join(orders_df, on="order_id", rsuffix="_")
order_products_prior_df.drop("order_id_", inplace=True, axis=1)

add order info to priors


In [49]:
order_products_prior_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0
...,...,...,...,...,...,...,...,...,...,...
32434484,3421083,39678,6,1,25247,prior,24,2,6,21.0
32434485,3421083,11352,7,0,25247,prior,24,2,6,21.0
32434486,3421083,4600,8,0,25247,prior,24,2,6,21.0
32434487,3421083,24852,9,1,25247,prior,24,2,6,21.0


In [None]:
users_priod_products_df = pd.DataFrame()
users_priod_products_df["all_products"] = order_products_prior_df.groupby("user_id")["product_id"].apply(set)

In [56]:
users_priod_products_df.head()

Unnamed: 0_level_0,all_products
user_id,Unnamed: 1_level_1
1,"{17122, 196, 26405, 46149, 14084, 13032, 26088..."
2,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1..."
3,"{17668, 44683, 48523, 21903, 14992, 21137, 324..."
4,"{21573, 42329, 17769, 35469, 37646, 1200, 1905..."
5,"{11777, 40706, 28289, 48775, 20754, 6808, 1398..."


In [57]:
train_orders = orders_df[orders_df["eval_set"] == "train"]

In [94]:
def get_all_pairs(selected_orders, is_train: bool = False):

    order_list = []
    product_list = []
    labels = []

    for row in selected_orders.itertuples():
        order_id = row.order_id
        user_id = row.user_id
        user_products = users_priod_products_df.loc[user_id, "all_products"]
        product_list += user_products
        order_list += [order_id] * len(user_products)

        if is_train:
            labels += [order_products_prior_df.loc[order_id, product]["reordered"] for product in user_products]

    pairs = pd.DataFrame({
        "order_id": order_list,
        "product_id": product_list
    }, dtype=np.int32)

    pairs["user_id"] = pairs.order_id.map(selected_orders.user_id)
    pairs.drop(columns=["order_id"], inplace=True)

    return pairs, labels

In [96]:
order_products_prior_df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0


In [99]:
order_products_prior_df.loc[(order_products_prior_df["order_id"] == 2) & (order_products_prior_df["product_id"] == 33120)]["reordered"]

0    1
Name: reordered, dtype: int64

In [95]:
pairs = get_all_pairs(train_orders, True)

KeyError: 17122

In [89]:
pairs

Unnamed: 0,product_id,user_id
0,17122,1
1,196,1
2,26405,1
3,46149,1
4,14084,1
...,...,...
8474656,31477,206209
8474657,28156,206209
8474658,41213,206209
8474659,890,206209


In [90]:
def get_features(pairs):

    products_agg = (
        order_products_prior_df.groupby("product_id")
        .agg(
            {
                "reordered": "mean",
            }
        )
        .rename(
            columns={
                "reordered": "product_reorder_rate",
            }
        )
    )
    products_agg.reset_index(inplace=True)

    pairs["product_reorder_rate"] = pairs["product_id"].map(
        products_agg.set_index("product_id")["product_reorder_rate"]
    )

    return pairs


In [91]:
features = get_features(pairs)

In [93]:
features[features["product_id"] == 196]

Unnamed: 0,product_id,user_id,product_reorder_rate
1,196,1,0.77648
719,196,21,0.77648
1496,196,43,0.77648
1867,196,52,0.77648
2505,196,67,0.77648
...,...,...,...
8465947,196,205993,0.77648
8469570,196,206087,0.77648
8469674,196,206093,0.77648
8470964,196,206119,0.77648


In [None]:
print("computing user f")
usr = pd.DataFrame()
usr["average_days_between_orders"] = (
    orders.groupby("user_id")["days_since_prior_order"].mean().astype(np.float32)
)
usr["nb_orders"] = orders.groupby("user_id").size().astype(np.int16)

users = pd.DataFrame()
users["total_items"] = priors.groupby("user_id").size().astype(np.int16)
users["all_products"] = priors.groupby("user_id")["product_id"].apply(set)
users["total_distinct_items"] = (users.all_products.map(len)).astype(np.int16)

users = users.join(usr)
del usr
users["average_basket"] = (users.total_items / users.nb_orders).astype(np.float32)
print("user f", users.shape)

In [None]:
def _get_last_order_products(self, user_id: int) -> pd.Series:
    prior_orders = self.orders[self.orders["eval_set"] == "prior"]
    prior_user_orders = prior_orders[self.orders["user_id"] == user_id]

    if prior_user_orders.empty:
        return pd.Series([], dtype=int)

    last_prior_user_order_id = prior_user_orders.loc[
        prior_user_orders["order_number"].idxmax(), "order_id"
    ]
    last_order_products = self.orders_products[
        self.orders_products["order_id"] == last_prior_user_order_id
    ]["product_id"]

    return last_order_products

In [None]:
def get_features(
    orders_df: pd.DataFrame,
    order_products_df: pd.DataFrame,
    selected_orders: pd.DataFrame,
    for_training: bool = False,
    train_indices: set = None
) -> tuple[pd.DataFrame, pd.Series | None]:
    """
    Генерация фичей для пар user_id и product_id.
    """

    order_list = []
    product_list = []
    labels = [] if for_training else None

    for row in selected_orders.itertuples():
        order_id = row.order_id
        user_id = row.user_id
        user_products = users_df.loc[user_id, "all_products"]
        product_list += user_products
        order_list += [order_id] * len(user_products)

        if for_training:
            labels += [(order_id, product) in train_indices for product in user_products]

    features = pd.DataFrame({
        "order_id": order_list,
        "product_id": product_list
    }, dtype=np.int32)

    # Генерация признаков
    features["user_id"] = features["order_id"].map(orders_df["user_id"])
    features["user_total_orders"] = features["user_id"].map(users_df["nb_orders"])
    features["product_reorder_rate"] = features["product_id"].map(
        products_df.set_index("product_id")["product_reorder_rate"]
    )

    if for_training:
        return features, pd.Series(labels, dtype=np.int8)
    return features, None

In [None]:
def get_features(
    order_df,
    order_products_df,
    is_train,
):
    
    pass


In [None]:
def get_features(
    user_ids: list[int],
    product_ids: list[int],
    order_products: pd.DataFrame,
):

    products_agg = (
        order_products.groupby("product_id")
        .agg(
            {
                "reordered": "mean",
            }
        )
        .rename(
            columns={
                "reordered": "product_reorder_rate",
            }
        )
    )
    products_agg.reset_index(inplace=True)

    features = pd.DataFrame({"product_id": candidates})
    features["product_reorder_rate"] = features["product_id"].map(
        products_agg.set_index("product_id")["product_reorder_rate"]
    )
    # Добавьте больше фичей, если нужно
    return features

In [None]:
get_features()

In [None]:
class RecommendationModel:
    def __init__(
        self,
        orders_df: pd.DataFrame,
        order_products_df: pd.DataFrame,
        top_k_products: int = 100,
        recommendations_n: int = 10,
    ):
        self.orders = orders_df
        self.order_products = order_products_df
        self.top_k_products = top_k_products
        self.recommendations_n = recommendations_n

        self.products_agg = (
            self.order_products.groupby("product_id")
            .agg(
                {
                    "reordered": "mean",
                }
            )
            .rename(
                columns={
                    "reordered": "product_reorder_rate",
                }
            )
        )
        self.products_agg.reset_index(inplace=True)

        self.top_reordered_products = (
            self.products_agg.sort_values(by="product_reorder_rate", ascending=False)
            .head(top_k_products)["product_id"]
            .tolist()
        )

    def _get_user_candidates(self, user_id):
        # Получаем последние заказы пользователя
        user_orders = self.orders[self.orders["user_id"] == user_id]
        prior_orders = user_orders[user_orders["eval_set"] == "prior"]

        if prior_orders.empty:
            return self.top_reordered_products

        # Товары из последних заказов пользователя
        last_order_products = self.order_products[
            self.order_products["order_id"].isin(prior_orders["order_id"])
        ]["product_id"].tolist()

        # Комбинируем с топ-товарами
        candidates = list(set(last_order_products + self.top_reordered_products))
        return candidates

    def predict(self, user_id):
        # Получаем список кандидатов для пользователя
        candidates = self._get_user_candidates(user_id)

        # Составляем DataFrame с фичами для кандидатов
        features = self._generate_features(user_id, candidates)

        # Замените self.model.predict(features) на вашу обученную ML-модель
        predictions = np.random.rand(len(features))  # пример случайного предсказания

        # Ранжируем по предсказанным вероятностям и возвращаем топ-N
        top_indices = predictions.argsort()[-self.recommendations_n :][::-1]
        recommended_products = [candidates[i] for i in top_indices]
        return recommended_products

    def _generate_features(self, user_id, candidates):
        # Пример генерации фичей (замените на свои):
        features = pd.DataFrame({"product_id": candidates})
        features["product_reorder_rate"] = features["product_id"].map(
            self.products_agg.set_index("product_id")["product_reorder_rate"]
        )
        # Добавьте больше фичей, если нужно
        return features

In [None]:
rec_model = RecommendationModel(orders_df, order_products_df)

In [43]:
rec_model.predict(1)

[2142, 14609, 17762, 20598, 26093, 38928, 3497, 17996, 10236, 35604]