In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from tqdm import tqdm

### v1

In [None]:
class ReorderRateClassificationModel:
    def __init__(
        self,
        user_id=int,
        orders_df=pd.DataFrame,
        order_products_df=pd.DataFrame,
        top_reodered_items_n: int = 10,
    ):
        self.top_reordered_items_n = top_reodered_items_n

        self._get_orders(orders_df)
        self._get_order_products(order_products_df)

        self.user_id = user_id
        self._get_user_features(self.user_id)

        self._get_products_features()

    def _get_orders(self, orders_df: pd.DataFrame):
        self.orders = orders_df  # меняется на запрос

    def _get_order_products(self, order_products_df: pd.DataFrame):
        self.orders_products = order_products_df  # меняется на запрос

    def _get_last_order_products(self, user_id: int) -> pd.Series:
        prior_orders = self.orders[self.orders["eval_set"] == "prior"]
        prior_user_orders = prior_orders[self.orders["user_id"] == user_id]

        if prior_user_orders.empty:
            return pd.Series([], dtype=int)

        last_prior_user_order_id = prior_user_orders.loc[
            prior_user_orders["order_number"].idxmax(), "order_id"
        ]
        last_order_products = self.orders_products[
            self.orders_products["order_id"] == last_prior_user_order_id
        ]["product_id"]

        return last_order_products

    def _get_top_reordered_items(self) -> pd.Series:
        products_agg = (
            self.orders_products.groupby("product_id")
            .agg(
                {
                    "reordered": "mean",
                }
            )
            .rename(
                columns={
                    "reordered": "product_reorder_rate",
                }
            )
        )
        products_agg.reset_index(inplace=True)

        top_reordered_items = products_agg.sort_values(
            by="product_reorder_rate", ascending=False
        )[: self.top_reordered_items_n]

        return top_reordered_items["product_id"]

    def _get_products_features(self):
        self.df_products_features = pd.DataFrame()

        top_reordered_items = self._get_top_reordered_items().values
        self.df_products_features["top_reordered"] = top_reordered_items

    def _get_user_features(self, user_id: int):
        self.df_user_features = pd.DataFrame()

        last_order_products = self._get_last_order_products(user_id).values
        self.df_user_features["last_user_order"] = last_order_products

    def predict(self, product_id: int, user_id: int = None):
        # Меняется на
        #     features = self.get_features(user_id. product_id)
        #     return lgb_model.predict(features)

        if user_id is not None:
            self.user_id = user_id
            self._get_user_features(self.user_id)

        if product_id in self.df_user_features["last_user_order"].values:
            return True

        if product_id in self.df_products_features["top_reordered"].values:
            return True

        return False

In [None]:
data_path = "../data/"

orders_df = pd.read_csv(data_path + "orders.csv")
order_products_df = pd.read_csv(data_path + "order_products__prior.csv")

In [None]:
clf_model = ReorderRateClassificationModel(
    user_id=1, orders_df=orders_df, order_products_df=order_products_df
)

In [None]:
clf_model.predict(6433)

In [None]:
class ReorderRateRecommendationModel:
    def __init__(
        self,
        user_id=int,
        orders_df=pd.DataFrame,
        order_products_df=pd.DataFrame,
        top_reodered_items_n: int = 10,
    ):
        self.user_id = user_id

        self.clf_model = ReorderRateClassificationModel(
            user_id, orders_df, order_products_df, top_reodered_items_n
        )

    def predict(self):
        recommendations_list = []
        products_id = self.clf_model.orders["order_id"].values
        for product_id in tqdm(products_id):
            pred = self.clf_model.predict(product_id)
            if pred:
                recommendations_list.append(product_id)

        return np.random.choice(
            recommendations_list, size=clf_model.top_reordered_items_n, replace=False
        )

In [None]:
rec_model = ReorderRateRecommendationModel(
    user_id=1, orders_df=orders_df, order_products_df=order_products_df
)

In [None]:
rec_model.predict()


### v2

In [None]:
data_path = "../data/"

orders_df = pd.read_csv(data_path + "orders.csv")
order_products_prior_df = pd.read_csv(data_path + "order_products__prior.csv")
order_products_train_df = pd.read_csv(data_path + "order_products__train.csv")

In [None]:
print("add order info to priors")
orders_df.set_index("order_id", inplace=True, drop=False)
order_products_prior_df["user_id"] = order_products_prior_df.join(
    orders_df, on="order_id", rsuffix="_"
)["user_id"]

add order info to priors


In [4]:
order_products_prior_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id
0,2,33120,1,1,202279
1,2,28985,2,1,202279
2,2,9327,3,0,202279
3,2,45918,4,1,202279
4,2,30035,5,0,202279
...,...,...,...,...,...
32434484,3421083,39678,6,1,25247
32434485,3421083,11352,7,0,25247
32434486,3421083,4600,8,0,25247
32434487,3421083,24852,9,1,25247


In [None]:
users_prior_products_df = pd.DataFrame()
users_prior_products_df["all_products"] = order_products_prior_df.groupby("user_id")[
    "product_id"
].apply(set)

In [6]:
users_prior_products_df.head()

Unnamed: 0_level_0,all_products
user_id,Unnamed: 1_level_1
1,"{17122, 196, 26405, 46149, 14084, 13032, 26088..."
2,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1..."
3,"{17668, 44683, 48523, 21903, 14992, 21137, 324..."
4,"{21573, 42329, 17769, 35469, 37646, 1200, 1905..."
5,"{11777, 40706, 28289, 48775, 20754, 6808, 1398..."


In [9]:
train_orders_df = orders_df[orders_df["eval_set"] == "train"]
train_orders_df.head()

Unnamed: 0_level_0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1187899,1187899,1,train,11,4,8,14.0
1492625,1492625,2,train,15,1,11,30.0
2196797,2196797,5,train,5,0,11,6.0
525192,525192,7,train,21,2,11,6.0
880375,880375,8,train,4,1,14,10.0


In [None]:
train_orders_df.set_index(["order_id", "product_id"], inplace=True, drop=False)

In [None]:
train_orders_df.head()

Unnamed: 0_level_0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1187899,1187899,1,train,11,4,8,14.0
1492625,1492625,2,train,15,1,11,30.0
2196797,2196797,5,train,5,0,11,6.0
525192,525192,7,train,21,2,11,6.0
880375,880375,8,train,4,1,14,10.0


In [None]:
def get_all_pairs(selected_orders, is_train: bool = False):
    order_list = []
    product_list = []
    labels = []

    for row in tqdm(selected_orders.itertuples()):
        order_id = row.order_id
        user_id = row.user_id
        user_products = users_prior_products_df.loc[user_id, "all_products"]
        product_list += user_products
        order_list += [order_id] * len(user_products)

        if is_train:
            labels += [
                (order_id, product) in train_orders_df.index
                for product in user_products
            ]

    pairs = pd.DataFrame(
        {"order_id": order_list, "product_id": product_list}, dtype=np.int32
    )

    pairs["user_id"] = pairs.order_id.map(selected_orders.user_id)
    pairs.drop(columns=["order_id"], inplace=True)

    return pairs, labels

In [19]:
pairs = get_all_pairs(train_orders, True)

0it [00:00, ?it/s]

4it [00:45, 11.47s/it]


KeyboardInterrupt: 

In [None]:
pairs

In [None]:
def get_features(pairs):
    products_agg = (
        order_products_prior_df.groupby("product_id")
        .agg(
            {
                "reordered": "mean",
            }
        )
        .rename(
            columns={
                "reordered": "product_reorder_rate",
            }
        )
    )
    products_agg.reset_index(inplace=True)

    pairs["product_reorder_rate"] = pairs["product_id"].map(
        products_agg.set_index("product_id")["product_reorder_rate"]
    )

    return pairs


In [None]:
features = get_features(pairs)

In [None]:
features[features["product_id"] == 196]

In [None]:
class RecommendationModel:
    def __init__(
        self,
        orders_df: pd.DataFrame,
        order_products_df: pd.DataFrame,
        top_k_products: int = 100,
        recommendations_n: int = 10,
    ):
        self.orders = orders_df
        self.order_products = order_products_df
        self.top_k_products = top_k_products
        self.recommendations_n = recommendations_n

        self.products_agg = (
            self.order_products.groupby("product_id")
            .agg(
                {
                    "reordered": "mean",
                }
            )
            .rename(
                columns={
                    "reordered": "product_reorder_rate",
                }
            )
        )
        self.products_agg.reset_index(inplace=True)

        self.top_reordered_products = (
            self.products_agg.sort_values(by="product_reorder_rate", ascending=False)
            .head(top_k_products)["product_id"]
            .tolist()
        )

    def _get_user_candidates(self, user_id):
        # Получаем последние заказы пользователя
        user_orders = self.orders[self.orders["user_id"] == user_id]
        prior_orders = user_orders[user_orders["eval_set"] == "prior"]

        if prior_orders.empty:
            return self.top_reordered_products

        # Товары из последних заказов пользователя
        last_order_products = self.order_products[
            self.order_products["order_id"].isin(prior_orders["order_id"])
        ]["product_id"].tolist()

        # Комбинируем с топ-товарами
        candidates = list(set(last_order_products + self.top_reordered_products))
        return candidates

    def predict(self, user_id):
        # Получаем список кандидатов для пользователя
        candidates = self._get_user_candidates(user_id)

        # Составляем DataFrame с фичами для кандидатов
        features = self._generate_features(user_id, candidates)

        # Замените self.model.predict(features) на вашу обученную ML-модель
        predictions = np.random.rand(len(features))  # пример случайного предсказания

        # Ранжируем по предсказанным вероятностям и возвращаем топ-N
        top_indices = predictions.argsort()[-self.recommendations_n :][::-1]
        recommended_products = [candidates[i] for i in top_indices]
        return recommended_products

    def _generate_features(self, user_id, candidates):
        # Пример генерации фичей (замените на свои):
        features = pd.DataFrame({"product_id": candidates})
        features["product_reorder_rate"] = features["product_id"].map(
            self.products_agg.set_index("product_id")["product_reorder_rate"]
        )
        # Добавьте больше фичей, если нужно
        return features

In [None]:
rec_model = RecommendationModel(orders_df, order_products_df)

In [None]:
rec_model.predict(1)

### v3

In [2]:
data_path = "../data/"

orders_df = pd.read_csv(data_path + "orders.csv")
order_products_prior_df = pd.read_csv(data_path + "order_products__prior.csv")
order_products_train_df = pd.read_csv(data_path + "order_products__train.csv")

In [91]:
orders_df[orders_df["user_id"] == 1]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


In [96]:
order_products_train_df[order_products_train_df["order_id"] == 1187899]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
484420,1187899,196,1,1
484421,1187899,25133,2,1
484422,1187899,38928,3,1
484423,1187899,26405,4,1
484424,1187899,39657,5,1
484425,1187899,10258,6,1
484426,1187899,13032,7,1
484427,1187899,26088,8,1
484428,1187899,27845,9,0
484429,1187899,49235,10,1


In [92]:
train_data[train_data["user_id"] == 1]

Unnamed: 0,user_product_reorder_rate,user_id,product_id,reordered
0,0.9,1,196,1
1,0.875,1,25133,1
2,0.0,1,38928,1
3,0.5,1,26405,1
4,0.0,1,39657,1
5,0.888889,1,10258,1
6,0.666667,1,13032,1
7,0.5,1,26088,1
8,,1,27845,0
9,0.5,1,49235,1


In [94]:
orders_df[orders_df["user_id"] == 3]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
26,1374495,3,prior,1,1,14,
27,444309,3,prior,2,3,19,9.0
28,3002854,3,prior,3,3,16,21.0
29,2037211,3,prior,4,2,18,20.0
30,2710558,3,prior,5,0,17,12.0
31,1972919,3,prior,6,0,16,7.0
32,1839752,3,prior,7,0,15,7.0
33,3225766,3,prior,8,0,17,7.0
34,3160850,3,prior,9,0,16,7.0
35,676467,3,prior,10,3,16,17.0


In [95]:
test_data[test_data["user_id"] == 3]

Unnamed: 0,user_product_reorder_rate,user_id,product_id
0,0.0,3,38596
1,0.875,3,21903
2,0.0,3,248
3,0.0,3,40604
4,0.0,3,8021
5,0.8,3,17668
6,0.0,3,21137
7,0.5,3,23650
8,0.666667,3,32402
9,0.9,3,39190


In [3]:
order_products_prior_df = order_products_prior_df.merge(
    orders_df[["order_id", "user_id"]], on="order_id", how="left"
)

In [None]:
# Создание базового DataFrame для признаков
def generate_features(order_products_prior):
    print("Генерация признаков...")

    # Признаки для каждой пары пользователь-товар
    user_product_features = order_products_prior.groupby(["user_id", "product_id"]).agg(
        user_product_orders=("order_id", "count"),
        user_product_reorders=("reordered", "sum"),
    )
    user_product_features["user_product_reorder_rate"] = (
        user_product_features["user_product_reorders"]
        / user_product_features["user_product_orders"]
    )

    # Возврат всех признаков
    return user_product_features


In [5]:
user_product_features = generate_features(order_products_prior_df)

Генерация признаков...


In [6]:
user_product_features

Unnamed: 0_level_0,Unnamed: 1_level_0,user_product_orders,user_product_reorders,user_product_reorder_rate
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,196,10,9,0.900000
1,10258,9,8,0.888889
1,10326,1,0,0.000000
1,12427,10,9,0.900000
1,13032,3,2,0.666667
...,...,...,...,...
206209,43961,3,2,0.666667
206209,44325,1,0,0.000000
206209,48370,1,0,0.000000
206209,48697,1,0,0.000000


In [None]:
def prepare_train_test_data(
    orders, order_products_prior, order_products_train, user_product_features
):
    print("Подготовка тренировочных и тестовых данных...")

    # Тренировочные данные
    train_orders = orders[orders["eval_set"] == "train"]
    train_data = train_orders.merge(order_products_train, on="order_id", how="left")
    train_data = train_data.merge(
        user_product_features, on=["user_id", "product_id"], how="left"
    )
    train_data["reordered"] = train_data["reordered"].fillna(0).astype(int)

    # Формирование тестовых данных
    test_orders = orders[orders["eval_set"] == "test"]
    test_data = test_orders[["order_id", "user_id"]].merge(
        order_products_prior[["user_id", "product_id"]].drop_duplicates(),
        on="user_id",
        how="left",
    )
    test_data = test_data.merge(
        orders[
            [
                "order_id",
                "days_since_prior_order",
                "order_dow",
                "order_hour_of_day",
                "order_number",
            ]
        ],
        on="order_id",
        how="left",
    )
    test_data = test_data.merge(
        user_product_features, on=["user_id", "product_id"], how="left"
    )

    features = [
        # "days_since_prior_order",
        # "order_dow",
        # "order_hour_of_day",
        # "order_number",
        # "user_product_orders",
        # "user_product_reorders",
        "user_product_reorder_rate",
        "user_id",
        "product_id",
    ]

    return train_data[features + ["reordered"]], test_data[features]


In [68]:
train_data, test_data = prepare_train_test_data(
    orders_df, order_products_prior_df, order_products_train_df, user_product_features
)

Подготовка тренировочных и тестовых данных...


In [69]:
train_data.columns

Index(['user_product_reorder_rate', 'user_id', 'product_id', 'reordered'], dtype='object')

In [70]:
test_data.columns

Index(['user_product_reorder_rate', 'user_id', 'product_id'], dtype='object')

In [55]:
set(train_data.columns) - set(test_data.columns)

{'reordered'}

In [90]:
test_data[test_data["user_id"] == 14]

Unnamed: 0,user_product_reorder_rate,user_id,product_id


In [98]:
def prepare_train_test_data_fixed(orders, order_products_train, order_products_prior):
    """
    Генерация тренировочных и тестовых данных с корректным таргетом
    """

    # Объединяем prior с информацией о заказах
    orders.set_index("order_id", inplace=True, drop=False)
    order_products_prior = order_products_prior.join(
        orders[["user_id", "order_number", "eval_set"]], on="order_id"
    )

    # Генерируем кандидатов для train
    train_orders = orders[orders["eval_set"] == "train"]
    train_candidates = train_orders[["order_id", "user_id"]].merge(
        order_products_prior[["user_id", "product_id"]], on="user_id"
    )

    # Отмечаем, какие продукты реально были заказаны
    train_candidates = train_candidates.merge(
        order_products_train[["order_id", "product_id"]],
        on=["order_id", "product_id"],
        how="left",
        indicator=True,
    )

    # Целевой столбец: 1 = продукт заказан, 0 = продукт не заказан
    train_candidates["target"] = (train_candidates["_merge"] == "both").astype(int)
    train_candidates.drop(columns=["_merge"], inplace=True)

    # Формируем тестовый набор данных
    test_orders = orders[orders["eval_set"] == "test"]
    test_candidates = test_orders[["order_id", "user_id"]].merge(
        order_products_prior[["user_id", "product_id"]], on="user_id"
    )

    return train_candidates, test_candidates


In [99]:
train_data, test_data = prepare_train_test_data_fixed(
    orders_df, order_products_prior_df, order_products_train_df
)

In [102]:
train_data[train_data["user_id"] == 1]

Unnamed: 0,order_id,user_id,product_id,target
0,1187899,1,196,0
1,1187899,1,25133,0
2,1187899,1,38928,0
3,1187899,1,26405,0
4,1187899,1,39657,0
5,1187899,1,10258,0
6,1187899,1,13032,0
7,1187899,1,26088,0
8,1187899,1,27845,0
9,1187899,1,49235,0


In [101]:
test_data

Unnamed: 0,order_id,user_id,product_id


In [58]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

In [59]:
# Разделение на обучение и валидацию
X = train_data.drop(columns=["reordered"])
y = train_data["reordered"]

In [60]:
X.head()

Unnamed: 0,user_product_reorder_rate
0,0.9
1,0.875
2,0.0
3,0.5
4,0.0


In [63]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: reordered, dtype: int64

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Создание модели LightGBM
model = lgb.LGBMClassifier()

# Обучение модели
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="auc",
)

# Предсказания на валидации
y_pred = model.predict(X_val)
y_pred_proba = model.predict_proba(X_val)[:, 1]

# Метрики
accuracy = accuracy_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_pred_proba)

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

[LightGBM] [Info] Number of positive: 662730, number of negative: 444963
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003403 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 79
[LightGBM] [Info] Number of data points in the train set: 1107693, number of used features: 1
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.598298 -> initscore=0.398377
[LightGBM] [Info] Start training from score 0.398377
Accuracy: 1.0000
ROC AUC: 1.0000


In [64]:
# Предсказания на валидации
y_pred = model.predict(X_train)
y_pred_proba = model.predict_proba(X_train)[:, 1]

# Метрики
accuracy = accuracy_score(y_train, y_pred)
roc_auc = roc_auc_score(y_train, y_pred_proba)

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

Accuracy: 1.0000
ROC AUC: 1.0000


In [49]:
y_pred, y_val

(array([0, 1, 1, ..., 0, 0, 1]),
 619147     0
 1208847    1
 42965      1
 760387     0
 416646     0
           ..
 73634      1
 500821     1
 487725     0
 318576     0
 404971     1
 Name: reordered, Length: 276924, dtype: int64)