In [28]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
data_path = "../data/"

orders = pd.read_csv(data_path + "orders.csv")
products = pd.read_csv(data_path + "products.csv")
order_products_prior = pd.read_csv(data_path + "order_products__prior.csv")

In [56]:
prior_orders = orders[orders["eval_set"] == "prior"]

In [111]:
user_id = 1

prior_user_orders = prior_orders[orders["user_id"] == user_id]
last_prior_user_order_id = prior_user_orders.loc[prior_user_orders["order_number"].idxmax(), "order_id"]
last_order_products = order_products_prior[order_products_prior["order_id"] == last_prior_user_order_id]["product_id"]
last_order_products

  prior_user_orders = prior_orders[orders["user_id"] == user_id]


24181266      196
24181267    46149
24181268    39657
24181269    38928
24181270    25133
24181271    10258
24181272    35951
24181273    13032
24181274    12427
Name: product_id, dtype: int64

In [31]:
# Пример агрегации данных пользователей и продуктов
users = orders.groupby("user_id").agg({
    "order_id": "count",
    "days_since_prior_order": "mean"
}).rename(columns={
    "order_id": "user_total_orders",
    "days_since_prior_order": "user_avg_days_between_orders"
})

products_agg = order_products_prior.groupby("product_id").agg({
    "reordered": "mean",
    "order_id": "count"
}).rename(columns={
    "reordered": "product_reorder_rate",
    "order_id": "product_total_orders"
})

users.reset_index(inplace=True)
products_agg.reset_index(inplace=True)

In [37]:
users

Unnamed: 0,user_id,user_total_orders,user_avg_days_between_orders
0,1,11,19.000000
1,2,15,16.285714
2,3,13,12.000000
3,4,6,17.000000
4,5,5,11.500000
...,...,...,...
206204,206205,4,16.666667
206205,206206,68,3.716418
206206,206207,17,14.312500
206207,206208,50,7.367347


In [None]:
products_agg[products_agg["product_reorder_rate"] > 0.9]["product_id"]

Unnamed: 0,product_id,product_reorder_rate,product_total_orders
2074,2075,0.931034,87
6430,6433,0.941176,68
13871,13875,0.911111,45
14605,14609,0.914286,35
27734,27740,0.920792,101
39984,39992,0.909091,22
43545,43553,0.923077,13


In [None]:
products_agg.sort_values(by="product_reorder_rate", ascending=False)[:10]

Unnamed: 0,product_id,product_reorder_rate,product_total_orders
6430,6433,0.941176,68
2074,2075,0.931034,87
43545,43553,0.923077,13
27734,27740,0.920792,101
14605,14609,0.914286,35
13871,13875,0.911111,45
39984,39992,0.909091,22
5865,5868,0.9,30
35598,35604,0.9,100
31412,31418,0.9,60


In [35]:
test_orders = orders[orders["eval_set"] == "test"]

In [48]:
test_orders

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
38,2774568,3,test,13,5,15,11.0
44,329954,4,test,6,3,12,30.0
53,1528013,6,test,4,3,16,22.0
96,1376945,11,test,8,6,11,8.0
102,1356845,12,test,6,1,20,30.0
...,...,...,...,...,...,...,...
3420918,2728930,206202,test,23,2,17,6.0
3420929,350108,206204,test,5,4,14,14.0
3421001,1043943,206206,test,68,0,20,0.0
3421018,2821651,206207,test,17,2,13,14.0


In [112]:
class ReorderRateClassificationModel():
    def __init__(self):
        self.data_path = "../data/"

        self.orders = self._get_orders()
        self.order_products = self._get_order_products()

    def _get_orders(self) -> pd.DataFrame:
        return pd.read_csv(self.data_path + "orders.csv") # меняется на запрос

    def _get_order_products(self) -> pd.DataFrame:
        return pd.read_csv(data_path + "order_products__prior.csv") # меняется на запрос

    def _get_last_order_products(self, user_id: int) -> pd.Series:
        prior_orders = self.orders[orders["eval_set"] == "prior"]
        prior_user_orders = prior_orders[orders["user_id"] == user_id]
        last_prior_user_order_id = prior_user_orders.loc[prior_user_orders["order_number"].idxmax(), "order_id"]
        last_order_products = order_products_prior[order_products_prior["order_id"] == last_prior_user_order_id]["product_id"]
        return last_order_products

    def _get_top_reordered_items(self, n: int = 10) -> pd.Series:
        products_agg = self.order_products.groupby("product_id").agg({
            "reordered": "mean",
        }).rename(columns={
            "reordered": "product_reorder_rate",
        })
        products_agg.reset_index(inplace=True)


        top_reordered_items = products_agg.sort_values(by="product_reorder_rate", ascending=False)[:n]
        return top_reordered_items


    def _get_features(self, user_id: int, product_id: int):
        df_features = pd.DataFrame()

        last_order_products = self._get_last_order_products(user_id)
        print(last_order_products)
        df_features["is_in_last_order"] = [(product_id in last_order_products)]

        top_reordered_items = self._get_top_reordered_items()
        print(top_reordered_items)
        df_features["is_in_top_reordered"] = [(product_id in top_reordered_items)]

        return df_features

        
    def predict(self, user_id: int, product_id: int):
        df_features = self._get_features(user_id, product_id)

        if df_features["is_in_last_order"][0]:
            return 1
        
        if df_features["is_in_top_reordered"][0]:
            return 1
        
        return 0

In [113]:
clf_model = ReorderRateClassificationModel()

In [114]:
clf_model.predict(1, 196)

  prior_user_orders = prior_orders[orders["user_id"] == user_id]


24181266      196
24181267    46149
24181268    39657
24181269    38928
24181270    25133
24181271    10258
24181272    35951
24181273    13032
24181274    12427
Name: product_id, dtype: int64
       product_id  product_reorder_rate
6430         6433              0.941176
2074         2075              0.931034
43545       43553              0.923077
27734       27740              0.920792
14605       14609              0.914286
13871       13875              0.911111
39984       39992              0.909091
5865         5868              0.900000
35598       35604              0.900000
31412       31418              0.900000


0

In [98]:
df = pd.DataFrame()
df["first"] = [(1 in [1, 2, 3])]

In [None]:
df["second"] = [(0 in [])]

In [106]:
df["first"][0] == True

np.True_