In [None]:
import pathlib

import lightgbm as lgb
import numpy as np
import pandas as pd

# Folder with the input CSV files; it is prepended to each file name passed to pd.read_csv.
IDIR = "../data/"

### Загрузка данных

In [None]:
# Column dtypes shared by order_products__prior.csv and order_products__train.csv
# (both files expose the same four columns), kept narrow to limit memory use.
order_products_dtypes = {
    "order_id": np.int32,
    "product_id": np.uint16,
    "add_to_cart_order": np.int16,
    "reordered": np.int8,
}

print("loading prior")
priors = pd.read_csv(IDIR + "order_products__prior.csv", dtype=order_products_dtypes)

print("loading train")
train = pd.read_csv(IDIR + "order_products__train.csv", dtype=order_products_dtypes)

print("loading orders")
orders = pd.read_csv(
    IDIR + "orders.csv",
    dtype={
        "order_id": np.int32,
        "user_id": np.int32,
        "eval_set": "category",
        "order_number": np.int16,
        "order_dow": np.int8,
        "order_hour_of_day": np.int8,
        # float because the column contains missing values
        "days_since_prior_order": np.float32,
    },
)

print("loading products")
# Only the three columns listed in usecols are read, so only they need a dtype.
products = pd.read_csv(
    IDIR + "products.csv",
    dtype={
        "product_id": np.uint16,
        "aisle_id": np.uint8,
        "department_id": np.uint8,
    },
    usecols=["product_id", "aisle_id", "department_id"],
)

# Quick sanity check: shape and column names of the three large tables.
for table_name, table in (("priors", priors), ("orders", orders), ("train", train)):
    print(f"{table_name} {table.shape}: {', '.join(table.columns)}")

loading prior
loading train
loading orders
loading products
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered


### Join в одну таблицу

In [None]:
print("computing product f")
# Per-product statistics over all prior orders: how often the product was
# ordered, how often it was a reorder, and the resulting reorder rate.
reordered_by_product = priors.groupby("product_id")["reordered"]
prods = pd.DataFrame(
    {
        "orders": reordered_by_product.size().astype(np.int32),
        "reorders": reordered_by_product.sum().astype(np.float32),
    }
)
prods["reorder_rate"] = (prods["reorders"] / prods["orders"]).astype(np.float32)
# Attach the statistics to the product table and index it by product_id
# (the column is kept as well) so later cells can look values up by id.
products = products.join(prods, on="product_id").set_index("product_id", drop=False)
del prods


print("add order info to priors")
# Index orders by order_id (keeping the column) so it can be joined and mapped by id.
orders = orders.set_index("order_id", drop=False)
# The join duplicates order_id from the right side as "order_id_"; drop that copy.
priors = priors.join(orders, on="order_id", rsuffix="_").drop(columns="order_id_")

computing product f
add order info to priors


In [5]:
### user features

print("computing user f")
# Order-level statistics per user, computed over every row of `orders`.
orders_by_user = orders.groupby("user_id")
usr = pd.DataFrame(
    {
        "average_days_between_orders": orders_by_user["days_since_prior_order"]
        .mean()
        .astype(np.float32),
        "nb_orders": orders_by_user.size().astype(np.int16),
    }
)
del orders_by_user

# Basket-level statistics per user, computed over the prior order lines only.
priors_by_user = priors.groupby("user_id")
users = pd.DataFrame(
    {
        "total_items": priors_by_user.size().astype(np.int16),
        # set of every product the user bought in prior orders
        "all_products": priors_by_user["product_id"].apply(set),
    }
)
del priors_by_user
users["total_distinct_items"] = users["all_products"].map(len).astype(np.int16)

users = users.join(usr)
del usr
users["average_basket"] = (users["total_items"] / users["nb_orders"]).astype(
    np.float32
)
print("user f", users.shape)

### userXproduct features

print("compute userXproduct f - this is long...")
# One integer key per (user, product) pair.
# NOTE(review): user_id is read as int32 and ids up to 206209 appear in the
# orders table, so user_id * 100000 exceeds the int32 range and presumably
# wraps, letting distinct pairs share a key. get_features builds its lookup
# key with the same expression, so both places must be changed together -
# confirm before widening to int64.
priors["user_product"] = priors.product_id + priors.user_id * 100000

# Vectorised replacement for a row-by-row Python loop: per key, count the
# order lines and sum the cart positions.
up_groups = priors.groupby("user_product", sort=False)
up_nb_orders = up_groups.size()
up_sum_pos_in_cart = up_groups["add_to_cart_order"].sum()
del up_groups

# Last order per key = the row with the largest (order_number, order_id):
# sort ascending on both columns and keep the final order_id of each group.
up_last_order_id = (
    priors[["user_product", "order_number", "order_id"]]
    .sort_values(["order_number", "order_id"])
    .groupby("user_product", sort=False)["order_id"]
    .last()
)

print("to dataframe (less memory)")
# The three series share the same keys; the constructor aligns them on the index.
userXproduct = pd.DataFrame(
    {
        "nb_orders": up_nb_orders.astype(np.int16),
        "last_order_id": up_last_order_id.astype(np.int32),
        "sum_pos_in_cart": up_sum_pos_in_cart.astype(np.int16),
    }
)
del up_nb_orders, up_last_order_id, up_sum_pos_in_cart
print("user X product f", len(userXproduct))

# priors is not needed by the following cells; free the memory.
del priors


computing user f
user f (206209, 6)
compute userXproduct f - this is long...
to dataframe (less memory)
user X product f 13293564


In [None]:
### train / test orders ###
print("split orders : train, test")
# Rows of `orders` whose eval_set marks them as the test or the train order.
test_orders = orders.loc[orders["eval_set"] == "test"]
train_orders = orders.loc[orders["eval_set"] == "train"]

# Index train by (order_id, product_id), keeping both columns, so that
# membership of a pair can be tested against train.index.
train = train.set_index(["order_id", "product_id"], drop=False)

### build list of candidate products to reorder, with features ###


def get_features(selected_orders, labels_given=False):
    """Build the candidate (order, product) table and its model features.

    For every order in ``selected_orders`` one row is created per product in
    that user's ``users.all_products`` set. Feature columns are then filled by
    looking values up in the module-level ``orders``, ``users``, ``products``
    and ``userXproduct`` frames.

    Parameters
    ----------
    selected_orders : DataFrame
        Orders to build candidates for; ``order_id`` and ``user_id`` are read
        from each row.
    labels_given : bool, default False
        When true, each candidate is labelled by whether its
        ``(order_id, product_id)`` pair is present in ``train.index``.

    Returns
    -------
    tuple
        ``(df, labels)``: the feature frame and an int8 array of labels
        (empty when ``labels_given`` is false).
    """
    print("build candidate list")
    order_ids = []
    product_ids = []
    labels = []
    for seen, order in enumerate(selected_orders.itertuples(), start=1):
        if seen % 10000 == 0:
            print("order row", seen)
        # every product this user has bought before is a candidate
        candidates = users.all_products[order.user_id]
        product_ids.extend(candidates)
        order_ids.extend([order.order_id] * len(candidates))
        if labels_given:
            labels.extend(
                (order.order_id, product) in train.index for product in candidates
            )

    df = pd.DataFrame(
        {"order_id": order_ids, "product_id": product_ids}, dtype=np.int32
    )
    labels = np.array(labels, dtype=np.int8)
    del order_ids, product_ids

    print("user related features")
    df["user_id"] = df["order_id"].map(orders["user_id"])
    # feature column -> column of `users` it is copied from
    user_sources = {
        "user_total_orders": "nb_orders",
        "user_total_items": "total_items",
        "total_distinct_items": "total_distinct_items",
        "user_average_days_between_orders": "average_days_between_orders",
        "user_average_basket": "average_basket",
    }
    for feature, source in user_sources.items():
        df[feature] = df["user_id"].map(users[source])

    print("order related features")
    # Day-of-week features (dow, UP_same_dow_as_last_order) are not computed.
    df["order_hour_of_day"] = df["order_id"].map(orders["order_hour_of_day"])
    df["days_since_prior_order"] = df["order_id"].map(
        orders["days_since_prior_order"]
    )
    df["days_since_ratio"] = (
        df["days_since_prior_order"] / df["user_average_days_between_orders"]
    )

    print("product related features")
    df["aisle_id"] = df["product_id"].map(products["aisle_id"])
    df["department_id"] = df["product_id"].map(products["department_id"])
    df["product_orders"] = df["product_id"].map(products["orders"]).astype(np.int32)
    df["product_reorders"] = df["product_id"].map(products["reorders"])
    df["product_reorder_rate"] = df["product_id"].map(products["reorder_rate"])

    print("user_X_product related features")
    # Lookup key into userXproduct; must be built exactly like its index.
    # NOTE(review): user_id * 100000 looks like it can exceed the int32 range
    # for larger user ids - confirm, and change together with the cell that
    # builds userXproduct.
    df["z"] = df["user_id"] * 100000 + df["product_id"]
    df = df.drop(columns=["user_id"])
    df["UP_orders"] = df["z"].map(userXproduct["nb_orders"])
    df["UP_orders_ratio"] = (df["UP_orders"] / df["user_total_orders"]).astype(
        np.float32
    )
    df["UP_last_order_id"] = df["z"].map(userXproduct["last_order_id"])
    df["UP_average_pos_in_cart"] = (
        df["z"].map(userXproduct["sum_pos_in_cart"]) / df["UP_orders"]
    ).astype(np.float32)
    # NOTE(review): same expression as UP_orders_ratio, so the two columns are
    # identical - confirm whether a different definition was intended.
    df["UP_reorder_rate"] = (df["UP_orders"] / df["user_total_orders"]).astype(
        np.float32
    )
    last_order_number = df["UP_last_order_id"].map(orders["order_number"])
    df["UP_orders_since_last"] = df["user_total_orders"] - last_order_number
    # Hour difference to the last order containing the product, measured
    # around the 24-hour clock (e.g. 23h vs 1h -> 2).
    last_order_hour = df["UP_last_order_id"].map(orders["order_hour_of_day"])
    hour_gap = (df["order_hour_of_day"] - last_order_hour).abs()
    df["UP_delta_hour_vs_last"] = np.minimum(hour_gap, 24 - hour_gap).astype(np.int8)

    # helper columns are not features
    df = df.drop(columns=["UP_last_order_id", "z"])
    print(df.dtypes)
    print(df.memory_usage())
    return (df, labels)

split orders : train, test


In [None]:
# Candidate rows and 0/1 labels for the train orders.
df_train, labels = get_features(train_orders, labels_given=True)

# Columns handed to the model, in this order.
f_to_use = [
    # user level
    "user_total_orders",
    "user_total_items",
    "total_distinct_items",
    "user_average_days_between_orders",
    "user_average_basket",
    # order level
    "order_hour_of_day",
    "days_since_prior_order",
    "days_since_ratio",
    # product level
    "aisle_id",
    "department_id",
    "product_orders",
    "product_reorders",
    "product_reorder_rate",
    # user x product level
    "UP_orders",
    "UP_orders_ratio",
    "UP_average_pos_in_cart",
    "UP_reorder_rate",
    "UP_orders_since_last",
    "UP_delta_hour_vs_last",
]  # not included: 'dow', 'UP_same_dow_as_last_order'


print("formating for lgb")
# aisle_id and department_id are declared categorical
# ('order_hour_of_day' and 'dow' are not).
d_train = lgb.Dataset(
    data=df_train[f_to_use],
    label=labels,
    categorical_feature=["aisle_id", "department_id"],
)
# the Dataset has been built from the frame; release the frame
del df_train

build candidate list
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
order row 80000
order row 90000
order row 100000
order row 110000
order row 120000
order row 130000
user related features
order related features
product related features
user_X_product related features
order_id                              int32
product_id                            int32
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_average_basket                 float32
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
aisle_id                              uint8
department_id                         uint8
product_orders                        int32
product_reorders                    float32
product_reorder_rate                float32
UP_or

In [8]:
# Number of boosting rounds passed to lgb.train.
ROUNDS = 10

# LightGBM parameters for a binary classifier evaluated with log loss.
params = dict(
    task="train",
    boosting_type="gbdt",
    objective="binary",
    metric={"binary_logloss"},
    num_leaves=96,
    max_depth=10,
    feature_fraction=0.9,
    bagging_fraction=0.95,
    bagging_freq=5,
)

print("light GBM train :-)")
bst = lgb.train(params=params, train_set=d_train, num_boost_round=ROUNDS)
# lgb.plot_importance(bst, figsize=(9,20))
# the training Dataset is no longer needed once the booster exists
del d_train

light GBM train :-)
[LightGBM] [Info] Number of positive: 828824, number of negative: 7645837
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.313681 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3280
[LightGBM] [Info] Number of data points in the train set: 8474661, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.097800 -> initscore=-2.221909
[LightGBM] [Info] Start training from score -2.221909


In [None]:
### build candidates list for test ###

df_test, _ = get_features(test_orders)

print("light GBM predict")
preds = bst.predict(df_test[f_to_use])

df_test["pred"] = preds

TRESHOLD = 0.22  # guess, should be tuned with crossval on a subset of train data

# Keep the candidates scored above the threshold and group their product ids
# per order, in row order. An explicit dict of lists replaces the previous
# bare `except:` (which could hide unrelated errors) and the repeated string
# concatenation.
picked = df_test.loc[df_test["pred"] > TRESHOLD, ["order_id", "product_id"]]
products_by_order = {}
for picked_order, picked_product in zip(picked["order_id"], picked["product_id"]):
    products_by_order.setdefault(picked_order, []).append(str(picked_product))

# order_id -> space-separated product ids
d = {
    picked_order: " ".join(product_ids)
    for picked_order, product_ids in products_by_order.items()
}

# Every test order needs a row; orders without a selected product get "None".
for order in test_orders.order_id:
    if order not in d:
        d[order] = "None"

# Built directly from the dict so that an empty result still yields both columns.
sub = pd.DataFrame({"order_id": list(d.keys()), "products": list(d.values())})
sub.to_csv("sub.csv", index=False)

build candidate list
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
user related features
order related features
product related features
user_X_product related features
order_id                              int32
product_id                            int32
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_average_basket                 float32
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
aisle_id                              uint8
department_id                         uint8
product_orders                        int32
product_reorders                    float32
product_reorder_rate                float32
UP_orders                             int16
UP_orders_ratio                     float32
UP_average_pos_in

In [10]:
# Make sure the target folder exists before writing, so the cell also works on
# a fresh checkout where ./models has not been created yet.
pathlib.Path("./models").mkdir(parents=True, exist_ok=True)
bst.save_model('./models/baseline_model.bin')

<lightgbm.basic.Booster at 0x14a98278dd0>

### Testing

In [29]:
import pathlib

# Reload the booster from the file written by the save cell, as a check that
# the saved model can be read back.
model_filename = pathlib.Path("models") / "baseline_model.bin"
model = lgb.Booster(model_file=model_filename)

In [31]:
# Take the candidate row with index label 0, restricted to the model features;
# selecting a single row with .loc yields a Series.
x = df_test.loc[0, f_to_use]
display(x)

user_total_orders                     13.000000
user_total_items                      88.000000
total_distinct_items                  33.000000
user_average_days_between_orders      12.000000
user_average_basket                    6.769231
order_hour_of_day                     15.000000
days_since_prior_order                11.000000
days_since_ratio                       0.916667
aisle_id                              91.000000
department_id                         16.000000
product_orders                      2110.000000
product_reorders                    1220.000000
product_reorder_rate                   0.578199
UP_orders                              5.000000
UP_orders_ratio                        0.384615
UP_average_pos_in_cart                 3.600000
UP_reorder_rate                        0.384615
UP_orders_since_last                   2.000000
UP_delta_hour_vs_last                  3.000000
Name: 0, dtype: float64

In [32]:
# Score the single row with the reloaded booster; the value should match the
# `pred` stored for row 0 of df_test.
model.predict(x)



array([0.2688697])

In [21]:
# Inspect the model input columns of the test candidates.
df_test[f_to_use]

Unnamed: 0,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,order_hour_of_day,days_since_prior_order,days_since_ratio,aisle_id,department_id,product_orders,product_reorders,product_reorder_rate,UP_orders,UP_orders_ratio,UP_average_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last
0,13,88,33,12.000000,6.769231,15,11.0,0.916667,91,16,2110,1220.0,0.578199,5,0.384615,3.600000,0.384615,2,3
1,13,88,33,12.000000,6.769231,15,11.0,0.916667,83,4,22275,11981.0,0.537868,2,0.153846,9.500000,0.153846,7,1
2,13,88,33,12.000000,6.769231,15,11.0,0.916667,37,1,5129,2376.0,0.463248,2,0.153846,6.500000,0.153846,4,1
3,13,88,33,12.000000,6.769231,15,11.0,0.916667,123,4,241921,186884.0,0.772500,8,0.615385,4.250000,0.615385,1,0
4,13,88,33,12.000000,6.769231,15,11.0,0.916667,83,4,29069,16942.0,0.582820,2,0.153846,7.000000,0.153846,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4833287,50,677,198,7.367347,13.540000,11,4.0,0.542936,3,19,311,124.0,0.398714,1,0.020000,11.000000,0.020000,7,3
4833288,50,677,198,7.367347,13.540000,11,4.0,0.542936,123,4,9240,5420.0,0.586580,7,0.140000,5.571429,0.140000,4,1
4833289,50,677,198,7.367347,13.540000,11,4.0,0.542936,3,19,1393,755.0,0.541996,1,0.020000,12.000000,0.020000,7,3
4833290,50,677,198,7.367347,13.540000,11,4.0,0.542936,72,13,6046,1424.0,0.235528,1,0.020000,9.000000,0.020000,8,8


In [20]:
# Inspect the full test candidate table, including order_id, product_id and pred.
df_test

Unnamed: 0,order_id,product_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,order_hour_of_day,days_since_prior_order,days_since_ratio,...,product_orders,product_reorders,product_reorder_rate,UP_orders,UP_orders_ratio,UP_average_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last,pred
0,2774568,17668,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,2110,1220.0,0.578199,5,0.384615,3.600000,0.384615,2,3,0.268870
1,2774568,44683,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,22275,11981.0,0.537868,2,0.153846,9.500000,0.153846,7,1,0.080146
2,2774568,48523,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,5129,2376.0,0.463248,2,0.153846,6.500000,0.153846,4,1,0.089660
3,2774568,21903,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,241921,186884.0,0.772500,8,0.615385,4.250000,0.615385,1,0,0.472873
4,2774568,14992,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,29069,16942.0,0.582820,2,0.153846,7.000000,0.153846,6,0,0.084871
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4833287,803273,44532,50,677,198,7.367347,13.540000,11,4.0,0.542936,...,311,124.0,0.398714,1,0.020000,11.000000,0.020000,7,3,0.049083
4833288,803273,46069,50,677,198,7.367347,13.540000,11,4.0,0.542936,...,9240,5420.0,0.586580,7,0.140000,5.571429,0.140000,4,1,0.126170
4833289,803273,12791,50,677,198,7.367347,13.540000,11,4.0,0.542936,...,1393,755.0,0.541996,1,0.020000,12.000000,0.020000,7,3,0.051341
4833290,803273,14332,50,677,198,7.367347,13.540000,11,4.0,0.542936,...,6046,1424.0,0.235528,1,0.020000,9.000000,0.020000,8,8,0.045858


In [18]:
# Inspect the orders whose eval_set is "test".
test_orders

Unnamed: 0_level_0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2774568,2774568,3,test,13,5,15,11.0
329954,329954,4,test,6,3,12,30.0
1528013,1528013,6,test,4,3,16,22.0
1376945,1376945,11,test,8,6,11,8.0
1356845,1356845,12,test,6,1,20,30.0
...,...,...,...,...,...,...,...
2728930,2728930,206202,test,23,2,17,6.0
350108,350108,206204,test,5,4,14,14.0
1043943,1043943,206206,test,68,0,20,0.0
2821651,2821651,206207,test,17,2,13,14.0


In [19]:
# Inspect the orders table (indexed by order_id, with the column kept).
orders

Unnamed: 0_level_0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2539329,2539329,1,prior,1,2,8,
2398795,2398795,1,prior,2,3,7,15.0
473747,473747,1,prior,3,3,12,21.0
2254736,2254736,1,prior,4,4,7,29.0
431534,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
2266710,2266710,206209,prior,10,5,18,29.0
1854736,1854736,206209,prior,11,4,10,30.0
626363,626363,206209,prior,12,1,12,18.0
2977660,2977660,206209,prior,13,1,12,7.0
