In [1]:
from IPython.display import display, Markdown

import pandas as pd
import gc

# Keras

from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# LabelEncoder

from sklearn.preprocessing import LabelEncoder

# Train Test Split

from sklearn.model_selection import train_test_split

In [2]:
def load_comp():
    # Load the competition data
    comp_data = pd.read_csv("data/competition_data.csv")
    comp_data = comp_data.drop(
        columns=[
            "benefit", 
            "etl_version", 
            "uid",
            "date",
            "deal_print_id",
            "full_name",
            "main_picture",
            "warranty",
            "tags",
            "print_server_timestamp",
            "print_position",
            "title"
        ]
    )
    # OHE comp_data columns
    comp_data = pd.get_dummies(comp_data, 
        sparse=True,
        columns=[
            #"category_id", 
            #"domain_id", 
            "logistic_type", 
            "platform", 
            "site_id"
        ],
        dtype=int
    )
    comp_data["accepts_mercadopago"] = comp_data["accepts_mercadopago"].astype(int)
    comp_data["boosted"] = comp_data["boosted"].astype(int)
    comp_data["free_shipping"] = comp_data["free_shipping"].astype(int)
    comp_data["fulfillment"] = comp_data["fulfillment"].astype(int)

    #comp_data["is_pdp"] = comp_data["is_pdp"].astype(int)
    #comp_data["warranty"] = comp_data["warranty"].astype(int)

    # comp_data["listing_type_id"] to 0 if gold_special, 1 if gold_pro.
    comp_data["listing_type_id"] = comp_data["listing_type_id"].apply(lambda x: 0 if x == "gold_special" else 1)
    
    # Label encode category_id and domain_id
    # comp_data["category_id"] = comp_data["category_id"].astype("category")#.cat.codes
    #comp_data["domain_id"] = comp_data["domain_id"].astype("category")#.cat.codes

    # sklearn LabelEncoder for category_id and domain_id
    le = LabelEncoder()
    comp_data["category_id"] = le.fit_transform(comp_data["category_id"])
    comp_data["domain_id"] = le.fit_transform(comp_data["domain_id"])

    comp_data["price_diff"] = comp_data["price"] - comp_data["original_price"]
    comp_data["cheaper_than_original"] = comp_data["price_diff"].apply(lambda x: 1 if x < 0 else 0)
    comp_data["price_diff"] = comp_data["price_diff"].apply(lambda x: abs(x)).astype(int)

    #comp_data["cheaper_than_avg"] = comp_data["price"] - comp_data["avg_asp_item_domain"]
    #comp_data["cheaper_than_avg"] = comp_data["cheaper_than_avg"].apply(lambda x: 1 if x < 0 else 0).astype(int)

    # Drop useless columns
    # comp_data = comp_data.drop(
    #     columns=[
    #     ]
    # )

    return comp_data

# Split into training and evaluation samples
comp_data = load_comp()

display(
    Markdown(
        "\n".join(
            [
                "- **{}** ({}) \n\n {}".format(col, dtype, comp_data[col].value_counts()) for col, dtype in
                zip(comp_data.columns, comp_data.dtypes)
            ]
        )
    )
)

full_data = comp_data[comp_data["ROW_ID"].isna()]
eval_data = comp_data[comp_data["ROW_ID"].notna()]
del comp_data
gc.collect()

train_data, test_data = train_test_split(full_data, test_size=0.20, train_size=0.80, random_state=42)

y_train = train_data["conversion"]
X_train = train_data.drop(columns=["conversion", "ROW_ID"])
X_train = X_train.select_dtypes(include='number')

y_test = test_data["conversion"]
X_test = test_data.drop(columns=["conversion", "ROW_ID"])
X_test = X_test.select_dtypes(include='number')

del train_data
del test_data

gc.collect()

- **accepts_mercadopago** (int64) 

 1    199972
Name: accepts_mercadopago, dtype: int64
- **available_quantity** (int64) 

 1        5041
2        4272
3        3744
4        3657
5        3111
         ... 
3145        1
7041        1
19730       1
7929        1
37905       1
Name: available_quantity, Length: 6937, dtype: int64
- **avg_gmv_item_domain_30days** (float64) 

 144.419645    1332
144.390746    1192
169.234106     994
151.463589     991
145.804494     985
              ... 
401.363777       1
136.372464       1
40.951290        1
22.593832        1
174.079162       1
Name: avg_gmv_item_domain_30days, Length: 17727, dtype: int64
- **avg_gmv_item_sel** (float64) 

 228.776307      578
10406.540727    540
213.130612      489
182.490736      474
214.441442      474
               ... 
164.803436        1
52.976757         1
60.970952         1
125.980833        1
23.151458         1
Name: avg_gmv_item_sel, Length: 18652, dtype: int64
- **avg_gmv_seller_bday** (float64) 

 16479.520000    578
19078.658000    540
14869.412333    489
12074.803667    474
15068.085333    474
               ... 
7546.452333       1
167.065333        1
424.793000        1
831.559667        1
37.042333         1
Name: avg_gmv_seller_bday, Length: 18634, dtype: int64
- **avg_qty_orders_item_domain_30days** (float64) 

 2.927004    1332
2.919493    1192
3.505991     994
2.988779     991
2.917104     985
            ... 
2.910275       1
1.456693       1
2.448889       1
3.390135       1
5.377246       1
Name: avg_qty_orders_item_domain_30days, Length: 16760, dtype: int64
- **avg_qty_orders_item_sel_30days** (float64) 

 4.282277      578
258.363636    540
4.302914      489
3.109320      474
4.266129      474
             ... 
2.147826        1
1.765217        1
21.280488       1
3.675926        1
1.676829        1
Name: avg_qty_orders_item_sel_30days, Length: 15129, dtype: int64
- **avg_si_item_sel_30day** (float64) 

 4.342897      578
288.854545    540
4.413760      489
3.153149      474
4.372865      474
             ... 
14.172414       1
5.057971        1
5.568465        1
2.041667        1
2.104167        1
Name: avg_si_item_sel_30day, Length: 15527, dtype: int64
- **boosted** (int64) 

 0    199972
Name: boosted, dtype: int64
- **category_id** (int64) 

 62      12333
49       8641
611      5870
1        5446
518      4154
        ...  
1105        1
1634        1
794         1
1395        1
2257        1
Name: category_id, Length: 2284, dtype: int64
- **conversion** (float64) 

 0.0    164017
1.0     16744
Name: conversion, dtype: int64
- **domain_id** (int64) 

 1380    18203
299      8641
1512     5446
776      4966
742      3151
        ...  
440         1
1138        1
1063        1
448         1
506         1
Name: domain_id, Length: 1742, dtype: int64
- **free_shipping** (int64) 

 0    120063
1     79909
Name: free_shipping, dtype: int64
- **fulfillment** (int64) 

 0    150890
1     49082
Name: fulfillment, dtype: int64
- **health** (float64) 

 0.90    29879
0.80    26008
0.81    22010
0.77    18326
0.88    17643
1.00    16830
0.72    10282
0.83     9896
0.70     7913
0.87     7358
0.91     6497
0.75     6177
0.66     3848
0.00     3819
0.63     3503
0.85     2767
0.60     1344
0.54      920
0.84      790
0.76      754
0.58      697
0.50      660
0.92      444
0.71      430
0.57      415
0.55      249
0.62      206
0.69      148
0.45       69
0.42       54
0.61       21
0.44        7
0.40        4
0.36        2
0.53        2
Name: health, dtype: int64
- **is_pdp** (object) 

 False    109136
True        652
Name: is_pdp, dtype: int64
- **product_id** (float64) 

 15190794.0    662
15315495.0    662
15582422.0    629
8730036.0     516
14542731.0    468
             ... 
13238064.0      1
8988820.0       1
15272931.0      1
13255928.0      1
12308454.0      1
Name: product_id, Length: 4440, dtype: int64
- **item_id** (object) 

 MLA784334044    717
MLA764888251    701
MLA847531773    622
MLA785891221    535
MLA766402693    517
               ... 
MLA833936427      1
MLA663926326      1
MLA784136287      1
MLA812482695      1
MLA762298064      1
Name: item_id, Length: 27695, dtype: int64
- **listing_type_id** (int64) 

 0    165219
1     34753
Name: listing_type_id, dtype: int64
- **offset** (int64) 

 0       69008
48      28743
96      18677
144     13473
192     10561
        ...  
6288        1
7248        1
7008        1
7680        1
7152        1
Name: offset, Length: 209, dtype: int64
- **original_price** (int64) 

 1999     3280
999      2250
2999     1979
699      1760
3999     1670
         ... 
6052        1
7381        1
3881        1
2151        1
14374       1
Name: original_price, Length: 6469, dtype: int64
- **price** (int64) 

 2999     1781
1999     1741
1599     1688
3299     1645
999      1573
         ... 
3843        1
2268        1
54990       1
9668        1
21237       1
Name: price, Length: 6029, dtype: int64
- **qty_items_dom** (float64) 

 20097.0    1332
20495.0    1192
24036.0     994
23081.0     991
21364.0     985
           ... 
4538.0        1
2869.0        1
2849.0        1
6254.0        1
2760.0        1
Name: qty_items_dom, Length: 4193, dtype: int64
- **qty_items_sel** (float64) 

 50.0      1904
51.0      1427
55.0      1108
73.0      1083
53.0       994
          ... 
678.0        1
1138.0       1
1607.0       1
1195.0       1
2284.0       1
Name: qty_items_sel, Length: 1462, dtype: int64
- **sold_quantity** (int64) 

 1        2892
2        2721
3        2684
0        2661
4        2445
         ... 
4065        1
2718        1
7318        1
13288       1
3010        1
Name: sold_quantity, Length: 5718, dtype: int64
- **total_asp_item_domain_30days** (float64) 

 55.778233    1332
55.962392    1192
54.675241     994
56.862608     991
56.362453     985
             ... 
11.691916       1
16.234278       1
17.845285       1
33.731286       1
30.710993       1
Name: total_asp_item_domain_30days, Length: 17729, dtype: int64
- **total_asp_item_sel_30days** (float64) 

 52.675575    578
38.512522    540
51.464741    489
53.133151    474
51.914352    474
            ... 
6.484324       1
14.827999      1
28.673431      1
50.753049      1
10.497735      1
Name: total_asp_item_sel_30days, Length: 18651, dtype: int64
- **total_gmv_domain_bday** (float64) 

 96746.720333     1332
98642.944667     1192
135590.366000     994
116531.036667     991
103832.240000     985
                 ... 
17499.460667        1
1900.123000         1
169.265333          1
206.357000          1
969.040667          1
Name: total_gmv_domain_bday, Length: 17717, dtype: int64
- **total_gmv_item_30days** (float64) 

 56749.36    207
205.13      191
7464.25     176
16430.75    172
3923.66     159
           ... 
224.80        1
642.31        1
1251.60       1
112.82        1
86.74         1
Name: total_gmv_item_30days, Length: 45132, dtype: int64
- **total_items_domain** (int64) 

 149989    1401
149263    1194
166723     994
141335     993
147132     985
          ... 
4090         1
2201         1
23088        1
5779         1
2675         1
Name: total_items_domain, Length: 10309, dtype: int64
- **total_items_seller** (int64) 

 68       1340
86        783
84        777
69        698
11678     578
         ... 
1895        1
2464        1
1757        1
1678        1
3692        1
Name: total_items_seller, Length: 3240, dtype: int64
- **total_orders_domain_30days** (float64) 

 58824.0    1332
59835.0    1192
84270.0     994
68984.0     991
62321.0     985
           ... 
21225.0       1
9182.0        1
7520.0        1
7659.0        1
4857.0        1
Name: total_orders_domain_30days, Length: 7882, dtype: int64
- **total_orders_item_30days** (float64) 

 1.0      11380
3.0       9192
2.0       9043
4.0       7991
5.0       6995
         ...  
539.0        1
669.0        1
328.0        1
642.0        1
384.0        1
Name: total_orders_item_30days, Length: 635, dtype: int64
- **total_orders_sel_30days** (float64) 

 9254.0     578
14210.0    540
9006.0     489
6172.0     474
8993.0     474
          ... 
6800.0       1
3885.0       1
3113.0       1
856.0        1
3689.0       1
Name: total_orders_sel_30days, Length: 4413, dtype: int64
- **total_si_domain_30days** (float64) 

 59860.0    1332
60927.0    1192
85853.0     994
70300.0     991
63507.0     985
           ... 
8811.0        1
8356.0        1
3205.0        1
3487.0        1
14509.0       1
Name: total_si_domain_30days, Length: 9157, dtype: int64
- **total_si_item_30days** (float64) 

 1.0       10324
2.0        8033
3.0        7747
4.0        6878
5.0        6315
          ...  
672.0         1
1129.0        1
608.0         1
700.0         1
971.0         1
Name: total_si_item_30days, Length: 850, dtype: int64
- **total_si_sel_30days** (float64) 

 9385.0     578
15887.0    540
9238.0     489
9218.0     474
6259.0     474
          ... 
3708.0       1
4848.0       1
13328.0      1
4714.0       1
3928.0       1
Name: total_si_sel_30days, Length: 5301, dtype: int64
- **total_visits_domain** (int64) 

 9324517     1401
9042945     1194
13032888     994
7310127      993
8422586      985
            ... 
1198950        1
139838         1
134867         1
68667          1
39030          1
Name: total_visits_domain, Length: 17646, dtype: int64
- **total_visits_item** (int64) 

 256      298
279      269
24       262
77       245
305      244
        ... 
11880      1
2483       1
7678       1
9602       1
6251       1
Name: total_visits_item, Length: 9618, dtype: int64
- **total_visits_seller** (int64) 

 1324179    578
1375253    558
803487     540
1353845    478
1267081    474
          ... 
23854        1
30964        1
44695        1
27382        1
7358         1
Name: total_visits_seller, Length: 17762, dtype: int64
- **user_id** (float64) 

 33380364.0     49
302970143.0    25
136144636.0    21
307287134.0    20
212372566.0    18
               ..
158113079.0     1
140695408.0     1
513841702.0     1
210755401.0     1
485647845.0     1
Name: user_id, Length: 139110, dtype: int64
- **ROW_ID** (float64) 

 0.0        1
12813.0    1
12811.0    1
12810.0    1
12809.0    1
          ..
6403.0     1
6402.0     1
6401.0     1
6400.0     1
19210.0    1
Name: ROW_ID, Length: 19211, dtype: int64
- **logistic_type_cross_docking** (Sparse[int64, 0]) 

 1    108346
0     91626
Name: logistic_type_cross_docking, dtype: int64
- **logistic_type_custom** (Sparse[int64, 0]) 

 0    196272
1      3700
Name: logistic_type_custom, dtype: int64
- **logistic_type_default** (Sparse[int64, 0]) 

 0    199645
1       327
Name: logistic_type_default, dtype: int64
- **logistic_type_drop_off** (Sparse[int64, 0]) 

 0    175855
1     24117
Name: logistic_type_drop_off, dtype: int64
- **logistic_type_fulfillment** (Sparse[int64, 0]) 

 0    150890
1     49082
Name: logistic_type_fulfillment, dtype: int64
- **logistic_type_not_specified** (Sparse[int64, 0]) 

 0    193058
1      6914
Name: logistic_type_not_specified, dtype: int64
- **logistic_type_xd_drop_off** (Sparse[int64, 0]) 

 0    192486
1      7486
Name: logistic_type_xd_drop_off, dtype: int64
- **platform_/mobile/android** (Sparse[int64, 0]) 

 1    124781
0     75191
Name: platform_/mobile/android, dtype: int64
- **platform_/mobile/ios** (Sparse[int64, 0]) 

 0    185765
1     14207
Name: platform_/mobile/ios, dtype: int64
- **platform_/web/desktop** (Sparse[int64, 0]) 

 0    164080
1     35892
Name: platform_/web/desktop, dtype: int64
- **platform_/web/mobile** (Sparse[int64, 0]) 

 0    174880
1     25092
Name: platform_/web/mobile, dtype: int64
- **site_id_MLA** (Sparse[int64, 0]) 

 1    199972
Name: site_id_MLA, dtype: int64
- **price_diff** (int64) 

 100     2965
200     2952
300     2376
50      1952
400     1893
        ... 
3451       1
1494       1
3855       1
2525       1
2875       1
Name: price_diff, Length: 3350, dtype: int64
- **cheaper_than_original** (int64) 

 1    199972
Name: cheaper_than_original, dtype: int64

0

In [3]:
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy', 
    optimizer='adam', 
    metrics=[AUC()],
)

model.fit(
    X_train, y_train, 
    epochs=10, 
    batch_size=128,
    validation_split=0.2,
    verbose=1
)

2023-08-29 12:11:14.340392: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-08-29 12:11:14.341205: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-29 12:11:14.471233: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)


Epoch 1/10


InvalidArgumentError:  assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] [x (sequential/dense_2/Sigmoid:0) = ] [[1][nan][nan]...] [y (Cast_4/x:0) = ] [0]
	 [[{{node assert_greater_equal/Assert/AssertGuard/else/_1/assert_greater_equal/Assert/AssertGuard/Assert}}]] [Op:__inference_train_function_1007]

Function call stack:
train_function
