In [40]:
import pandas as pd

In [41]:
df_clients = pd.read_csv('../../data/datasets/x5-retail-hero/clients.csv.gz', index_col='client_id')
df_train = pd.read_csv('../../data/datasets/x5-retail-hero/uplift_train.csv.gz', index_col='client_id')
df_test = pd.read_csv('../../data/datasets/x5-retail-hero/uplift_test.csv.gz', index_col='client_id')

# Client only features

### Normalize dates by 1 year

In [42]:
df_features = df_clients.copy()

df_features['first_issue_time'] = \
    (pd.to_datetime(df_features['first_issue_date'])
     - pd.to_datetime(df_features['first_issue_date']).min()) / pd.Timedelta('365d')

df_features['first_redeem_time'] = \
    (pd.to_datetime(df_features['first_redeem_date'])
     - pd.to_datetime(df_features['first_redeem_date']).min()) / pd.Timedelta('365d')

### redeem - issue dates

In [43]:
df_features['issue_redeem_delay'] = df_features['first_redeem_time'] \
    - df_features['first_issue_time']

### Make gender OHE

In [44]:
df_features = df_features.join(pd.get_dummies(df_features['gender']))

### Fill NA with means

In [45]:
df_features['first_redeem_time'] = df_features['first_redeem_time'].fillna(df_features['first_redeem_time'].mean())
df_features['issue_redeem_delay'] = df_features['issue_redeem_delay'].fillna(df_features['issue_redeem_delay'].mean())

### Drop redundant features

In [46]:
df_features = df_features.drop(['first_issue_date', 'first_redeem_date', 'gender'], axis=1)

In [47]:
df_features

Unnamed: 0_level_0,age,first_issue_time,first_redeem_time,issue_redeem_delay,F,M,U
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000012768d,45,0.336675,0.735365,0.398690,0,0,1
000036f903,72,0.015925,0.033211,0.017286,1,0,0
000048b7a6,68,1.698076,1.208236,0.476130,1,0,0
000073194a,60,0.133622,0.622100,0.488477,1,0,0
00007c7133,67,0.131265,1.724154,1.592889,0,0,1
...,...,...,...,...,...,...,...
fffece623e,67,1.106479,1.713402,0.606924,0,0,1
ffff3dfff8,56,1.577821,1.658431,0.080611,1,0,0
ffffaab9da,23,0.391817,0.661208,0.269391,1,0,0
ffffeb5619,62,0.670666,1.208236,0.476130,0,0,1


In [48]:
df_train.shape

(200039, 2)

In [49]:
df_test.shape

(200123, 0)

In [50]:
df_features.to_csv("../../data/preprocessed/client_features.csv")

# Read jsonl

In [51]:
import json
from pprint import pprint

def loader(path):
    with open(path, "r") as f:
        for line in f.readlines():
            yield json.loads(line)
            
for line in loader("../../data/preprocessed/x5-retail-hero/tmp/jsons/00.jsons.splitted"):
    pprint(line)
    break

{'age': 34,
 'client_id': '0000bcec9c',
 'gender': 'U',
 'target': [{'datetime': '2019-03-03 10:54:09',
             'product_ids': ['233eead1ad'],
             'store_id': '1a346a89e1',
             'tid': '8b5360c3ba'},
            {'datetime': '2019-03-03 15:13:17',
             'product_ids': ['5b379f22d4'],
             'store_id': '1a346a89e1',
             'tid': 'c57538fd1a'},
            {'datetime': '2019-03-04 08:57:46',
             'product_ids': ['baba8b097d'],
             'store_id': 'b1b62ce494',
             'tid': '2413ccdd82'},
            {'datetime': '2019-03-05 01:08:23',
             'product_ids': ['26af08d169',
                             'ea27d5dc75',
                             '8fec8d8366',
                             '3c2684d2e5',
                             '75c1a51a7e'],
             'store_id': '1a346a89e1',
             'tid': 'f8009d804b'},
            {'datetime': '2019-03-06 13:39:58',
             'product_ids': ['ad865591c6'],
             'st

In [52]:
df_features = pd.read_csv("../../data/preprocessed/client_features.csv")
client_features_dict = dict([(i['client_id'], dict([(k, v) for k, v in i.items() if not k in ("client_id", "target")])) for i in df_features.to_dict("records")])

In [53]:
from tqdm import tqdm, trange

for i in trange(8):
    l = []
    with open(f"../../data/preprocessed/x5-uplift/0{i}.jsons.splitted", "w") as f:        
        for line in loader(f"../../data/preprocessed/x5-retail-hero/tmp/jsons/0{i}.jsons.splitted"):
            line.update(client_features_dict[line['client_id']])
            del line['target']
            f.write(json.dumps(line)+"\n")

100%|██████████| 8/8 [00:45<00:00,  5.68s/it]


In [54]:
line

{'age': 51,
 'gender': 'U',
 'client_id': '3831e8106a',
 'transaction_history': [{'tid': '813440d833',
   'datetime': '2018-11-22 18:10:30',
   'products': [{'product_id': '59604886ae',
     'quantity': 3.0,
     's': 90.0,
     'r': '0'},
    {'product_id': '5ceb8bb1de', 'quantity': 1.0, 's': 43.0, 'r': '0'},
    {'product_id': '7c7b3f0fd0', 'quantity': 1.0, 's': 50.0, 'r': '0'},
    {'product_id': '94bfe20a81', 'quantity': 2.0, 's': 67.0, 'r': '0'},
    {'product_id': 'a0a65ec93d', 'quantity': 3.0, 's': 162.0, 'r': '0'},
    {'product_id': '3342a7920a', 'quantity': 1.0, 's': 120.0, 'r': '0'},
    {'product_id': '320e4e901b', 'quantity': 1.0, 's': 100.0, 'r': '0'},
    {'product_id': '46dca4c97a', 'quantity': 1.0, 's': 47.0, 'r': '0'},
    {'product_id': '4009f09b04', 'quantity': 1.0, 's': 5.0, 'r': '0'},
    {'product_id': '52193a3c58', 'quantity': 1.0, 's': 130.0, 'r': '0'},
    {'product_id': '15ccaa8685', 'quantity': 1.0, 's': 30.0, 'r': '0'}],
   'rpr': 9.2,
   'epr': 0.0,
   'rp