In [4]:
import json
from pprint import pprint

def loader(path):
    """Lazily yield one decoded JSON object per line of a JSON-lines file.

    Parameters
    ----------
    path : str
        Path of a text file holding one JSON document per line.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(path, "r") as f:
        # Iterate the file object itself: it reads line by line, whereas
        # readlines() would load the whole file into memory up front.
        for line in f:
            # Blank lines (e.g. a trailing empty line) carry no record.
            if not line.strip():
                continue
            yield json.loads(line)

In [22]:
from datetime import datetime

# Scratch check: parse one sample timestamp with the "%Y-%m-%d %H:%M:%S" format string.
datetime.strptime("2018-12-03 11:57:24", "%Y-%m-%d %H:%M:%S")

datetime.datetime(2018, 12, 3, 11, 57, 24)

In [47]:
import pandas as pd

def dates(history, normalize_months=False):
    """Summarise when transactions happened, per weekday and per calendar month.

    Parameters
    ----------
    history : iterable of dict
        Transaction records; each needs a "datetime" string formatted as
        "%Y-%m-%d %H:%M:%S".
    normalize_months : bool, default False
        With the default, the ``..._month`` values are raw transaction counts
        even though the key starts with "percent_of". Pass True to get the
        share of all transactions instead, like the weekday features.

    Returns
    -------
    dict
        Always 19 entries, in this order:
        ``percent_of_transactions_in_{0..6}_day`` (share of transactions per
        weekday, 0 = Monday in pandas' numbering) and
        ``percent_of_transactions_in_{1..12}_month``.
        Weekdays or months without transactions are 0, and so is every entry
        when ``history`` is empty.
    """
    stamps = pd.Series(
        [datetime.strptime(t["datetime"], "%Y-%m-%d %H:%M:%S") for t in history],
        # Explicit dtype keeps the .dt accessor usable for an empty history.
        dtype="datetime64[ns]",
    )
    total = len(stamps)

    # Reindex so every weekday / month gets a key, even with no transactions;
    # otherwise the feature set would differ from client to client.
    per_day = stamps.dt.dayofweek.value_counts().reindex(range(7), fill_value=0)
    per_month = stamps.dt.month.value_counts().reindex(range(1, 13), fill_value=0)

    def share(count):
        # Guard the empty-history case instead of dividing by zero.
        return count / total if total else 0.0

    day_features = {
        f"percent_of_transactions_in_{day}_day": share(int(count))
        for day, count in per_day.items()
    }
    month_features = {
        f"percent_of_transactions_in_{month}_month": (
            share(int(count)) if normalize_months else int(count)
        )
        for month, count in per_month.items()
    }

    return {**day_features, **month_features}

# NOTE(review): `line` is not assigned by any cell above this one in the visible
# notebook, so a fresh-kernel run would presumably fail here — confirm where it comes from.
dates(line['transaction_history'])

{'percent_of_transactions_in_0_day': 0.19148936170212766,
 'percent_of_transactions_in_1_day': 0.1702127659574468,
 'percent_of_transactions_in_2_day': 0.10638297872340426,
 'percent_of_transactions_in_3_day': 0.23404255319148937,
 'percent_of_transactions_in_4_day': 0.14893617021276595,
 'percent_of_transactions_in_5_day': 0.0851063829787234,
 'percent_of_transactions_in_6_day': 0.06382978723404255,
 'percent_of_transactions_in_1_month': 18,
 'percent_of_transactions_in_2_month': 12,
 'percent_of_transactions_in_3_month': 0,
 'percent_of_transactions_in_4_month': 0,
 'percent_of_transactions_in_5_month': 0,
 'percent_of_transactions_in_6_month': 0,
 'percent_of_transactions_in_7_month': 0,
 'percent_of_transactions_in_8_month': 0,
 'percent_of_transactions_in_9_month': 0,
 'percent_of_transactions_in_10_month': 0,
 'percent_of_transactions_in_11_month': 1,
 'percent_of_transactions_in_12_month': 16}

In [48]:
from datetime import datetime
from scipy import stats
import numpy as np

def transaction_features(transaction_history, fields=("epr", "eps", "rpr", "rps", "sum")):
    """Compute distribution statistics of numeric fields over a transaction history.

    Parameters
    ----------
    transaction_history : sequence of dict
        Transaction records; each must contain every name listed in ``fields``.
        It is iterated once per field, so it must be re-iterable (e.g. a list).
    fields : iterable of str, optional
        Record keys to summarise. Defaults to the five fields
        "epr", "eps", "rpr", "rps" and "sum".

    Returns
    -------
    dict
        For every field, in order, the keys ``{field}_mean``, ``{field}_std``,
        ``{field}_median`` and ``{field}_skew`` holding the results of
        ``np.mean``, ``np.std``, ``np.median`` and ``scipy.stats.skew``.

    Raises
    ------
    KeyError
        If a record lacks one of the requested fields.
    """
    features = dict()

    def distr_field(name):
        # Look the column up through the `name` argument; reading the enclosing
        # loop variable instead would only work by coincidence.
        values = [record[name] for record in transaction_history]
        return {
            f'{name}_mean': np.mean(values),
            f'{name}_std': np.std(values),
            f'{name}_median': np.median(values),
            f'{name}_skew': stats.skew(values),
        }

    for field in fields:
        features.update(distr_field(field))

    return features

# NOTE(review): relies on a `line` record left in the kernel by some other cell;
# no cell above this one in the visible notebook assigns it — confirm before a fresh run.
transaction_features(line['transaction_history'])

{'epr_mean': 0.0,
 'epr_std': 0.0,
 'epr_median': 0.0,
 'epr_skew': 0.0,
 'eps_mean': 0.0,
 'eps_std': 0.0,
 'eps_median': 0.0,
 'eps_skew': 0.0,
 'rpr_mean': 1.9638297872340424,
 'rpr_std': 2.939635636121469,
 'rpr_median': 1.0,
 'rpr_skew': 2.6422171012593525,
 'rps_mean': 0.0,
 'rps_std': 0.0,
 'rps_median': 0.0,
 'rps_skew': 0.0,
 'sum_mean': 299.84574468085106,
 'sum_std': 311.68728662831603,
 'sum_median': 210.78,
 'sum_skew': 2.4148899974034608}

In [57]:
from tqdm import trange

data_path = "../../data/preprocessed/x5-uplift"

# For shard numbers 0..7: read the input shard record by record, swap the raw
# history and target for the flat feature dict, and write one JSON object per line.
for i in trange(8):
    features_shard_path = f"../../data/preprocessed/x5-uplift-features/0{i}.jsons.splitted"
    history_shard_path = f"../../data/preprocessed/x5-retail-hero/tmp/jsons/0{i}.jsons.splitted"
    with open(features_shard_path, "w") as f:
        for line in loader(history_shard_path):
            features = transaction_features(line['transaction_history'])
            # Remove the nested fields only after the features were computed.
            for dropped_key in ('transaction_history', 'target'):
                del line[dropped_key]
            line.update(features)
            f.write(json.dumps(line) + "\n")

100%|██████████| 8/8 [03:53<00:00, 29.24s/it]


In [55]:
# Display the current `line` record.
# NOTE(review): the saved output still contains a 'target' key although the export
# loop deletes it, so this output looks stale — re-run to confirm.
line

{'age': 51,
 'gender': 'U',
 'client_id': '3831e8106a',
 'target': [{'tid': '683315e17d',
   'datetime': '2019-03-10 08:24:53',
   'product_ids': ['4a29330c8d',
    '4bd01c5234',
    '66eeca18ac',
    '86ffd3a584',
    '343e841aaa',
    'b385cab735',
    '97cb83ba9d',
    'f95785964a',
    '576083639f',
    '56d4b9e8ed'],
   'store_id': '59f9933d4a'}],
 'epr_mean': 0.0,
 'epr_std': 0.0,
 'epr_median': 0.0,
 'epr_skew': 0.0,
 'eps_mean': 0.0,
 'eps_std': 0.0,
 'eps_median': 0.0,
 'eps_skew': 0.0,
 'rpr_mean': 4.299999999999999,
 'rpr_std': 3.2229943077350147,
 'rpr_median': 3.0,
 'rpr_skew': 0.5088169394498658,
 'rps_mean': -2.8461538461538463,
 'rps_std': 9.028554308382283,
 'rps_median': 0.0,
 'rps_skew': -3.133856163357532,
 'sum_mean': 577.4630769230769,
 'sum_std': 308.40349305575984,
 'sum_median': 546.42,
 'sum_skew': 1.5885033241394728}