In [1]:
from datetime import datetime
from pathlib import Path

import polars as pl

In [2]:
# Month-end dates (ISO "YYYY-MM-DD" strings) in chronological order.
# A month is referred to elsewhere by its position in this list.
months = [
    "2022-02-28",
    "2022-03-31",
    "2022-04-30",
    "2022-05-31",
    "2022-06-30",
    "2022-07-31",
    "2022-08-31",
    "2022-09-30",
    "2022-10-31",
    "2022-11-30",
    "2022-12-31",
    "2023-01-31",
]

# Lookup tables in both directions: date string -> index and index -> date string.
month2id = {month: idx for idx, month in enumerate(months)}
id2month = dict(enumerate(months))

In [3]:
# Read everything matched by the glob under ./dial_train.parquet/ into a single frame,
# then show its (rows, columns) shape and the first rows as a sanity check.
train_dial = pl.read_parquet("./dial_train.parquet/*")
print(train_dial.shape)
train_dial.head()

(1215209, 3)


client_id,event_time,embedding
str,datetime[μs],list[f32]
"""a039ad3b595d4f…",2022-05-02 12:24:30.565231,"[-0.003248, 0.140231, … -0.010614]"
"""a039ad3b595d4f…",2022-10-28 09:05:18.394466,"[0.058927, -0.007723, … 0.086532]"
"""a039ad3b595d4f…",2022-12-15 08:36:26.987451,"[0.079807, -0.003912, … 0.095575]"
"""a060e69e9e049a…",2022-09-13 13:37:03.710475,"[0.523752, -0.30542, … 0.568542]"
"""a08c690dd972d2…",2022-05-02 09:12:22.795170,"[-0.009235, -0.069714, … -0.062696]"


In [4]:
# Read everything matched by the glob under ./dial_test.parquet/; this frame is used
# as the validation set in the cells that follow.
# NOTE(review): the schema displayed for this frame shows event_time as datetime[ns],
# while the train frame shows datetime[μs] — confirm the differing time unit is expected.
val_dial = pl.read_parquet("./dial_test.parquet/*")
print(val_dial.shape)
val_dial.head()

(286526, 3)


client_id,event_time,embedding
str,datetime[ns],list[f32]
"""08b3569cdfd015…",2022-02-26 10:53:11.777539,"[0.110589, -0.000545, … 0.134537]"
"""08b3569cdfd015…",2022-05-01 09:37:13.140523,"[0.00209, 0.072185, … 0.015157]"
"""08b3569cdfd015…",2022-05-09 08:48:24.248675,"[0.194512, -0.032053, … 0.160809]"
"""08b3569cdfd015…",2022-03-21 07:02:23.951399,"[0.045035, 0.042004, … -0.112547]"
"""08b3569cdfd015…",2022-03-19 08:55:18.098455,"[0.330715, -0.023786, … 0.342075]"


In [5]:
def make_features(data, mon):
    """Build per-client dialogue activity features up to a cutoff month.

    Only rows whose ``event_time`` is strictly earlier than the first day of
    the month ``months[mon]`` are used. Each remaining event is converted to a
    day number counted from 2021-01-01, and the rows are aggregated per
    ``client_id``.

    Parameters
    ----------
    data
        polars frame with at least ``client_id`` and ``event_time`` columns.
    mon : int
        Index into the module-level ``months`` list selecting the cutoff month.

    Returns
    -------
    polars frame with one row per ``client_id`` and the columns
    ``dial_cnt`` (number of events), ``dial_uniq_days`` (distinct event days),
    ``dial_first_day`` / ``dial_last_day`` (min / max day number),
    ``dial_len_period`` (last - first + 1), ``dial_cnt_per_day``
    (events per distinct day) and ``dial_density`` (events per day of the
    first-to-last span).

    NOTE(review): newer polars releases deprecate ``pl.count()`` and
    ``.dt.days()`` in favour of ``pl.len()`` and ``.dt.total_days()`` —
    confirm against the installed polars version before switching.
    """
    origin = datetime(2021, 1, 1)
    # First day of the selected month: events on or after it are excluded.
    cutoff = datetime.strptime(months[mon], "%Y-%m-%d").replace(day=1)

    history = data.filter(pl.col("event_time") < cutoff).with_columns(
        event_day=(pl.col("event_time") - origin).dt.days()
    )

    day = pl.col("event_day")
    per_client = history.group_by("client_id").agg(
        dial_cnt=pl.count(),
        dial_uniq_days=day.n_unique(),
        dial_first_day=day.min(),
        dial_last_day=day.max(),
    )

    # Inclusive span, in days, between a client's first and last event.
    with_span = per_client.with_columns(
        dial_len_period=pl.col("dial_last_day") - pl.col("dial_first_day") + 1,
    )

    return with_span.with_columns(
        dial_cnt_per_day=pl.col("dial_cnt") / pl.col("dial_uniq_days"),
        dial_density=pl.col("dial_cnt") / pl.col("dial_len_period"),
    )

In [6]:
# Index into `months` selecting the cutoff month: months[9] is '2022-11-30',
# so make_features keeps only events before 2022-11-01. The same index is
# embedded in the output file names written at the end of the notebook.
mon = 9

In [7]:
# Per-client features for the train set, using events before the month selected by `mon`.
train_features = make_features(train_dial, mon)
train_features

client_id,dial_cnt,dial_uniq_days,dial_first_day,dial_last_day,dial_len_period,dial_cnt_per_day,dial_density
str,u32,u32,i64,i64,i64,f64,f64
"""8f9884eb7c674b…",1,1,575,575,1,1.0,1.0
"""733793da1060ea…",2,2,560,601,42,1.0,0.047619
"""e8c9e71ddb15cc…",1,1,508,508,1,1.0,1.0
"""542594d1a4c073…",2,2,375,378,4,1.0,0.5
"""1739c16f648a22…",1,1,539,539,1,1.0,1.0
"""582548eedf972e…",1,1,520,520,1,1.0,1.0
"""e747989d830629…",2,2,482,615,134,1.0,0.014925
"""01d6c39ef9db54…",1,1,381,381,1,1.0,1.0
"""ab396dacb3bcb7…",11,11,588,624,37,1.0,0.297297
"""fdcf9bfb346fe4…",1,1,553,553,1,1.0,1.0


In [8]:
# Per-client features for the validation set, built with the same function and cutoff as train.
val_features = make_features(val_dial, mon)
val_features

client_id,dial_cnt,dial_uniq_days,dial_first_day,dial_last_day,dial_len_period,dial_cnt_per_day,dial_density
str,u32,u32,i64,i64,i64,f64,f64
"""95d58177b3898f…",1,1,619,619,1,1.0,1.0
"""bc7045aee5f443…",2,2,369,495,127,1.0,0.015748
"""810f011714081d…",1,1,418,418,1,1.0,1.0
"""d983062355163c…",2,2,521,548,28,1.0,0.071429
"""7ea7c3186613a7…",3,3,545,635,91,1.0,0.032967
"""7cd728b5c5be88…",2,2,457,469,13,1.0,0.153846
"""5dd4a0a01b1572…",9,9,365,572,208,1.0,0.043269
"""d18ff1ed03664b…",1,1,534,534,1,1.0,1.0
"""60c8036f581cd4…",3,2,618,635,18,1.5,0.166667
"""ce207ecae6c1f1…",1,1,577,577,1,1.0,1.0


In [9]:
# Persist both feature tables; the month index `mon` is embedded in the file names.
# Create the output directory first so that a fresh checkout (or Restart & Run All
# on a clean machine) does not fail because ./features does not exist yet.
Path("./features").mkdir(parents=True, exist_ok=True)
train_features.write_parquet(f"./features/train_dial_features_{mon}.pq")
val_features.write_parquet(f"./features/val_dial_features_{mon}.pq")