In [1]:
from datetime import datetime

import polars as pl

In [2]:
# Month-end dates in chronological order; the position of a date in this list
# is used as its month id throughout the notebook.
months = [
    "2022-02-28",
    "2022-03-31",
    "2022-04-30",
    "2022-05-31",
    "2022-06-30",
    "2022-07-31",
    "2022-08-31",
    "2022-09-30",
    "2022-10-31",
    "2022-11-30",
    "2022-12-31",
    "2023-01-31",
]
# Lookup tables in both directions: date string -> position, position -> date string.
month2id = {date_str: pos for pos, date_str in enumerate(months)}
id2month = dict(enumerate(months))

In [3]:
def make_features(data, mon):
    """Aggregate raw geo events into one feature row per ``client_id``.

    Only events with ``event_time`` strictly earlier than the first day of the
    month of ``months[mon]`` are taken into account.

    Parameters
    ----------
    data
        Polars frame providing the columns used below: ``client_id``,
        ``event_time``, ``geohash_4``, ``geohash_5`` and ``geohash_6``.
        ``event_time`` is presumably a datetime column (it is compared with and
        subtracted from ``datetime`` objects) -- confirm against the input files.
    mon
        Index into the module-level ``months`` list.

    Returns
    -------
    Frame with one row per ``client_id`` and the columns

    * ``geo_cnt`` -- number of events,
    * ``geo_uniq_days`` -- number of distinct event days,
    * ``geo_first_day`` / ``geo_last_day`` -- smallest / largest event day,
      counted in days since 2021-01-01,
    * ``geohash_4_uniq`` / ``geohash_5_uniq`` / ``geohash_6_uniq`` -- number of
      distinct values of the respective geohash column,
    * ``geo_len_period`` -- ``geo_last_day - geo_first_day + 1``,
    * ``geo_cnt_per_day`` -- ``geo_cnt / geo_uniq_days``,
    * ``geo_density`` -- ``geo_cnt / geo_len_period``.
    """
    # Reference date used to turn timestamps into integer day numbers.
    origin = datetime(2021, 1, 1)
    # First day of the month that months[mon] falls into; events from that
    # day onwards are excluded.
    cutoff = datetime.strptime(months[mon], "%Y-%m-%d").replace(day=1)

    events = data.filter(pl.col("event_time") < cutoff).with_columns(
        event_day=(pl.col("event_time") - origin).dt.days()
    )

    day = pl.col("event_day")
    per_client = events.group_by("client_id").agg(
        geo_cnt=pl.count(),
        geo_uniq_days=day.n_unique(),
        geo_first_day=day.min(),
        geo_last_day=day.max(),
        geohash_4_uniq=pl.col("geohash_4").n_unique(),
        geohash_5_uniq=pl.col("geohash_5").n_unique(),
        geohash_6_uniq=pl.col("geohash_6").n_unique(),
    )

    # Inclusive length of the span between the first and the last event day.
    with_period = per_client.with_columns(
        geo_len_period=pl.col("geo_last_day") - pl.col("geo_first_day") + 1,
    )

    # Two intensity ratios: events per active day and events per day of the span.
    features = with_period.with_columns(
        geo_cnt_per_day=pl.col("geo_cnt") / pl.col("geo_uniq_days"),
        geo_density=pl.col("geo_cnt") / pl.col("geo_len_period"),
    )
    return features

In [4]:
# Position in `months` that selects the cutoff month: months[9] is '2022-11-30',
# and make_features keeps only events before the first day of that month.
mon = 9

In [5]:
import gc
import os
from collections import Counter
from tqdm.auto import tqdm

In [6]:
# Tally, for every client_id, the number of train part files it occurs in.
counter = Counter()
for file in tqdm(os.listdir("./data/geo_train.parquet/")):
    train_geo = pl.read_parquet(os.path.join("./data/geo_train.parquet/", file))
    # unique() drops repeats inside one file, so a file adds at most 1 per client.
    counter.update(train_geo.get_column("client_id").unique())

  0%|          | 0/31 [00:00<?, ?it/s]

In [7]:
# Partition the clients by the number of files they were seen in:
#   one -> clients counted exactly once, two -> every other client.
one, two = [], []
for client_id, count in counter.most_common():
    (one if count == 1 else two).append(client_id)

In [8]:
# Number of clients found in exactly one file vs. number of all remaining clients.
len(one), len(two)

(622252, 52)

In [9]:
# Build the train features file by file. Clients listed in `one` are aggregated
# right away from the file they live in; rows of clients listed in `two` are
# collected in `buf` so that they can be aggregated together afterwards.
train_features = []
buf = []
# os.listdir returns entries in arbitrary order; iterating over the sorted
# listing makes the row order of the resulting feature table reproducible.
for file in tqdm(sorted(os.listdir("./data/geo_train.parquet/"))):
    train_geo = pl.read_parquet("./data/geo_train.parquet/" + file)
    buf.append(train_geo.filter(pl.col("client_id").is_in(two)))
    train_geo = train_geo.filter(pl.col("client_id").is_in(one))
    train_features.append(make_features(train_geo, mon))
    # Release the frames of the previous iteration before loading the next file.
    gc.collect()

  0%|          | 0/31 [00:00<?, ?it/s]

In [10]:
# Merge the buffered rows into a single frame and aggregate them in one pass;
# the resulting frame is added to the list of per-file feature frames.
buf = pl.concat(buf)
train_features.append(
    make_features(data=buf, mon=mon),
)

In [11]:
# Stack all collected feature frames into one table and display it.
# Note: this rebinds `train_features` from a list of frames to a single DataFrame.
train_features = pl.concat(train_features)
train_features

client_id,geo_cnt,geo_uniq_days,geo_first_day,geo_last_day,geohash_4_uniq,geohash_5_uniq,geohash_6_uniq,geo_len_period,geo_cnt_per_day,geo_density
str,u32,u32,i64,i64,u32,u32,u32,i64,f64,f64
"""7310575c8a1b87…",117,84,446,636,3,6,27,191,1.392857,0.612565
"""6d315b752ba0fa…",522,179,375,636,3,14,37,262,2.916201,1.992366
"""3b357ed8d1250d…",127,92,366,635,8,24,57,270,1.380435,0.47037
"""3097b3379dd086…",757,245,365,637,26,48,99,273,3.089796,2.772894
"""33e123bbeab26f…",1731,273,365,637,39,98,221,273,6.340659,6.340659
"""f51b4426914208…",2008,272,365,637,18,37,145,273,7.382353,7.355311
"""8cf60d788a84cd…",3560,270,365,636,9,51,218,272,13.185185,13.088235
"""4fe9a6fe59615a…",1170,259,366,637,38,76,215,272,4.517375,4.301471
"""c01304154308b4…",384,112,367,494,24,54,87,128,3.428571,3.0
"""e11760d865da7d…",255,115,377,620,6,14,39,244,2.217391,1.045082


In [12]:
# Sanity check: the most frequent ids are listed first, so a top count of 1
# means that no client_id occurs more than once in the feature table.
train_features["client_id"].value_counts(sort=True)

client_id,counts
str,u32
"""7310575c8a1b87…",1
"""6d315b752ba0fa…",1
"""3b357ed8d1250d…",1
"""3097b3379dd086…",1
"""33e123bbeab26f…",1
"""f51b4426914208…",1
"""8cf60d788a84cd…",1
"""4fe9a6fe59615a…",1
"""c01304154308b4…",1
"""e11760d865da7d…",1


In [13]:
# Tally, for every client_id, the number of test part files it occurs in.
counter = Counter()
for file in tqdm(os.listdir("./data/geo_test.parquet/")):
    val_geo = pl.read_parquet(os.path.join("./data/geo_test.parquet/", file))
    # unique() drops repeats inside one file, so a file adds at most 1 per client.
    counter.update(val_geo.get_column("client_id").unique())

  0%|          | 0/6 [00:00<?, ?it/s]

In [14]:
# Partition the clients by the number of files they were seen in:
#   one -> clients counted exactly once, two -> every other client.
one, two = [], []
for client_id, count in counter.most_common():
    (one if count == 1 else two).append(client_id)

In [15]:
# Number of clients found in exactly one file vs. number of all remaining clients.
len(one), len(two)

(165445, 1302)

In [16]:
# Build the validation features file by file. Clients listed in `one` are
# aggregated right away from the file they live in; rows of clients listed in
# `two` are collected in `buf` so that they can be aggregated together afterwards.
val_features = []
buf = []
# os.listdir returns entries in arbitrary order; iterating over the sorted
# listing makes the row order of the resulting feature table reproducible.
for file in tqdm(sorted(os.listdir("./data/geo_test.parquet/"))):
    val_geo = pl.read_parquet("./data/geo_test.parquet/" + file)
    buf.append(val_geo.filter(pl.col("client_id").is_in(two)))
    val_geo = val_geo.filter(pl.col("client_id").is_in(one))
    val_features.append(make_features(val_geo, mon))
    # Release the frames of the previous iteration before loading the next file.
    gc.collect()

  0%|          | 0/6 [00:00<?, ?it/s]

In [17]:
# Merge the buffered rows into a single frame and aggregate them in one pass;
# the resulting frame is added to the list of per-file feature frames.
buf = pl.concat(buf)
val_features.append(
    make_features(data=buf, mon=mon),
)

In [18]:
# Stack all collected feature frames into one table and display it.
# Note: this rebinds `val_features` from a list of frames to a single DataFrame.
val_features = pl.concat(val_features)
val_features

client_id,geo_cnt,geo_uniq_days,geo_first_day,geo_last_day,geohash_4_uniq,geohash_5_uniq,geohash_6_uniq,geo_len_period,geo_cnt_per_day,geo_density
str,u32,u32,i64,i64,u32,u32,u32,i64,f64,f64
"""3dd04e36e7a7ea…",13,13,536,631,1,5,7,96,1.0,0.135417
"""5d5f129d205e90…",132,75,367,487,8,34,67,121,1.76,1.090909
"""0ba80f432aa468…",78,52,554,636,1,5,13,83,1.5,0.939759
"""1fc5b189247f8f…",84,67,368,632,9,13,27,265,1.253731,0.316981
"""ca69e3d03950df…",299,183,365,636,16,62,141,272,1.63388,1.099265
"""14947686f94c15…",110,91,380,630,7,23,54,251,1.208791,0.438247
"""820a14641c0c2c…",698,151,367,588,8,17,55,222,4.622517,3.144144
"""d5f97c94bd72b4…",2821,273,365,637,35,90,285,273,10.333333,10.333333
"""19e69ad40085a0…",41,40,380,627,3,5,12,248,1.025,0.165323
"""3abb7ce634a76a…",368,179,365,630,9,18,32,266,2.055866,1.383459


In [19]:
# Sanity check: the most frequent ids are listed first, so a top count of 1
# means that no client_id occurs more than once in the feature table.
val_features["client_id"].value_counts(sort=True)

client_id,counts
str,u32
"""3dd04e36e7a7ea…",1
"""5d5f129d205e90…",1
"""0ba80f432aa468…",1
"""1fc5b189247f8f…",1
"""ca69e3d03950df…",1
"""14947686f94c15…",1
"""820a14641c0c2c…",1
"""d5f97c94bd72b4…",1
"""19e69ad40085a0…",1
"""3abb7ce634a76a…",1


In [20]:
# Persist both feature tables; the month index `mon` is part of the file name so
# that runs for different cutoff months do not overwrite each other.
# Make sure the output directory exists first, so a fresh checkout without
# ./features does not fail at the very last step of the notebook.
os.makedirs("./features", exist_ok=True)
train_features.write_parquet(f"./features/train_geo_features_{mon}.pq")
val_features.write_parquet(f"./features/val_geo_features_{mon}.pq")