## Colab install 

In [None]:
# !git clone https://github.com/dllllb/pytorch-lifestream

In [2]:
!cd pytorch-lifestream

In [None]:
# !python setup.py install

In [1]:
# !pip install pyspark

Defaulting to user installation because normal site-packages is not writeable
Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:05[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 KB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488513 sha256=260d0e71d56435e6fb3f18af17e1eb719513de0315fa5aa609d62d6919e65ea5
  Stored in directory: /home/user/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected p

## Processing data

In [1]:
DATA_PATH = 'Downloads/'

In [2]:
import pandas as pd

In [6]:
target_train = pd.read_parquet(f'{DATA_PATH}/train_target.parquet')
trx_train = pd.read_parquet(f'{DATA_PATH}/trx_train.parquet')
target_train.mon = pd.to_datetime(target_train.mon)

In [3]:
hashes = ['geohash_4', 'geohash_5', 'geohash_6']
trxs = ['amount', 'client_id', 'event_type', 'event_subtype', \
       'currency', 'src_type11', 'src_type12', 'dst_type11', 'dst_type12', \
       'src_type21', 'src_type22', 'src_type31', 'src_type32']

In [98]:
geo_train_path = 'Downloads/geo_train.parquet'
geo_train = pd.read_parquet(geo_train_path)

In [109]:
def amount_agregate(tg, trx_timeseries, geo_timeseries):

    feats = []
    for mon in tg.mon.unique():
        trx = trx_timeseries[trx_timeseries.event_time < mon]
        geo = geo_timeseries[geo_timeseries.event_time < mon]
        feats_mon = trx.groupby('client_id')[trxs].mean().reset_index()
        feats_mon2 = geo.groupby('client_id')[hashes].mean().reset_index()
        feats_mon['mon'] = mon
        for hash in hashes:
            feats_mon[hash] = feats_mon2[hash]
        
        feats.append(feats_mon)
        print(mon)

    feats = pd.concat(feats, axis=0)
    return feats

feats = amount_agregate(target_train, trx_train, geo_train)

2022-02-28T00:00:00.000000000
2022-03-31T00:00:00.000000000
2022-04-30T00:00:00.000000000
2022-05-31T00:00:00.000000000
2022-06-30T00:00:00.000000000
2022-07-31T00:00:00.000000000
2022-08-31T00:00:00.000000000
2022-09-30T00:00:00.000000000
2022-10-31T00:00:00.000000000
2022-11-30T00:00:00.000000000
2022-12-31T00:00:00.000000000
2023-01-31T00:00:00.000000000


In [114]:
feats = feats.fillna(0)

In [166]:
# Делим на тренировочную и валидационную подвыборки

df_train = target_train.sample(frac=1, random_state=2024).set_index(['mon', 'client_id'])\
.join(feats.set_index(['mon', 'client_id']), how='left')

ids = list(df_train.index.get_level_values('client_id'))[:10000]
df_val = df_train[df_train.index.get_level_values('client_id').isin(ids)]
df_fit = df_train[~df_train.index.get_level_values('client_id').isin(ids)]

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
from lightgbm import LGBMClassifier, Dataset
import lightgbm
import datetime
import pickle

In [170]:
# Для каждого таргета фитим модель
RS = 2024

models = {}

params = {'max_depth': 1, 'objective': 'binary',
          'learning_rate': 0.003, 'verbose': -1, 'random_state': RS}

for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_fit[target_col]
    X = df_fit.drop(columns=[f'target_{i}' for i in range(1, 5)])
    trds = Dataset(X, tg)

    tgv = df_val[target_col]
    Xv = df_val.drop(columns=[f'target_{i}' for i in range(1, 5)])
    vds = Dataset(Xv, tgv)

    lgbm = lightgbm.train(params=params, num_boost_round=1000, train_set=trds, valid_sets=[vds],
                          callbacks=[lightgbm.early_stopping(stopping_rounds=5)])

    models[target_col] = lgbm
    print(f'{target_col} score train: {2 * roc_auc_score(tg, lgbm.predict(X)) - 1}')

print('======================================')
metrics = {}
for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_val[target_col]
    X = df_val.drop(columns=[f'target_{i}' for i in range(1, 5)])
    metrics[target_col] = 2 * roc_auc_score(tg, models[target_col].predict(X)) - 1
    print(f'{target_col} score test: {metrics[target_col]}')

print('======================================')
print(f'avg gini: {sum(metrics.values())/4}')

Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.0440796
target_1 score train: 0.45877359163432163
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.00621577
target_2 score train: 0.7037468845913044
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.0407502
target_3 score train: 0.5929890895066907
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.0255895
target_4 score train: 0.5833040624408463
target_1 score test: 0.47176879992451193
target_2 score test: 0.6721281113551294
target_3 score test: 0.5831778806155414
target_4 score test: 0.5849523649572483
avg gini: 0.5780067892131078


In [171]:
# Сохраняем обученные модели

with open(f'{DATA_PATH}/models.pkl', 'wb') as fl:
    pickle.dump(models, fl)

In [5]:
# Загружаем тестовые данные

models = pd.read_pickle(f'{DATA_PATH}/models.pkl')
target_test = pd.read_parquet(f'{DATA_PATH}/test_target.parquet')
trx_test = pd.read_parquet(f'{DATA_PATH}/trx_test.parquet')
geo_test = pd.read_parquet(f'{DATA_PATH}/geo_test.parquet')

In [6]:
feats_test = trx_test.groupby('client_id')[trxs].mean()
feats_test2 = geo_test.groupby('client_id')[hashes].mean()

In [20]:
mid = feats_test2.join(feats_test, how='left')

In [21]:
# Т.к. необходимо предсказать таргет на следующий отчетный период -- считаем агрегаты по всей выборке

df_test = target_test[['client_id']].drop_duplicates().set_index('client_id').join(mid, how='left')

In [27]:
df_test = df_test.fillna(0)

In [28]:
# Делаем предсказания для всех таргетов

scores = []

for t in range(1, 5):
    target_col = f'target_{t}'
    score = pd.Series(models[target_col].predict(df_test))
    scores.append(score)

scores = pd.concat(scores, axis=1)
scores['client_id'] = df_test.reset_index().client_id
scores_test = scores.rename(columns={0: 'target_1', 1: 'target_2', 2: 'target_3', 3: 'target_4'})

In [29]:
scores_test.head(3)

Unnamed: 0,target_1,target_2,target_3,target_4,client_id
0,0.023299,0.000576,0.006037,0.010336,2b7ff0c1c99cefe259ed83c5dfa0a403f2cbc88032b671...
1,0.006107,0.002102,0.002717,0.004651,0433d23e224b7a520656da6181efadb8d556bb293158c9...
2,0.005793,0.000544,0.002253,0.008401,f2ce8b292e5f9f778f3e20db7608ac76dc8812113a2631...


In [30]:
scores_test.shape

(140488, 5)

In [31]:
target_ids_submit = pd.read_parquet(f'{DATA_PATH}/test_target_ids.parquet').set_index('client_id')
submission = target_ids_submit.join(scores_test.set_index('client_id'), how='inner')

In [32]:
submission.shape

(140488, 4)

In [33]:
# Сохраняем для сабмита

submission.to_csv(f'{DATA_PATH}/sample_submission.csv', index=False)