In [1]:
import gc

import polars as pl

from catboost import CatBoostClassifier, Pool

In [2]:
# Fix the seed of polars' global random state (the same value, 56, is passed to CatBoost below).
pl.set_random_seed(56)

In [3]:
# Month-end dates covered by the data, in chronological order.
months = [
    "2022-02-28", "2022-03-31", "2022-04-30", "2022-05-31",
    "2022-06-30", "2022-07-31", "2022-08-31", "2022-09-30",
    "2022-10-31", "2022-11-30", "2022-12-31", "2023-01-31",
]
# Date string -> zero-based position in `months`; used as the integer month id.
month2id = {month: position for position, month in enumerate(months)}

In [4]:
# Read all parquet parts matched by the glob under the train-target directory into one frame.
train_target = pl.read_parquet("./data/train_target.parquet/*")
train_target.head()

mon,target_1,target_2,target_3,target_4,client_id
str,i32,i32,i32,i32,str
"""2022-06-30""",0,0,0,0,"""1d4ebf30ab5b98…"
"""2022-07-31""",0,0,0,0,"""1d4ebf30ab5b98…"
"""2022-08-31""",0,0,0,0,"""1d4ebf30ab5b98…"
"""2022-09-30""",0,0,0,0,"""1d4ebf30ab5b98…"
"""2022-10-31""",0,0,0,0,"""1d4ebf30ab5b98…"


In [5]:
# `mon` still holds "YYYY-MM-DD" strings at this point, so this string sort is chronological.
train_target = train_target.sort("mon")

In [6]:
# Replace each month-end date string in `mon` by its integer id from `month2id`.
# The cell rebinds `train_target`, so it is guarded on the column dtype: once
# `mon` is no longer a string column, re-running the cell is a no-op instead of
# pushing the already-encoded ids through the string-keyed mapping again.
# NOTE(review): `map_dict` was deprecated in later polars releases in favour of
# `replace` - confirm against the installed version before upgrading.
if train_target.schema["mon"] == pl.Utf8:
    train_target = train_target.with_columns(
        pl.col("mon").map_dict(month2id).cast(pl.UInt8)
    )

In [7]:
# Read all parquet parts of the test targets and drop exact duplicate rows.
# Row order after .unique() is not relied upon: the frame is re-sorted in the next cell.
test_target = pl.read_parquet("./data/test_target_b.parquet/*").unique()
test_target.head()

mon,target_1,target_2,target_3,target_4,client_id
str,i32,i32,i32,i32,str
"""2022-08-31""",0,0,0,0,"""148f4f9983995e…"
"""2022-04-30""",0,0,0,0,"""55fcdc186f5a90…"
"""2022-05-31""",0,0,0,0,"""8bcbc3e9372738…"
"""2022-09-30""",0,0,0,0,"""64444e3ca539be…"
"""2022-02-28""",0,0,0,0,"""f92efc449322b7…"


In [8]:
# `mon` still holds "YYYY-MM-DD" strings at this point, so this string sort is chronological.
test_target = test_target.sort("mon")

In [9]:
# Replace each month-end date string in `mon` by its integer id from `month2id`.
# The cell rebinds `test_target`, so it is guarded on the column dtype: once
# `mon` is no longer a string column, re-running the cell is a no-op instead of
# pushing the already-encoded ids through the string-keyed mapping again.
# NOTE(review): `map_dict` was deprecated in later polars releases in favour of
# `replace` - confirm against the installed version before upgrading.
if test_target.schema["mon"] == pl.Utf8:
    test_target = test_target.with_columns(
        pl.col("mon").map_dict(month2id).cast(pl.UInt8)
    )

In [10]:
def _read_train_features(month):
    """Read the (dial, geo, trx) training feature frames stored for `month`.

    The trx file name carries an extra ``_cat`` suffix.
    """
    return tuple(
        pl.read_parquet(f"./features/train_{stem}.pq")
        for stem in (
            f"dial_features_{month}",
            f"geo_features_{month}",
            f"trx_features_{month}_cat",
        )
    )


train_dial_features_11, train_geo_features_11, train_trx_features_11 = _read_train_features(11)
train_dial_features_10, train_geo_features_10, train_trx_features_10 = _read_train_features(10)
train_dial_features_9, train_geo_features_9, train_trx_features_9 = _read_train_features(9)

In [11]:
def _read_val_features(month):
    """Read the (dial, geo, trx) validation feature frames stored for `month`.

    The trx file name carries an extra ``_cat`` suffix.
    """
    return tuple(
        pl.read_parquet(f"./features/val_{stem}.pq")
        for stem in (
            f"dial_features_{month}",
            f"geo_features_{month}",
            f"trx_features_{month}_cat",
        )
    )


val_dial_features_11, val_geo_features_11, val_trx_features_11 = _read_val_features(11)
val_dial_features_10, val_geo_features_10, val_trx_features_10 = _read_val_features(10)
val_dial_features_9, val_geo_features_9, val_trx_features_9 = _read_val_features(9)

In [12]:
def make_df(data, target_month, dial_features, geo_features, trx_features):
    """Build the labelled modelling table for one target month.

    Rows of ``data`` with ``mon == target_month`` supply the labels. Rows with
    ``mon < target_month`` form the history from which per-client features are
    derived:

    * ``target_{i}_cnt``  - sum of ``target_{i}`` over the history;
    * ``target_{i}_last`` - ``target_month`` minus the latest history month in
      which ``target_{i}`` equalled 1.

    The dial / geo / trx feature frames are left-joined onto these per-client
    features by ``client_id``, and nulls in the resulting feature frame are
    replaced with -1.

    Args:
        data: polars DataFrame with ``client_id``, ``mon`` and
            ``target_1`` .. ``target_4`` columns; ``mon`` must be comparable
            with ``target_month``.
        target_month: month id whose rows become the labels.
        dial_features, geo_features, trx_features: polars DataFrames keyed by
            ``client_id``.

    Returns:
        pandas DataFrame: the label rows left-joined with the feature frame,
        sorted by ``client_id`` and then ``mon``.
    """
    labels = data.filter(pl.col("mon") == target_month)
    history = data.filter(pl.col("mon") < target_month)

    # Per-client totals of each target over the history window.
    features = history.group_by("client_id").agg(
        [pl.col(f"target_{i}").sum().alias(f"target_{i}_cnt") for i in (1, 2, 3, 4)]
    )

    # Distance in months from target_month back to the latest positive of each target.
    for i in (1, 2, 3, 4):
        months_since_last = (
            history
            .filter(pl.col(f"target_{i}") == 1)
            .group_by("client_id")
            .agg((target_month - pl.col("mon").max()).alias(f"target_{i}_last"))
        )
        features = features.join(months_since_last, on="client_id", how="left")

    for extra in (dial_features, geo_features, trx_features):
        features = features.join(extra, on="client_id", how="left")

    # A single whole-frame fill after the last join yields the same frame as
    # filling after every join: all joins key on client_id only, so nulls left
    # in other columns in the meantime cannot influence them.
    features = features.fill_null(-1)

    # NOTE(review): this last left join is not followed by fill_null, so a label
    # row whose client has no rows before target_month keeps nulls (not -1) in
    # every feature column - confirm that is intended.
    return (
        labels
        .join(features, on="client_id", how="left")
        .sort(["client_id", "mon"])
        .to_pandas()
    )

In [13]:
import pandas as pd

In [14]:
# One labelled table per target month (9, 10, 11), each paired with the
# feature files computed for that month, stacked into a single training frame.
train_df = pd.concat(
    [
        make_df(train_target, month, dial, geo, trx)
        for month, dial, geo, trx in (
            (9, train_dial_features_9, train_geo_features_9, train_trx_features_9),
            (10, train_dial_features_10, train_geo_features_10, train_trx_features_10),
            (11, train_dial_features_11, train_geo_features_11, train_trx_features_11),
        )
    ]
)
train_df

Unnamed: 0,mon,target_1,target_2,target_3,target_4,client_id,target_1_cnt,target_2_cnt,target_3_cnt,target_4_cnt,...,trx_len_period,trx_cnt_per_day,trx_density,amt_cnt,amt_sum,amt_min,amt_max,amt_mean,amt_median,amt_range
0,9,0,0,1,0,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,0,0,...,303,1.935185,1.379538,395,110755200.0,0.019202,13725419.00,2.803929e+05,2407.836670,1.372542e+07
1,9,0,0,0,0,00000c9536a42b45ca93288862cddcbb52a3e1e76f8684...,0,0,0,0,...,-1,-1.000000,-1.000000,-1,-1.0,-1.000000,-1.00,-1.000000e+00,-1.000000,-1.000000e+00
2,9,0,0,0,0,00002ba5d2cf0fe2e9ad4a4397f4634f26e84f58ed48e3...,0,0,0,0,...,301,1.615764,1.089701,328,27318648.0,139.141693,3531107.75,8.328856e+04,29830.580078,3.530968e+06
3,9,0,0,0,0,000030a4067420da425d21ea72d5e647d26cf279e55179...,0,0,0,0,...,268,1.200000,0.201493,54,44068160.0,1.919947,17721738.00,8.160771e+05,23429.347656,1.772174e+07
4,9,0,0,0,0,00004427740977a56f391bc2bbc636803ed933205228f8...,0,0,0,0,...,-1,-1.000000,-1.000000,-1,-1.0,-1.000000,-1.00,-1.000000e+00,-1.000000,-1.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853887,11,0,0,0,0,ffffa99ee602d379ea65e0fbdbfb0c82ed074e28cd3ada...,0,0,0,0,...,319,1.059701,0.222571,71,12575020.0,45.507812,1984717.75,1.771130e+05,48196.054688,1.984672e+06
853888,11,0,0,0,0,ffffa9af8a057b55b18af946e157391cd9f5a5fd9b61cb...,0,0,0,1,...,323,1.425532,0.622291,201,20925224.0,1.763178,1552848.25,1.041056e+05,5948.513672,1.552846e+06
853889,11,0,0,0,0,ffffab5f6ae1c8d04d83ef12e2ad803298737992698079...,2,0,0,0,...,336,1.581522,0.866071,291,23130860.0,36.625301,1740085.00,7.948749e+04,10342.759766,1.740048e+06
853890,11,0,0,0,0,ffffda41a92ae10c8ae3920828129ef09c1517ba7c74cb...,0,0,0,0,...,338,1.191919,0.349112,118,170973232.0,3871.174805,12794200.00,1.448926e+06,407116.125000,1.279033e+07


In [15]:
# Validation frame: labelled tables for target months 9 and 10 built from the
# test targets and the matching validation feature files.
val_df = pd.concat(
    [
        make_df(test_target, month, dial, geo, trx)
        for month, dial, geo, trx in (
            (9, val_dial_features_9, val_geo_features_9, val_trx_features_9),
            (10, val_dial_features_10, val_geo_features_10, val_trx_features_10),
        )
    ]
)
val_df

Unnamed: 0,mon,target_1,target_2,target_3,target_4,client_id,target_1_cnt,target_2_cnt,target_3_cnt,target_4_cnt,...,trx_len_period,trx_cnt_per_day,trx_density,amt_cnt,amt_sum,amt_min,amt_max,amt_mean,amt_median,amt_range
0,9,0,0,0,0,00011c01bb22d8f62d9655f32d123dcca5ae55179f8266...,0,0,0,0,...,304,1.437086,0.713816,217,1.147953e+07,52.802216,1.026145e+06,52901.054688,8270.170898,1.026092e+06
1,9,0,0,1,0,0001ac6446bf223a094d6514a6c890d82e9aa92104dee0...,0,0,0,0,...,304,2.196891,1.394737,424,6.166176e+07,0.125596,6.472754e+06,145428.687500,35113.679688,6.472754e+06
2,9,0,0,0,0,0001b878e81279fa43c4429616359b5b276eecc69ddc31...,0,0,0,0,...,259,1.050000,0.081081,21,9.336847e+06,4227.821289,3.991036e+06,444611.750000,130198.226562,3.986808e+06
3,9,0,0,0,0,0003304a0f65d675ddfbc0691e0c564d26a4c9e08edf67...,0,0,0,0,...,305,1.685567,1.072131,327,5.691666e+07,0.027988,7.880474e+06,174057.046875,19582.207031,7.880474e+06
4,9,0,0,0,0,00037813e71deead5685649d494c9a412391942fe771e2...,1,0,0,1,...,91,1.294118,0.241758,22,9.973691e+06,10193.813477,3.209490e+06,453349.593750,73133.242188,3.199296e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48872,10,0,0,0,0,fffb5a08dfd1731d8a64ece41cb427249ae398e8ad684e...,0,0,0,0,...,-1,-1.000000,-1.000000,-1,-1.000000e+00,-1.000000,-1.000000e+00,-1.000000,-1.000000,-1.000000e+00
48873,10,0,0,0,0,fffb6dce2508d4a68c18546b59f6cf0f2dfcebd55035da...,0,0,0,0,...,118,1.000000,0.042373,5,4.746866e+05,9174.087891,4.212281e+05,94937.312500,13418.260742,4.120540e+05
48874,10,0,0,0,0,fffb7431655f360f93a4fe4a2628dee9d476c191dfb05e...,0,0,0,0,...,334,1.805668,1.335329,446,2.050116e+07,0.016002,1.400206e+06,45966.718750,5046.887695,1.400206e+06
48875,10,0,0,0,0,fffbcae1f0b7a651ad9ca6acbe35fd193aeafa2f9bc227...,0,0,0,0,...,334,1.862903,1.383234,462,4.314782e+07,31.141556,1.466791e+06,93393.539062,14448.648438,1.466760e+06


In [16]:
# Force a garbage-collection pass after the large intermediate frames were built.
gc.collect()

0

# Target 1: train with validation

In [28]:
# Build the training and validation pools for target_1 the same way:
# identifiers, all four labels and two src_type columns are excluded from the
# features; `mon` and the remaining src_type columns are categorical.
train_pool, val_pool = (
    Pool(
        data=frame.drop(
            columns=["client_id", "target_1", "target_2", "target_3", "target_4", "src_type21", "src_type31"]
        ),
        label=frame["target_1"],
        cat_features=["mon", "src_type22", "src_type32"],
    )
    for frame in (train_df, val_df)
)

In [29]:
# CatBoost settings shared by the classifier fitted in the next cell.
params = dict(
    task_type="GPU",
    loss_function="CrossEntropy",
    eval_metric="AUC",
    iterations=2500,
    max_depth=7,
    learning_rate=0.05,
)

In [30]:
# Fit the target_1 classifier on train_pool, evaluating on val_pool as it trains.
# use_best_model=True is requested together with the eval_set; plot=True shows the metric widget.
model_cb_1 = CatBoostClassifier(**params, random_seed=56, one_hot_max_size=4)
model_cb_1.fit(train_pool, eval_set=val_pool, plot=True, verbose=False, use_best_model=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x215c755a280>

In [31]:
# Highest validation AUC recorded during the target_1 training run.
max(model_cb_1.get_evals_result()["validation"]["AUC"])

0.809767872095108

# Target 2: train with validation

In [32]:
# Build the training and validation pools for target_2 the same way:
# identifiers, all four labels and two src_type columns are excluded from the
# features; `mon` and the remaining src_type columns are categorical.
train_pool, val_pool = (
    Pool(
        data=frame.drop(
            columns=["client_id", "target_1", "target_2", "target_3", "target_4", "src_type21", "src_type31"]
        ),
        label=frame["target_2"],
        cat_features=["mon", "src_type22", "src_type32"],
    )
    for frame in (train_df, val_df)
)

In [33]:
# CatBoost settings shared by the classifier fitted in the next cell.
params = dict(
    task_type="GPU",
    loss_function="CrossEntropy",
    eval_metric="AUC",
    iterations=2500,
    max_depth=7,
    learning_rate=0.05,
)

In [34]:
# Fit the target_2 classifier on train_pool, evaluating on val_pool as it trains.
# use_best_model=True is requested together with the eval_set; plot=True shows the metric widget.
model_cb_2 = CatBoostClassifier(**params, random_seed=56, one_hot_max_size=4)
model_cb_2.fit(train_pool, eval_set=val_pool, plot=True, verbose=False, use_best_model=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x215c750b760>

In [35]:
# Highest validation AUC recorded during the target_2 training run.
max(model_cb_2.get_evals_result()["validation"]["AUC"])

0.8747016787528992

# Target 3: train with validation

In [36]:
# Build the training and validation pools for target_3 the same way:
# identifiers, all four labels and two src_type columns are excluded from the
# features; `mon` and the remaining src_type columns are categorical.
train_pool, val_pool = (
    Pool(
        data=frame.drop(
            columns=["client_id", "target_1", "target_2", "target_3", "target_4", "src_type21", "src_type31"]
        ),
        label=frame["target_3"],
        cat_features=["mon", "src_type22", "src_type32"],
    )
    for frame in (train_df, val_df)
)

In [37]:
# CatBoost settings shared by the classifier fitted in the next cell.
params = dict(
    task_type="GPU",
    loss_function="CrossEntropy",
    eval_metric="AUC",
    iterations=2500,
    max_depth=7,
    learning_rate=0.05,
)

In [38]:
# Fit the target_3 classifier on train_pool, evaluating on val_pool as it trains.
# use_best_model=True is requested together with the eval_set; plot=True shows the metric widget.
model_cb_3 = CatBoostClassifier(**params, random_seed=56, one_hot_max_size=4)
model_cb_3.fit(train_pool, eval_set=val_pool, plot=True, verbose=False, use_best_model=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x215c779fdf0>

In [39]:
# Highest validation AUC recorded during the target_3 training run.
max(model_cb_3.get_evals_result()["validation"]["AUC"])

0.8426342308521271

# Target 4: train with validation

In [40]:
# Build the training and validation pools for target_4 the same way:
# identifiers, all four labels and two src_type columns are excluded from the
# features; `mon` and the remaining src_type columns are categorical.
train_pool, val_pool = (
    Pool(
        data=frame.drop(
            columns=["client_id", "target_1", "target_2", "target_3", "target_4", "src_type21", "src_type31"]
        ),
        label=frame["target_4"],
        cat_features=["mon", "src_type22", "src_type32"],
    )
    for frame in (train_df, val_df)
)

In [41]:
# CatBoost settings shared by the classifier fitted in the next cell.
params = dict(
    task_type="GPU",
    loss_function="CrossEntropy",
    eval_metric="AUC",
    iterations=2500,
    max_depth=7,
    learning_rate=0.05,
)

In [42]:
# Fit the target_4 classifier on train_pool, evaluating on val_pool as it trains.
# use_best_model=True is requested together with the eval_set; plot=True shows the metric widget.
model_cb_4 = CatBoostClassifier(**params, random_seed=56, one_hot_max_size=4)
model_cb_4.fit(train_pool, eval_set=val_pool, plot=True, verbose=False, use_best_model=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x215ba1a0040>

In [43]:
# Highest validation AUC recorded during the target_4 training run.
max(model_cb_4.get_evals_result()["validation"]["AUC"])

0.909576028585434

# Summary: mean of the best validation AUCs

In [44]:
# Sum the best validation AUC of each of the four models, then average them.
s = sum(
    (
        max(fitted.get_evals_result()["validation"]["AUC"])
        for fitted in (model_cb_1, model_cb_2, model_cb_3, model_cb_4)
    ),
    0.0,
)
s / 4.0

0.8591699525713921

# Retrain on all labelled data (train + validation)

## Target 1

In [64]:
# Stack the training and validation frames and build a single pool for target_1.
df = pd.concat((train_df, val_df))
train_pool = Pool(
    data=df.drop(
        columns=["client_id", "target_1", "target_2", "target_3", "target_4", "src_type21", "src_type31"]
    ),
    label=df["target_1"],
    cat_features=["mon", "src_type22", "src_type32"],
)

In [65]:
# CatBoost settings for the final target_1 model.
# NOTE(review): the iteration count is hard-coded; presumably taken from the
# validation run above - confirm.
params = dict(
    task_type="GPU",
    loss_function="CrossEntropy",
    eval_metric="AUC",
    iterations=2500,
    max_depth=7,
    learning_rate=0.05,
)

In [66]:
# Refit the target_1 model on the combined pool. No eval_set is passed here.
# This rebinds `model_cb_1`, replacing the validation-time model of the same name.
model_cb_1 = CatBoostClassifier(**params, random_seed=56, one_hot_max_size=4)
model_cb_1.fit(train_pool, plot=True, verbose=False)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x215c740c610>

## Target 2

In [67]:
# Stack the training and validation frames and build a single pool for target_2.
df = pd.concat((train_df, val_df))
train_pool = Pool(
    data=df.drop(
        columns=["client_id", "target_1", "target_2", "target_3", "target_4", "src_type21", "src_type31"]
    ),
    label=df["target_2"],
    cat_features=["mon", "src_type22", "src_type32"],
)

In [68]:
# CatBoost settings for the final target_2 model.
# NOTE(review): the iteration count (1500) differs from the validation run's
# 2500; presumably chosen from that run's metric curve - confirm.
params = dict(
    task_type="GPU",
    loss_function="CrossEntropy",
    eval_metric="AUC",
    iterations=1500,
    max_depth=7,
    learning_rate=0.05,
)

In [69]:
# Refit the target_2 model on the combined pool. No eval_set is passed here.
# This rebinds `model_cb_2`, replacing the validation-time model of the same name.
model_cb_2 = CatBoostClassifier(**params, random_seed=56, one_hot_max_size=4)
model_cb_2.fit(train_pool, plot=True, verbose=False)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x215c8d47910>

## Target 3

In [70]:
# Stack the training and validation frames and build a single pool for target_3.
df = pd.concat((train_df, val_df))
train_pool = Pool(
    data=df.drop(
        columns=["client_id", "target_1", "target_2", "target_3", "target_4", "src_type21", "src_type31"]
    ),
    label=df["target_3"],
    cat_features=["mon", "src_type22", "src_type32"],
)

In [71]:
# CatBoost settings for the final target_3 model.
# NOTE(review): the iteration count (2250) differs from the validation run's
# 2500; presumably chosen from that run's metric curve - confirm.
params = dict(
    task_type="GPU",
    loss_function="CrossEntropy",
    eval_metric="AUC",
    iterations=2250,
    max_depth=7,
    learning_rate=0.05,
)

In [72]:
# Refit the target_3 model on the combined pool. No eval_set is passed here.
# This rebinds `model_cb_3`, replacing the validation-time model of the same name.
model_cb_3 = CatBoostClassifier(**params, random_seed=56, one_hot_max_size=4)
model_cb_3.fit(train_pool, plot=True, verbose=False)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x21641eb1220>

## Target 4

In [73]:
# Stack the training and validation frames and build a single pool for target_4.
df = pd.concat((train_df, val_df))
train_pool = Pool(
    data=df.drop(
        columns=["client_id", "target_1", "target_2", "target_3", "target_4", "src_type21", "src_type31"]
    ),
    label=df["target_4"],
    cat_features=["mon", "src_type22", "src_type32"],
)

In [74]:
# CatBoost settings for the final target_4 model.
# NOTE(review): the iteration count is hard-coded; presumably taken from the
# validation run above - confirm.
params = dict(
    task_type="GPU",
    loss_function="CrossEntropy",
    eval_metric="AUC",
    iterations=2500,
    max_depth=7,
    learning_rate=0.05,
)

In [75]:
# Refit the target_4 model on the combined pool. No eval_set is passed here.
# This rebinds `model_cb_4`, replacing the validation-time model of the same name.
model_cb_4 = CatBoostClassifier(**params, random_seed=56, one_hot_max_size=4)
model_cb_4.fit(train_pool, plot=True, verbose=False)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x215c73fb910>

## Inference

In [76]:
def make_df(data, last_month, dial_features, geo_features, trx_features):
    """Build the unlabelled feature table used to predict month ``last_month + 1``.

    Only clients whose latest ``mon`` in ``data`` equals ``last_month`` are
    kept, and all of their rows are used as history. Per client the result
    holds:

    * ``target_{i}_cnt``  - sum of ``target_{i}`` over the history;
    * ``mon``             - the constant ``last_month + 1`` (UInt8);
    * ``target_{i}_last`` - ``last_month + 1`` minus the latest month in which
      ``target_{i}`` equalled 1;
    * the dial / geo / trx features, left-joined on ``client_id``.

    Nulls in the resulting frame are replaced with -1.

    Note: this definition reuses the name ``make_df`` of the training-time
    builder defined earlier in the notebook (which takes ``target_month``), so
    it replaces that function once this cell has run.

    Args:
        data: polars DataFrame with ``client_id``, ``mon`` and
            ``target_1`` .. ``target_4`` columns.
        last_month: month id of the last observed month of the clients to score.
        dial_features, geo_features, trx_features: polars DataFrames keyed by
            ``client_id``.

    Returns:
        pandas DataFrame sorted by ``client_id``.
    """
    predict_month = last_month + 1

    # Clients whose history ends exactly at last_month.
    clients = (
        data
        .group_by("client_id")
        .agg(last_mon=pl.col("mon").max())
        .filter(pl.col("last_mon") == last_month)
    )["client_id"]
    history = data.filter(pl.col("client_id").is_in(clients))

    # Per-client totals of each target, plus the constant month being predicted.
    features = (
        history
        .group_by("client_id")
        .agg(
            [pl.col(f"target_{i}").sum().alias(f"target_{i}_cnt") for i in (1, 2, 3, 4)]
        )
        .with_columns(mon=pl.lit(predict_month, pl.UInt8))
    )

    # Distance in months from the predicted month back to the latest positive of each target.
    for i in (1, 2, 3, 4):
        months_since_last = (
            history
            .filter(pl.col(f"target_{i}") == 1)
            .group_by("client_id")
            .agg((predict_month - pl.col("mon").max()).alias(f"target_{i}_last"))
        )
        features = features.join(months_since_last, on="client_id", how="left")

    for extra in (dial_features, geo_features, trx_features):
        features = features.join(extra, on="client_id", how="left")

    # A single whole-frame fill after the last join yields the same frame as
    # filling after every join: all joins key on client_id only, so nulls left
    # in other columns in the meantime cannot influence them.
    return features.fill_null(-1).sort("client_id").to_pandas()

In [77]:
# Start from the sample submission: its first CSV column is read as the row index,
# then client_id becomes the index so predictions can be written per client with .loc.
submission = pd.read_csv("./data/sample_submission.csv", index_col=0).set_index("client_id")
submission

Unnamed: 0_level_0,target_1,target_2,target_3,target_4
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00128b10d2dd4d3e458244fc4a0f2205a86c2058843230727658cd6da1ef94c8,0.001395,0.000347,0.001010,0.000652
00136b2bb7c74fa8fb5362311096d4d69d652064d1cca8f68bf52f7de8ef213b,0.002157,0.000092,0.000929,0.000724
00154d0e99ec74f19a48b13975cf3bd8af6016f73497864553530994f91aa82d,0.002502,0.000790,0.003451,0.002504
0015f88936ad24473f9189b44812d1112a0d5c7950dfb635161b0d96273fa73c,0.002056,0.000107,0.001724,0.000808
00183b1793f295e0caacf88724ff50d680d2685abccab8ce9090cbb88aab01f9,0.007432,0.004129,0.005457,0.022631
...,...,...,...,...
fffb5a08dfd1731d8a64ece41cb427249ae398e8ad684e846f06e8ce0f67578d,0.000026,0.000002,0.000022,0.000007
fffb6dce2508d4a68c18546b59f6cf0f2dfcebd55035da0f789be51ec7d246b1,0.001722,0.000112,0.000429,0.000431
fffb7431655f360f93a4fe4a2628dee9d476c191dfb05ec0a8fdb04c9a6242fd,0.005104,0.000225,0.006404,0.006068
fffbcae1f0b7a651ad9ca6acbe35fd193aeafa2f9bc2276b6bac8b3ccc75b82d,0.002513,0.000202,0.004706,0.006886


In [78]:
# Score clients whose history ends at month 8 (prediction month 9) with the
# month-9 feature files, and write each model's positive-class probability
# into the matching submission column.
test_df = make_df(test_target, 9 - 1, val_dial_features_9, val_geo_features_9, val_trx_features_9)
test_pool = Pool(
    data=test_df.drop(columns=["client_id", "src_type21", "src_type31"]),
    cat_features=["mon", "src_type22", "src_type32"],
)
for target_col, fitted_model in (
    ("target_1", model_cb_1),
    ("target_2", model_cb_2),
    ("target_3", model_cb_3),
    ("target_4", model_cb_4),
):
    submission.loc[test_df["client_id"], target_col] = fitted_model.predict_proba(test_pool)[:, 1]

In [79]:
# Score clients whose history ends at month 9 (prediction month 10) with the
# month-10 feature files, and write each model's positive-class probability
# into the matching submission column.
test_df = make_df(test_target, 10 - 1, val_dial_features_10, val_geo_features_10, val_trx_features_10)
test_pool = Pool(
    data=test_df.drop(columns=["client_id", "src_type21", "src_type31"]),
    cat_features=["mon", "src_type22", "src_type32"],
)
for target_col, fitted_model in (
    ("target_1", model_cb_1),
    ("target_2", model_cb_2),
    ("target_3", model_cb_3),
    ("target_4", model_cb_4),
):
    submission.loc[test_df["client_id"], target_col] = fitted_model.predict_proba(test_pool)[:, 1]

In [80]:
# Score clients whose history ends at month 10 (prediction month 11) with the
# month-11 feature files, and write each model's positive-class probability
# into the matching submission column.
test_df = make_df(test_target, 11 - 1, val_dial_features_11, val_geo_features_11, val_trx_features_11)
test_pool = Pool(
    data=test_df.drop(columns=["client_id", "src_type21", "src_type31"]),
    cat_features=["mon", "src_type22", "src_type32"],
)
for target_col, fitted_model in (
    ("target_1", model_cb_1),
    ("target_2", model_cb_2),
    ("target_3", model_cb_3),
    ("target_4", model_cb_4),
):
    submission.loc[test_df["client_id"], target_col] = fitted_model.predict_proba(test_pool)[:, 1]

In [81]:
# Turn client_id back into a column and write the CSV. to_csv keeps its default
# index=True, so the fresh integer index is written as the first, unnamed column.
submission.reset_index().to_csv("./submission.csv")