# Libraries

In [1]:
from datetime import datetime
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

import numpy as np
import polars as pl

from catboost import CatBoostClassifier, Pool

In [2]:
# Fix the seed of polars' random number generator for reproducibility.
pl.set_random_seed(56)

# Data

## Video info

In [3]:
# Load the video catalogue and rescale `duration` by 1/1000
# (presumably milliseconds -> seconds; TODO confirm against the data description).
video_info = pl.read_csv("./data/video_info_v2.csv")
video_info = video_info.with_columns(pl.col("duration") / 1000)
print(video_info.shape)
video_info.head()

(481480, 5)


rutube_video_id,title,category,duration,author_id
str,str,str,f64,i64
"""video_185549""","""Как собрать бу…","""Хобби""",1559.16,1015054
"""video_111035""","""Осторожно, Киб…","""Сериалы""",1320.007,1002180
"""video_476517""","""ПОПУЛЯРНЫЕ ВИД…","""Хобби""",606.145,1095337
"""video_157198""","""Хороший лжец (…","""Фильмы""",6577.44,1043618
"""video_289824""","""Нашего старого…","""Развлечения""",859.493,1009535


## Events

Считаем локальное время в регионе и присоединяем информацию о видео

In [4]:
# Per-region offsets; `diff` is interpolated below into an offset string such as "5h".
time_diffs = pl.read_parquet("./data/time_diffs.parquet")
events = pl.read_csv("./data/train_events.csv", try_parse_dates=True)
events = (
    events
    # left join: events whose region is missing from time_diffs are kept (with a null diff)
    .join(time_diffs, on="region", how="left")
    .with_columns(
        # shift the event time by `diff` hours; the result keeps the UTC time-zone label
        # (see the printed schema), i.e. it is local wall-clock time tagged as UTC
        local_timestamp=pl.col("event_timestamp").dt.offset_by(pl.format("{}h", pl.col("diff")))
    )
    .drop("diff")
)
# attach category, duration and author_id of the watched video (the title is not used)
events = events.join(video_info.drop("title"), on="rutube_video_id", how="left")
print(events.shape)
events.head()

(1759616, 13)


event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,total_watchtime,rutube_video_id,viewer_uid,local_timestamp,category,duration,author_id
"datetime[μs, UTC]",str,str,str,str,str,i64,str,i64,"datetime[μs, UTC]",str,f64,i64
2024-06-01 03:40:58 UTC,"""Chelyabinsk""","""desktop""","""browser""","""Windows""","""Yandex Browser…",1883,"""video_133074""",10067243,2024-06-01 08:40:58 UTC,"""Телепередачи""",2456.534,1009219
2024-06-01 16:33:24 UTC,"""Bashkortostan …","""smartphone""","""mobile app""","""Android""","""Rutube""",512,"""video_362960""",10245341,2024-06-01 21:33:24 UTC,"""Юмор""",519.211,1006760
2024-06-01 18:30:43 UTC,"""St.-Petersburg…","""desktop""","""browser""","""Windows""","""Chrome""",5647,"""video_96775""",10894333,2024-06-01 21:30:43 UTC,"""Телепередачи""",5518.28,1009257
2024-06-01 20:03:42 UTC,"""Moscow""","""smartphone""","""mobile app""","""Android""","""Rutube""",1521,"""video_161610""",10029092,2024-06-01 23:03:42 UTC,"""Разное""",1522.069,1058671
2024-06-01 19:48:09 UTC,"""Moscow""","""smartphone""","""mobile app""","""Android""","""Rutube""",71,"""video_116245""",10452976,2024-06-01 22:48:09 UTC,"""Путешествия""",1249.92,1020020


при подсчете будем учитывать только наиболее популярные категории

In [5]:
def _top_values(frame, column, n):
    """Return the `n` values of `column` watched by the most distinct viewers.

    Ties on the viewer count are broken by the column value itself. Without
    the secondary key the selection at the cut-off could change between runs,
    because `group_by` does not guarantee the order of its output rows.

    Parameters
    ----------
    frame : pl.DataFrame
        Event log with a `viewer_uid` column and the column `column`.
    column : str
        Name of the categorical column to rank.
    n : int
        Number of top values to keep.

    Returns
    -------
    pl.Series
        The selected values, most popular first.
    """
    return (
        frame
        .group_by(column)
        .agg(viewers=pl.col("viewer_uid").n_unique())
        .sort(["viewers", column], descending=[True, False])[column][:n]
    )


# Values kept as separate categories in make_features; everything else becomes "other".
os_values = _top_values(events, "ua_os", 16)
client_name_values = _top_values(events, "ua_client_name", 32)
author_values = _top_values(events, "author_id", 64)

## Target

In [6]:
# Load the labels and encode `sex` as 1 where the value is "male" and 0 otherwise.
target = pl.read_csv("./data/train_targets.csv")
target = target.with_columns((pl.col("sex") == "male").cast(pl.Int8))
target.head()

viewer_uid,age,sex,age_class
i64,i64,i8,i64
10087154,30,1,1
10908708,25,0,1
10190464,34,1,2
10939673,25,1,1
10288257,48,1,3


разобьем выборку на три равные части

In [7]:
# Hold out one third of the users, stratified by age class.
train_target, val_target = train_test_split(target, test_size=1/3, random_state=56, stratify=target["age_class"])

In [8]:
# Split the remaining two thirds in half (again stratified), giving three parts: p1, p2 and val.
p1_target, p2_target = train_test_split(train_target, test_size=1/2, random_state=56, stratify=train_target["age_class"])

# Feature engineering

## Target encoding

подход, аналогичный предложенному в базовом решении

сначала считаем среднее значение target для элемента группы, затем для пользователя усредняем по этим значениям для всех элементов из его истории

In [9]:
def target_encoding(data, test_ids, train_ids, group):
    """Compute per-viewer target-encoding features for the column `group`.

    Step 1: for every value of `group`, aggregate the labels of the viewers in
    `train_ids` over their events (event count, mean sex, mean age and the
    share of each of the age classes 0-3).
    Step 2: for every viewer in `test_ids`, average those per-value statistics
    over the viewer's own events (plain mean, count-weighted mean and
    count-times-watch-time weighted mean for sex).

    Parameters
    ----------
    data : pl.DataFrame
        Event log; must contain `viewer_uid`, `total_watchtime` and `group`.
    test_ids : pl.DataFrame
        Frame with a `viewer_uid` column: the viewers to build features for.
    train_ids : pl.DataFrame
        Frame with `viewer_uid`, `sex`, `age` and `age_class`: the viewers
        whose labels feed the encoding. Any other columns are ignored.
    group : str
        Name of the column of `data` to encode.

    Returns
    -------
    pl.DataFrame
        One row per viewer of `test_ids` that has at least one event whose
        `group` value also occurs among the events of `train_ids`. Columns:
        `viewer_uid`, `{group}_enc_count`, `{group}_sex_mean_v1`,
        `{group}_age_mean`, `{group}_age_{0..3}_mean`, `{group}_sex_mean_v2`,
        `{group}_sex_mean_v3`.
    """
    enc_count = f"{group}_enc_count"
    sex_mean = f"{group}_sex_mean"
    age_mean = f"{group}_age_mean"
    age_class_means = [f"{group}_age_{k}_mean" for k in range(4)]

    # Only the label columns are joined, so feature columns that were already
    # attached to `train_ids` by an earlier call cannot leak into (or collide
    # with) the event columns.
    labels = train_ids.select(["viewer_uid", "sex", "age", "age_class"])

    # Step 1: label statistics per value of `group`, from labelled viewers only.
    encoding = (
        data
        .join(labels, on="viewer_uid", how="inner")
        .group_by(group)
        .agg(
            pl.count().alias(enc_count),
            pl.col("sex").mean().alias(sex_mean),
            pl.col("age").mean().alias(age_mean),
            *[
                (pl.col("age_class") == k).mean().alias(name)
                for k, name in enumerate(age_class_means)
            ],
        )
    )

    # Step 2: average the statistics over each target viewer's events.
    # The inner join drops exactly the events without an encoding. A bare
    # drop_nulls() would also discard events that merely have a null in an
    # unrelated column (e.g. missing video metadata).
    features = (
        data
        .filter(pl.col("viewer_uid").is_in(test_ids["viewer_uid"]))
        .join(encoding, on=group, how="inner")
        .group_by("viewer_uid")
        .agg(
            pl.col(enc_count).mean().alias(enc_count),
            pl.col(sex_mean).mean().alias(f"{group}_sex_mean_v1"),
            pl.col(age_mean).mean().alias(age_mean),
            *[pl.col(name).mean().alias(name) for name in age_class_means],
            # v2: mean sex weighted by how many labelled events back each value
            (
                (pl.col(sex_mean) * pl.col(enc_count)).sum() / pl.col(enc_count).sum()
            ).alias(f"{group}_sex_mean_v2"),
            # v3: same, additionally weighted by the watch time of the event
            # NOTE(review): if every watch time of a viewer is 0 this is 0/0 — confirm how it should be handled
            (
                (pl.col(sex_mean) * pl.col(enc_count) * pl.col("total_watchtime")).sum()
                / (pl.col(enc_count) * pl.col("total_watchtime")).sum()
            ).alias(f"{group}_sex_mean_v3"),
        )
    )

    return features

In [10]:
# Out-of-fold encoding by author: each part is encoded with the labels of the other two parts only.
p1_add = target_encoding(events, p1_target, pl.concat([p2_target, val_target]), "author_id")
p2_add = target_encoding(events, p2_target, pl.concat([p1_target, val_target]), "author_id")
val_add = target_encoding(events, val_target, pl.concat([p1_target, p2_target]), "author_id")

In [11]:
# Attach the author encodings; viewers absent from the encoding frame get -1 in the new columns.
# NOTE: this rebinds p1_target/p2_target/val_target, so the cell is not idempotent.
p1_target = p1_target.join(p1_add, on="viewer_uid", how="left").fill_null(-1)
p2_target = p2_target.join(p2_add, on="viewer_uid", how="left").fill_null(-1)
val_target = val_target.join(val_add, on="viewer_uid", how="left").fill_null(-1)

In [12]:
# Same out-of-fold scheme, now encoding by video.
p1_add = target_encoding(events, p1_target, pl.concat([p2_target, val_target]), "rutube_video_id")
p2_add = target_encoding(events, p2_target, pl.concat([p1_target, val_target]), "rutube_video_id")
val_add = target_encoding(events, val_target, pl.concat([p1_target, p2_target]), "rutube_video_id")

In [13]:
# Attach the video encodings; viewers absent from the encoding frame get -1 in the new columns.
p1_target = p1_target.join(p1_add, on="viewer_uid", how="left").fill_null(-1)
p2_target = p2_target.join(p2_add, on="viewer_uid", how="left").fill_null(-1)
val_target = val_target.join(val_add, on="viewer_uid", how="left").fill_null(-1)

In [14]:
val_target.head()

viewer_uid,age,sex,age_class,author_id_enc_count,author_id_sex_mean_v1,author_id_age_mean,author_id_age_0_mean,author_id_age_1_mean,author_id_age_2_mean,author_id_age_3_mean,author_id_sex_mean_v2,author_id_sex_mean_v3,rutube_video_id_enc_count,rutube_video_id_sex_mean_v1,rutube_video_id_age_mean,rutube_video_id_age_0_mean,rutube_video_id_age_1_mean,rutube_video_id_age_2_mean,rutube_video_id_age_3_mean,rutube_video_id_sex_mean_v2,rutube_video_id_sex_mean_v3
i64,i64,i8,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
10010270,37,0,2,139320.882353,0.207515,32.666416,0.032954,0.397475,0.399379,0.170192,0.165969,0.184095,5551.294118,0.18813,32.083724,0.033725,0.412542,0.411089,0.142644,0.148688,0.155666
10069632,42,1,3,507.0,0.882627,38.375153,0.009534,0.150942,0.454247,0.385277,0.889941,0.904602,370.6,0.877003,37.907367,0.00729,0.167108,0.468367,0.357236,0.888829,0.893952
10223628,35,1,2,38030.0,0.933053,38.960531,0.005469,0.124349,0.442545,0.427636,0.933053,0.933053,1063.0,0.955786,39.462841,0.005644,0.101599,0.437441,0.455315,0.955786,0.955786
10438963,46,1,3,17766.0,0.657379,34.208319,0.04351,0.335979,0.359563,0.260948,0.657379,0.657379,7.0,0.571429,40.0,0.0,0.142857,0.285714,0.571429,0.571429,0.571429
10328852,39,1,2,130769.5,0.471777,32.711579,0.1461,0.336029,0.196404,0.321467,0.193562,0.193554,5487.0,0.283762,33.316202,0.01622,0.356479,0.466193,0.161108,0.283762,0.283762


## Features

Признаки:

1) на основе времени события

2) на основе продолжительности просмотра и длины видео

3) количество уникальных значений

4) нормализованное количество действий по наиболее популярным категориям


In [15]:
def make_features(data):
    """Aggregate an event log into one row of behavioural features per viewer.

    The result contains:
      1. activity counts and day-based statistics of the events;
      2. watch-time / video-duration statistics;
      3. numbers of distinct regions, devices, clients, videos, categories, authors;
      4. for several categorical columns, the share of the viewer's events that
         fall into each value (rare OS / client / author values are merged
         into "other").

    Relies on the notebook-level series `os_values`, `client_name_values` and
    `author_values` for the list of values that are kept as-is.

    NOTE(review): the share columns are created by pivoting, so their set
    depends on the values present in `data`. Two different inputs (e.g. train
    and test events) may yield different columns — confirm they match before
    feeding both to the same model.

    Parameters
    ----------
    data : pl.DataFrame
        Event log with the columns produced by the event-loading cells
        (timestamps, user-agent fields, `total_watchtime`, `rutube_video_id`,
        `viewer_uid`, `category`, `duration`, `author_id`).

    Returns
    -------
    pl.DataFrame
        One row per `viewer_uid`.
    """
    # auxiliary per-event values
    min_dt = datetime(2024, 5, 1)  # reference date: event_day counts days since it
    data = data.with_columns(
        event_day=(pl.col("event_timestamp").dt.replace_time_zone(None) - min_dt).dt.days(),
        local_hour=pl.col("local_timestamp").dt.hour(),
        local_weekday=pl.col("local_timestamp").dt.weekday(),
        watchtime_ratio=pl.col("total_watchtime") / pl.col("duration"),
    )

    # base features
    features = (
        data
        .group_by("viewer_uid")
        .agg(
            event_cnt=pl.count(),
            event_uniq_days=pl.col("event_day").n_unique(),
            event_first_day=pl.col("event_day").min(),
            event_last_day=pl.col("event_day").max(),

            watchtime_sum=pl.col("total_watchtime").sum(),
            watchtime_min=pl.col("total_watchtime").min(),
            watchtime_max=pl.col("total_watchtime").max(),
            duration_sum=pl.col("duration").sum(),
            duration_min=pl.col("duration").min(),
            duration_max=pl.col("duration").max(),
            watchtime_ratio_mean=pl.col("watchtime_ratio").mean(),
            watchtime_ratio_min=pl.col("watchtime_ratio").min(),
            watchtime_ratio_max=pl.col("watchtime_ratio").max(),

            region_uniq=pl.col("region").n_unique(),
            device_uniq=pl.col("ua_device_type").n_unique(),
            client_type_uniq=pl.col("ua_client_type").n_unique(),
            os_uniq=pl.col("ua_os").n_unique(),
            client_name_uniq=pl.col("ua_client_name").n_unique(),
            video_uniq=pl.col("rutube_video_id").n_unique(),
            categ_uniq=pl.col("category").n_unique(),
            author_uniq=pl.col("author_id").n_unique(),
        )
    )

    # derived ratios; event_len_period must exist before event_density, hence two steps
    features = (
        features
        .with_columns(
            event_cnt_per_video=pl.col("event_cnt") / pl.col("video_uniq"),
            watchtime_per_video=pl.col("watchtime_sum") / pl.col("video_uniq"),
            duration_per_video=pl.col("duration_sum") / pl.col("video_uniq"),
            event_len_period=pl.col("event_last_day") - pl.col("event_first_day") + 1,
        )
        .with_columns(
            event_cnt_per_day=pl.col("event_cnt") / pl.col("event_uniq_days"),
            event_density=pl.col("event_cnt") / pl.col("event_len_period"),
        )
    )

    # replace rare categories with "other"
    data = data.with_columns(
        pl.when(pl.col("ua_os").is_in(os_values)).then(pl.col("ua_os")).otherwise(pl.lit("other")),
        pl.when(pl.col("ua_client_name").is_in(client_name_values)).then(pl.col("ua_client_name")).otherwise(pl.lit("other")),
        # author_id is an integer column: cast explicitly so that both branches
        # are strings instead of relying on implicit int/str coercion
        pl.when(pl.col("author_id").is_in(author_values))
        .then(pl.col("author_id").cast(pl.Utf8))
        .otherwise(pl.lit("other")),
    )

    # share of the viewer's events per value of each categorical column
    for feature in (
        "ua_device_type", "ua_client_type", "ua_os", "ua_client_name", "category", "author_id", "local_hour", "local_weekday"
    ):
        cat = data.pivot(
            values="rutube_video_id", index="viewer_uid", columns=feature, aggregate_function="count", sort_columns=True
        ).fill_null(0)
        # first column is the index (viewer_uid); the rest are the pivoted values
        renames = {col: f"{feature}:{col}-event_cnt" for col in cat.columns[1:]}
        features = (
            features
            .join(cat.rename(renames), on="viewer_uid", how="left")
            # normalise the whole family in one pass instead of one call per column
            .with_columns([pl.col(name) / pl.col("event_cnt") for name in renames.values()])
        )

    return features

In [16]:
# Behavioural features for every viewer present in the training events.
features = make_features(events)
features.head()

viewer_uid,event_cnt,event_uniq_days,event_first_day,event_last_day,watchtime_sum,watchtime_min,watchtime_max,duration_sum,duration_min,duration_max,watchtime_ratio_mean,watchtime_ratio_min,watchtime_ratio_max,region_uniq,device_uniq,client_type_uniq,os_uniq,client_name_uniq,video_uniq,categ_uniq,author_uniq,event_cnt_per_video,watchtime_per_video,duration_per_video,event_len_period,event_cnt_per_day,event_density,ua_device_type:desktop-event_cnt,ua_device_type:smartphone-event_cnt,ua_device_type:tablet-event_cnt,ua_client_type:av-event_cnt,ua_client_type:browser-event_cnt,ua_client_type:mobile app-event_cnt,ua_os:Android-event_cnt,ua_os:BlackBerry OS-event_cnt,ua_os:BlackBerry Tablet OS-event_cnt,…,author_id:1071736-event_cnt,author_id:1084744-event_cnt,author_id:1089828-event_cnt,author_id:1090779-event_cnt,author_id:1090867-event_cnt,author_id:other-event_cnt,local_hour:0-event_cnt,local_hour:1-event_cnt,local_hour:10-event_cnt,local_hour:11-event_cnt,local_hour:12-event_cnt,local_hour:13-event_cnt,local_hour:14-event_cnt,local_hour:15-event_cnt,local_hour:16-event_cnt,local_hour:17-event_cnt,local_hour:18-event_cnt,local_hour:19-event_cnt,local_hour:2-event_cnt,local_hour:20-event_cnt,local_hour:21-event_cnt,local_hour:22-event_cnt,local_hour:23-event_cnt,local_hour:3-event_cnt,local_hour:4-event_cnt,local_hour:5-event_cnt,local_hour:6-event_cnt,local_hour:7-event_cnt,local_hour:8-event_cnt,local_hour:9-event_cnt,local_weekday:1-event_cnt,local_weekday:2-event_cnt,local_weekday:3-event_cnt,local_weekday:4-event_cnt,local_weekday:5-event_cnt,local_weekday:6-event_cnt,local_weekday:7-event_cnt
i64,u32,u32,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,u32,u32,u32,u32,u32,u32,u32,u32,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
10683728,4,4,38,50,631,142,203,1338.253,209.7,404.934,0.505316,0.350674,0.686695,2,2,1,2,2,4,4,4,1.0,157.75,334.56325,13,1.0,0.307692,0.25,0.75,0.0,0.0,1.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.0,0.25,0.0,0.0,0.25,0.0,0.25,0.25
10015616,1,1,52,52,201,201,201,668.875,668.875,668.875,0.300505,0.300505,0.300505,1,1,1,1,1,1,1,1,1.0,201.0,668.875,1,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10058336,1,1,33,33,332,332,332,570.167,570.167,570.167,0.582286,0.582286,0.582286,1,1,1,1,1,1,1,1,1.0,332.0,570.167,1,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
10094896,2,2,50,54,573,223,350,1459.933,353.896,1106.037,0.473287,0.316445,0.630129,2,1,1,1,1,2,1,2,1.0,286.5,729.9665,5,1.0,0.4,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0
10591776,2,2,40,45,643,171,472,908.711,428.011,480.7,0.690712,0.399522,0.981901,2,1,1,1,1,2,2,2,1.0,321.5,454.3555,6,1.0,0.333333,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0


# Training

In [17]:
# Accumulators for per-fold validation predictions and labels.
# The cells below append to them, so re-running an append cell adds a duplicate entry;
# re-run this cell first when repeating the training section.
sex_preds = []
sex_labels = []
age_class_preds = []
age_class_labels = []

## On train part

In [18]:
# Fold 1 training frame: parts p1 + p2 with their behavioural features, as a pandas frame.
train_df = pl.concat([p1_target, p2_target]).join(features, on="viewer_uid", how="left").to_pandas()
train_df

Unnamed: 0,viewer_uid,age,sex,age_class,author_id_enc_count,author_id_sex_mean_v1,author_id_age_mean,author_id_age_0_mean,author_id_age_1_mean,author_id_age_2_mean,...,local_hour:7-event_cnt,local_hour:8-event_cnt,local_hour:9-event_cnt,local_weekday:1-event_cnt,local_weekday:2-event_cnt,local_weekday:3-event_cnt,local_weekday:4-event_cnt,local_weekday:5-event_cnt,local_weekday:6-event_cnt,local_weekday:7-event_cnt
0,10926299,21,1,1,5160.000000,0.684302,30.286628,0.045543,0.470930,0.431589,...,0.000000,0.000000,0.0,0.0,0.00,0.0,0.000000,0.0,1.000000,0.0
1,10000148,30,1,1,846.000000,0.518913,31.128842,0.073286,0.530733,0.184397,...,0.000000,0.000000,0.0,0.3,0.20,0.0,0.000000,0.0,0.000000,0.5
2,10936853,41,0,3,4.000000,1.000000,24.250000,0.000000,1.000000,0.000000,...,0.000000,0.000000,0.5,0.0,0.00,0.5,0.000000,0.5,0.000000,0.0
3,10208031,39,1,2,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,0.000000,0.000000,0.0,0.0,0.00,0.0,0.000000,0.0,0.000000,1.0
4,11020967,44,1,3,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,0.000000,0.000000,0.0,0.0,0.00,0.0,1.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120003,10010209,32,1,2,225887.166667,0.258586,32.277123,0.041735,0.407043,0.390602,...,0.166667,0.166667,0.0,0.5,0.00,0.0,0.333333,0.0,0.166667,0.0
120004,10898450,21,0,1,1985.000000,0.693703,30.612594,0.059446,0.450882,0.420151,...,0.000000,0.000000,0.0,0.0,0.00,0.0,0.000000,0.0,0.000000,1.0
120005,10094810,22,0,1,15781.750000,0.593297,34.263350,0.036912,0.338430,0.361167,...,0.000000,0.000000,0.0,0.0,0.25,0.0,0.000000,0.5,0.250000,0.0
120006,10014570,33,0,2,1631.000000,0.331698,36.097486,0.016554,0.256898,0.426732,...,0.500000,0.500000,0.0,0.0,0.00,0.0,0.000000,0.0,1.000000,0.0


In [19]:
# Fold 1 validation frame: the held-out part.
val_df = val_target.join(features, on="viewer_uid", how="left").to_pandas()
val_df

Unnamed: 0,viewer_uid,age,sex,age_class,author_id_enc_count,author_id_sex_mean_v1,author_id_age_mean,author_id_age_0_mean,author_id_age_1_mean,author_id_age_2_mean,...,local_hour:7-event_cnt,local_hour:8-event_cnt,local_hour:9-event_cnt,local_weekday:1-event_cnt,local_weekday:2-event_cnt,local_weekday:3-event_cnt,local_weekday:4-event_cnt,local_weekday:5-event_cnt,local_weekday:6-event_cnt,local_weekday:7-event_cnt
0,10010270,37,0,2,139320.882353,0.207515,32.666416,0.032954,0.397475,0.399379,...,0.047619,0.0,0.142857,0.047619,0.000000,0.047619,0.142857,0.000000,0.619048,0.142857
1,10069632,42,1,3,507.000000,0.882627,38.375153,0.009534,0.150942,0.454247,...,0.000000,0.4,0.000000,0.200000,0.000000,0.200000,0.000000,0.200000,0.000000,0.400000
2,10223628,35,1,2,38030.000000,0.933053,38.960531,0.005469,0.124349,0.442545,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000
3,10438963,46,1,3,17766.000000,0.657379,34.208319,0.043510,0.335979,0.359563,...,0.000000,0.0,0.500000,0.000000,0.000000,0.000000,0.500000,0.500000,0.000000,0.000000
4,10328852,39,1,2,130769.500000,0.471777,32.711579,0.146100,0.336029,0.196404,...,0.000000,0.0,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000,0.000000,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59999,10043026,36,0,2,261535.000000,0.193553,31.923158,0.042201,0.422058,0.392808,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.285714,0.571429,0.142857,0.000000
60000,10009813,26,0,1,102425.823529,0.307541,30.230715,0.055607,0.510494,0.331982,...,0.000000,0.0,0.000000,0.000000,0.058824,0.235294,0.000000,0.117647,0.529412,0.058824
60001,10230457,41,1,3,2925.000000,0.964444,36.008547,0.017436,0.201368,0.533333,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
60002,10663448,36,0,2,6439.833333,0.701164,30.835319,0.046504,0.438356,0.455032,...,0.000000,0.0,0.000000,0.000000,0.333333,0.166667,0.000000,0.000000,0.500000,0.000000


### Sex

In [20]:
# Pools for the sex model: the identifier and all three target columns are
# removed from the feature matrix; `sex` is the label.
train_pool = Pool(
    data=train_df.drop(["viewer_uid", "age", "sex", "age_class"], axis=1),
    label=train_df["sex"],
)

val_pool = Pool(
    data=val_df.drop(["viewer_uid", "age", "sex", "age_class"], axis=1),
    label=val_df["sex"],
)

In [21]:
# CatBoost settings for the sex classifier (binary target, tracked with accuracy).
params = {
    "task_type": "CPU",
    "eval_metric": "Accuracy",
    "learning_rate": 0.1,
    "max_depth": 7,
}

In [22]:
%%time
# val_pool is both the eval_set used by use_best_model and the data the model is
# scored on afterwards, so the reported validation metric is optimistic.
model_cb_sex = CatBoostClassifier(**params, random_seed=56)
model_cb_sex.fit(train_pool, eval_set=val_pool, plot=False, verbose=50, use_best_model=True)

0:	learn: 0.7452003	test: 0.7377008	best: 0.7377008 (0)	total: 116ms	remaining: 1m 56s
50:	learn: 0.7624158	test: 0.7545164	best: 0.7545164 (50)	total: 3.4s	remaining: 1m 3s
100:	learn: 0.7673155	test: 0.7553996	best: 0.7560496 (86)	total: 6.95s	remaining: 1m 1s
150:	learn: 0.7730318	test: 0.7566829	best: 0.7566829 (150)	total: 10.3s	remaining: 57.9s
200:	learn: 0.7797897	test: 0.7572662	best: 0.7572662 (198)	total: 13.3s	remaining: 53s
250:	learn: 0.7867809	test: 0.7570495	best: 0.7576995 (241)	total: 16.3s	remaining: 48.8s
300:	learn: 0.7926805	test: 0.7580328	best: 0.7581328 (276)	total: 19.4s	remaining: 45s
350:	learn: 0.7979468	test: 0.7583328	best: 0.7584994 (340)	total: 22.4s	remaining: 41.4s
400:	learn: 0.8022132	test: 0.7585328	best: 0.7588161 (354)	total: 25.4s	remaining: 37.9s
450:	learn: 0.8062712	test: 0.7582661	best: 0.7588161 (354)	total: 28.5s	remaining: 34.8s
500:	learn: 0.8092461	test: 0.7581995	best: 0.7588827 (467)	total: 31.7s	remaining: 31.6s
550:	learn: 0.8134041

<catboost.core.CatBoostClassifier at 0x184aeb3d400>

0.745633624425038

0.759732684487700 (с target_encoding)

In [23]:
model_cb_sex.get_feature_importance(prettified=True).head(20)

Unnamed: 0,Feature Id,Importances
0,rutube_video_id_sex_mean_v1,9.089361
1,rutube_video_id_sex_mean_v2,4.252779
2,rutube_video_id_sex_mean_v3,4.081554
3,rutube_video_id_age_1_mean,3.198964
4,author_id_sex_mean_v1,2.588463
5,watchtime_per_video,2.270722
6,duration_min,1.940077
7,rutube_video_id_enc_count,1.700627
8,author_id_age_1_mean,1.634879
9,category:Телепередачи-event_cnt,1.530459


In [24]:
# Store the fold-1 class probabilities together with the matching true labels.
sex_preds.append(model_cb_sex.predict_proba(val_pool))
sex_labels.append(val_df["sex"])

### Age

In [25]:
# Pools for the age-class model: same feature matrix as for sex, `age_class` is the label.
# NOTE: rebinds train_pool / val_pool from the sex section.
train_pool = Pool(
    data=train_df.drop(["viewer_uid", "age", "sex", "age_class"], axis=1),
    label=train_df["age_class"],
)

val_pool = Pool(
    data=val_df.drop(["viewer_uid", "age", "sex", "age_class"], axis=1),
    label=val_df["age_class"],
)

In [26]:
# CatBoost settings for the age-class classifier; unlike the sex model, max_depth is left at its default.
params = {
    "task_type": "CPU",
    "eval_metric": "TotalF1",
    "learning_rate": 0.15,
}

In [27]:
%%time
# As for the sex model, val_pool serves both for best-iteration selection and for scoring.
model_cb_age_class = CatBoostClassifier(**params, random_seed=56)
model_cb_age_class.fit(train_pool, eval_set=val_pool, plot=False, verbose=50, use_best_model=True)

0:	learn: 0.4213615	test: 0.4178992	best: 0.4178992 (0)	total: 129ms	remaining: 2m 9s
50:	learn: 0.4800021	test: 0.4742008	best: 0.4742008 (50)	total: 5.91s	remaining: 1m 50s
100:	learn: 0.4906421	test: 0.4792886	best: 0.4796181 (99)	total: 11.3s	remaining: 1m 40s
150:	learn: 0.4987955	test: 0.4806374	best: 0.4809906 (138)	total: 16.8s	remaining: 1m 34s
200:	learn: 0.5049951	test: 0.4813375	best: 0.4817012 (187)	total: 22.1s	remaining: 1m 27s
250:	learn: 0.5115229	test: 0.4820314	best: 0.4820314 (250)	total: 27.4s	remaining: 1m 21s
300:	learn: 0.5165858	test: 0.4826162	best: 0.4826162 (300)	total: 32.8s	remaining: 1m 16s
350:	learn: 0.5232915	test: 0.4822315	best: 0.4831528 (336)	total: 38.3s	remaining: 1m 10s
400:	learn: 0.5281395	test: 0.4827006	best: 0.4831531 (391)	total: 43.7s	remaining: 1m 5s
450:	learn: 0.5331644	test: 0.4822294	best: 0.4832056 (408)	total: 49.4s	remaining: 1m
500:	learn: 0.5373893	test: 0.4826230	best: 0.4832056 (408)	total: 54.9s	remaining: 54.7s
550:	learn: 0

<catboost.core.CatBoostClassifier at 0x184af0933a0>

0.470773379195693

0.483205641475747 (с target_encoding)

In [28]:
model_cb_age_class.get_feature_importance(prettified=True).head(20)

Unnamed: 0,Feature Id,Importances
0,rutube_video_id_age_1_mean,7.677689
1,rutube_video_id_age_mean,4.611157
2,rutube_video_id_age_0_mean,4.184959
3,rutube_video_id_age_3_mean,3.156971
4,author_id_age_1_mean,3.033306
5,author_id_age_0_mean,2.83899
6,watchtime_ratio_max,2.807829
7,rutube_video_id_age_2_mean,2.44715
8,author_id_age_3_mean,2.278192
9,duration_min,2.191599


In [29]:
# Store the fold-1 age-class probabilities together with the matching true labels.
age_class_preds.append(model_cb_age_class.predict_proba(val_pool))
age_class_labels.append(val_df["age_class"])

## On other parts

обучим модели на других частях выборки, чтобы потом усреднить

In [30]:
# Fold 2: train on p2 + val, validate on p1.
train_df = pl.concat([p2_target, val_target]).join(features, on="viewer_uid", how="left").to_pandas()
val_df = p1_target.join(features, on="viewer_uid", how="left").to_pandas()

In [31]:
# Fold 2 sex model: same pools, parameters and seed as in fold 1, only the data differ.
train_pool = Pool(
    data=train_df.drop(["viewer_uid", "age", "sex", "age_class"], axis=1),
    label=train_df["sex"],
)

val_pool = Pool(
    data=val_df.drop(["viewer_uid", "age", "sex", "age_class"], axis=1),
    label=val_df["sex"],
)

params = {
    "task_type": "CPU",
    "eval_metric": "Accuracy",
    "learning_rate": 0.1,
    "max_depth": 7,
}

# "p1" in the name refers to the part used for validation
p1_model_cb_sex = CatBoostClassifier(**params, random_seed=56)
p1_model_cb_sex.fit(train_pool, eval_set=val_pool, plot=False, verbose=False, use_best_model=True)

<catboost.core.CatBoostClassifier at 0x184af093e50>

In [32]:
# Store the fold-2 (validated on p1) sex probabilities and labels.
sex_preds.append(p1_model_cb_sex.predict_proba(val_pool))
sex_labels.append(val_df["sex"])

In [33]:
# Fold 2 age-class model: same setup as in fold 1, only the data differ.
train_pool = Pool(
    data=train_df.drop(["viewer_uid", "age", "sex", "age_class"], axis=1),
    label=train_df["age_class"],
)

val_pool = Pool(
    data=val_df.drop(["viewer_uid", "age", "sex", "age_class"], axis=1),
    label=val_df["age_class"],
)

params = {
    "task_type": "CPU",
    "eval_metric": "TotalF1",
    "learning_rate": 0.15,
}

p1_model_cb_age_class = CatBoostClassifier(**params, random_seed=56)
p1_model_cb_age_class.fit(train_pool, eval_set=val_pool, plot=False, verbose=False, use_best_model=True)

<catboost.core.CatBoostClassifier at 0x184af0a6be0>

In [34]:
# Store the fold-2 (validated on p1) age-class probabilities and labels.
age_class_preds.append(p1_model_cb_age_class.predict_proba(val_pool))
age_class_labels.append(val_df["age_class"])

In [35]:
# Fold 3: train on p1 + val, validate on p2.
train_df = pl.concat([p1_target, val_target]).join(features, on="viewer_uid", how="left").to_pandas()
val_df = p2_target.join(features, on="viewer_uid", how="left").to_pandas()

In [36]:
# Fold 3 sex model: same pools, parameters and seed as before, only the data differ.
train_pool = Pool(
    data=train_df.drop(["viewer_uid", "age", "sex", "age_class"], axis=1),
    label=train_df["sex"],
)

val_pool = Pool(
    data=val_df.drop(["viewer_uid", "age", "sex", "age_class"], axis=1),
    label=val_df["sex"],
)

params = {
    "task_type": "CPU",
    "eval_metric": "Accuracy",
    "learning_rate": 0.1,
    "max_depth": 7,
}

# "p2" in the name refers to the part used for validation
p2_model_cb_sex = CatBoostClassifier(**params, random_seed=56)
p2_model_cb_sex.fit(train_pool, eval_set=val_pool, plot=False, verbose=False, use_best_model=True)

<catboost.core.CatBoostClassifier at 0x184aeb3d370>

In [37]:
# Store the fold-3 (validated on p2) sex probabilities and labels.
sex_preds.append(p2_model_cb_sex.predict_proba(val_pool))
sex_labels.append(val_df["sex"])

In [38]:
# Fold 3 age-class model: same setup as before, only the data differ.
train_pool = Pool(
    data=train_df.drop(["viewer_uid", "age", "sex", "age_class"], axis=1),
    label=train_df["age_class"],
)

val_pool = Pool(
    data=val_df.drop(["viewer_uid", "age", "sex", "age_class"], axis=1),
    label=val_df["age_class"],
)

params = {
    "task_type": "CPU",
    "eval_metric": "TotalF1",
    "learning_rate": 0.15,
}

p2_model_cb_age_class = CatBoostClassifier(**params, random_seed=56)
p2_model_cb_age_class.fit(train_pool, eval_set=val_pool, plot=False, verbose=False, use_best_model=True)

<catboost.core.CatBoostClassifier at 0x184af0cc370>

In [39]:
# Store the fold-3 (validated on p2) age-class probabilities and labels.
age_class_preds.append(p2_model_cb_age_class.predict_proba(val_pool))
age_class_labels.append(val_df["age_class"])

## Scores

In [40]:
# Stack the per-fold results into single arrays covering all three validation parts.
# NOTE: the names now refer to arrays instead of lists, so neither this cell nor the
# append cells above should be re-run without resetting the accumulators first.
sex_preds = np.concatenate(sex_preds)
sex_labels = np.concatenate(sex_labels)

In [41]:
# Share of viewers whose most probable predicted class equals the true sex label.
sex_pred_classes = np.argmax(sex_preds, axis=1)
accuracy = np.mean(sex_labels == sex_pred_classes)

In [42]:
# Stack the per-fold age-class results; the same re-run caveat as for the sex arrays applies.
age_class_preds = np.concatenate(age_class_preds)
age_class_labels = np.concatenate(age_class_labels)

In [43]:
# Weighted F1 of the most probable age class over all validation parts.
age_pred_classes = age_class_preds.argmax(axis=1)
f1_weighted = f1_score(y_true=age_class_labels, y_pred=age_pred_classes, average="weighted")

In [44]:
# Blend the two metrics: weight 0.7 for the age-class F1 and 0.3 for the sex accuracy.
f1_part, accuracy_part = 0.7 * f1_weighted, 0.3 * accuracy
final_score = f1_part + accuracy_part
print(f'Weighted F1 = {f1_weighted:.4f} \nAccuracy = {accuracy:.4f} \nFinal Score = {final_score:.4f}')

Weighted F1 = 0.4828 
Accuracy = 0.7619 
Final Score = 0.5665


подберем веса для максимизации Weighted F1

In [89]:
# Grid search over per-class multipliers applied to the predicted probabilities before argmax.
# w1 scales the first class and w4 the last one; the two middle classes share the
# remainder equally (w23 each), so the four weights always sum to 100.
# NOTE: the weights are tuned on the same predictions they are scored on,
# so the best score found here is an optimistic estimate.
scores = []
for w1 in tqdm(range(1, 100, 1)):
    for w4 in range(1, 100 - w1, 1):
        w23 = (100 - w1 - w4) / 2
        scores.append((
            f1_score(age_class_labels, (age_class_preds * np.array([w1, w23, w23, w4])).argmax(axis=1), average="weighted"),
            w1, w23, w23, w4
        ))

  0%|          | 0/99 [00:00<?, ?it/s]

In [None]:
# Show only the ten best weight combinations instead of dumping every scored tuple.
sorted(scores, reverse=True)[:10] # 0.4842574587087023

# Inference

In [47]:
# Only the viewer ids are read from the submission file; they define who to predict for.
test_target = pl.read_csv("./data/subm.csv", columns=["viewer_uid"])
test_target.head()

viewer_uid
i64
14416
5190
8887
55417
8980


In [48]:
# Same preparation as for the training events: add the local timestamp and the video metadata.
test_events = pl.read_csv("./data/test_events.csv", try_parse_dates=True)
test_events = (
    test_events
    # left join: events whose region is missing from time_diffs are kept (with a null diff)
    .join(time_diffs, on="region", how="left")
    .with_columns(
        # shift the event time by `diff` hours
        local_timestamp=pl.col("event_timestamp").dt.offset_by(pl.format("{}h", pl.col("diff")))
    )
    .drop("diff")
)
test_events = test_events.join(video_info.drop("title"), on="rutube_video_id", how="left")
print(test_events.shape)
test_events.head()

(587735, 13)


event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,total_watchtime,rutube_video_id,viewer_uid,local_timestamp,category,duration,author_id
"datetime[μs, UTC]",str,str,str,str,str,i64,str,i64,"datetime[μs, UTC]",str,f64,i64
2024-06-01 12:26:44 UTC,"""Udmurtiya Repu…","""desktop""","""browser""","""Windows""","""Yandex Browser…",2593,"""video_212730""",22206,2024-06-01 16:26:44 UTC,"""Сериалы""",2610.785,1089828
2024-06-01 09:25:29 UTC,"""Adygeya Republ…","""smartphone""","""mobile app""","""Android""","""Rutube""",960,"""video_235114""",34531,2024-06-01 12:25:29 UTC,"""Разное""",1080.32,1009406
2024-06-01 14:23:12 UTC,"""Astrakhan Obla…","""smartphone""","""mobile app""","""Android""","""Rutube""",4695,"""video_26520""",25830,2024-06-01 17:23:12 UTC,"""Фильмы""",5208.416,1090779
2024-06-01 12:37:37 UTC,"""Khakasiya Repu…","""smartphone""","""browser""","""Android""","""Chrome Mobile""",2490,"""video_465561""",14838,2024-06-01 19:37:37 UTC,"""Хобби""",2556.011,1017105
2024-06-01 18:30:11 UTC,"""Moscow""","""smartphone""","""mobile app""","""Android""","""Rutube""",1117,"""video_102934""",13718,2024-06-01 21:30:11 UTC,"""Телепередачи""",8061.64,1009210


для test можно посчитать значения на всей выборке

In [49]:
# Encode by author using the labels of all training users; train and test events are
# concatenated so that one frame holds both the labelled events and the test viewers' events.
test_add = target_encoding(pl.concat([events, test_events]), test_target, target, "author_id")

In [50]:
# Attach the author encodings; test viewers absent from the encoding frame get -1.
test_target = test_target.join(test_add, on="viewer_uid", how="left").fill_null(-1)

In [51]:
# Same as for authors, now encoding by video.
test_add = target_encoding(pl.concat([events, test_events]), test_target, target, "rutube_video_id")

In [52]:
# Attach the video encodings; test viewers absent from the encoding frame get -1.
test_target = test_target.join(test_add, on="viewer_uid", how="left").fill_null(-1)

In [53]:
test_target.head()

viewer_uid,author_id_enc_count,author_id_sex_mean_v1,author_id_age_mean,author_id_age_0_mean,author_id_age_1_mean,author_id_age_2_mean,author_id_age_3_mean,author_id_sex_mean_v2,author_id_sex_mean_v3,rutube_video_id_enc_count,rutube_video_id_sex_mean_v1,rutube_video_id_age_mean,rutube_video_id_age_0_mean,rutube_video_id_age_1_mean,rutube_video_id_age_2_mean,rutube_video_id_age_3_mean,rutube_video_id_sex_mean_v2,rutube_video_id_sex_mean_v3
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
14416,94029.688679,0.128869,29.473541,0.052151,0.575404,0.292952,0.079492,0.156139,0.171527,2286.752381,0.126788,29.350403,0.054199,0.571711,0.301161,0.072929,0.163771,0.171362
5190,231808.0,0.10963,32.094009,0.030879,0.418799,0.410555,0.139767,0.10963,0.10963,4888.0,0.103314,32.156915,0.02455,0.42635,0.414689,0.134411,0.103314,0.103314
8887,99310.0,0.171111,34.136557,0.025814,0.326305,0.429428,0.218453,0.135273,0.136265,2157.333333,0.130463,34.17348,0.01401,0.3206,0.461311,0.204079,0.152812,0.167424
55417,5431.333333,0.701369,30.584108,0.050453,0.453114,0.440708,0.055725,0.701178,0.701798,398.0,0.690092,30.646442,0.045321,0.456722,0.451003,0.046955,0.690117,0.693389
8980,391666.0,0.194666,31.948175,0.041783,0.420448,0.394027,0.143742,0.194666,0.194666,8032.6875,0.179933,31.679623,0.034651,0.43858,0.401276,0.125493,0.177307,0.180449


In [56]:
# Build the feature table for the test events with make_features (defined
# elsewhere in the notebook) and preview it. The result is joined to
# test_target on viewer_uid further down.
test_features = make_features(test_events)
test_features.head()

viewer_uid,event_cnt,event_uniq_days,event_first_day,event_last_day,watchtime_sum,watchtime_min,watchtime_max,duration_sum,duration_min,duration_max,watchtime_ratio_mean,watchtime_ratio_min,watchtime_ratio_max,region_uniq,device_uniq,client_type_uniq,os_uniq,client_name_uniq,video_uniq,categ_uniq,author_uniq,event_cnt_per_video,watchtime_per_video,duration_per_video,event_len_period,event_cnt_per_day,event_density,ua_device_type:desktop-event_cnt,ua_device_type:smartphone-event_cnt,ua_device_type:tablet-event_cnt,ua_client_type:av-event_cnt,ua_client_type:browser-event_cnt,ua_client_type:mobile app-event_cnt,ua_os:Android-event_cnt,ua_os:BlackBerry OS-event_cnt,ua_os:BlackBerry Tablet OS-event_cnt,…,author_id:1071736-event_cnt,author_id:1084744-event_cnt,author_id:1089828-event_cnt,author_id:1090779-event_cnt,author_id:1090867-event_cnt,author_id:other-event_cnt,local_hour:0-event_cnt,local_hour:1-event_cnt,local_hour:10-event_cnt,local_hour:11-event_cnt,local_hour:12-event_cnt,local_hour:13-event_cnt,local_hour:14-event_cnt,local_hour:15-event_cnt,local_hour:16-event_cnt,local_hour:17-event_cnt,local_hour:18-event_cnt,local_hour:19-event_cnt,local_hour:2-event_cnt,local_hour:20-event_cnt,local_hour:21-event_cnt,local_hour:22-event_cnt,local_hour:23-event_cnt,local_hour:3-event_cnt,local_hour:4-event_cnt,local_hour:5-event_cnt,local_hour:6-event_cnt,local_hour:7-event_cnt,local_hour:8-event_cnt,local_hour:9-event_cnt,local_weekday:1-event_cnt,local_weekday:2-event_cnt,local_weekday:3-event_cnt,local_weekday:4-event_cnt,local_weekday:5-event_cnt,local_weekday:6-event_cnt,local_weekday:7-event_cnt
i64,u32,u32,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,u32,u32,u32,u32,u32,u32,u32,u32,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
3984,1,1,45,45,3545,3545,3545,5642.24,5642.24,5642.24,0.628297,0.628297,0.628297,1,1,1,1,1,1,1,1,1.0,3545.0,5642.24,1,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
19648,11,7,32,55,35764,342,6375,53818.937,3444.345,6780.76,0.632379,0.09493,0.96132,1,1,1,1,1,10,2,2,1.1,3576.4,5381.8937,24,1.571429,0.458333,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.090909,0.090909,0.090909,0.0,0.181818,0.181818,0.090909,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.363636,0.272727,0.0,0.090909,0.0,0.0,0.272727
12192,6,4,32,54,19194,311,6176,20267.116,331.66,5917.0,0.95625,0.598276,1.133078,3,1,1,1,1,6,3,3,1.0,3199.0,3377.852667,23,1.5,0.26087,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.166667,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.333333,0.166667
4640,99,19,30,60,208040,31,15355,336732.117,279.446,8216.76,0.662394,0.007225,2.506869,3,1,1,1,1,93,4,13,1.064516,2236.989247,3620.775452,31,5.210526,3.193548,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.242424,0.040404,0.060606,0.020202,0.020202,0.050505,0.050505,0.060606,0.090909,0.080808,0.020202,0.030303,0.0,0.060606,0.020202,0.050505,0.020202,0.060606,0.070707,0.080808,0.040404,0.020202,0.010101,0.010101,0.030303,0.060606,0.10101,0.121212,0.151515,0.010101,0.262626,0.292929
44400,17,13,40,59,29102,111,3932,73700.401,987.0,19258.368,0.654495,0.015978,0.998381,3,1,1,1,1,15,7,3,1.133333,1940.133333,4913.360067,20,1.307692,0.85,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.117647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.058824,0.176471,0.0,0.0,0.294118,0.294118,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.176471,0.117647,0.176471,0.117647,0.117647,0.294118,0.0


In [57]:
# Assemble the model input: the left join keeps every row of test_target and
# appends the feature columns by viewer_uid; the result is converted to pandas.
test_df = (
    test_target
    .join(test_features, on="viewer_uid", how="left")
    .to_pandas()
)
test_df

Unnamed: 0,viewer_uid,author_id_enc_count,author_id_sex_mean_v1,author_id_age_mean,author_id_age_0_mean,author_id_age_1_mean,author_id_age_2_mean,author_id_age_3_mean,author_id_sex_mean_v2,author_id_sex_mean_v3,...,local_hour:7-event_cnt,local_hour:8-event_cnt,local_hour:9-event_cnt,local_weekday:1-event_cnt,local_weekday:2-event_cnt,local_weekday:3-event_cnt,local_weekday:4-event_cnt,local_weekday:5-event_cnt,local_weekday:6-event_cnt,local_weekday:7-event_cnt
0,14416,94029.688679,0.128869,29.473541,0.052151,0.575404,0.292952,0.079492,0.156139,0.171527,...,0.0000,0.018868,0.000000,0.188679,0.094340,0.122642,0.084906,0.103774,0.179245,0.226415
1,5190,231808.000000,0.109630,32.094009,0.030879,0.418799,0.410555,0.139767,0.109630,0.109630,...,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000
2,8887,99310.000000,0.171111,34.136557,0.025814,0.326305,0.429428,0.218453,0.135273,0.136265,...,0.0000,0.000000,0.333333,0.000000,0.333333,0.000000,0.000000,0.000000,0.000000,0.666667
3,55417,5431.333333,0.701369,30.584108,0.050453,0.453114,0.440708,0.055725,0.701178,0.701798,...,0.0000,0.000000,0.333333,0.000000,0.000000,0.333333,0.333333,0.333333,0.000000,0.000000
4,8980,391666.000000,0.194666,31.948175,0.041783,0.420448,0.394027,0.143742,0.194666,0.194666,...,0.0625,0.062500,0.062500,0.125000,0.125000,0.062500,0.250000,0.125000,0.250000,0.062500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59999,16343,145785.571429,0.287807,32.555126,0.034760,0.400747,0.400312,0.164182,0.230023,0.228060,...,0.0000,0.000000,0.071429,0.428571,0.071429,0.000000,0.214286,0.214286,0.000000,0.071429
60000,47183,1808.666667,0.495899,38.038493,0.026791,0.225532,0.326839,0.420838,0.503133,0.489802,...,0.0000,0.000000,0.000000,0.333333,0.000000,0.666667,0.000000,0.000000,0.000000,0.000000
60001,23370,231808.000000,0.109630,32.094009,0.030879,0.418799,0.410555,0.139767,0.109630,0.109630,...,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000
60002,12750,391666.000000,0.194666,31.948175,0.041783,0.420448,0.394027,0.143742,0.194666,0.194666,...,0.0000,0.125000,0.125000,0.000000,0.250000,0.500000,0.187500,0.000000,0.000000,0.062500


In [75]:
# viewer_uid is dropped so that it is not passed to the models as a feature.
test_pool = Pool(data=test_df.drop(columns=["viewer_uid"]))

In [76]:
# Blend the three sex models: unweighted mean of their predict_proba outputs.
test_sex_preds = sum(
    clf.predict_proba(test_pool)
    for clf in (model_cb_sex, p1_model_cb_sex, p2_model_cb_sex)
) / 3.

In [77]:
# Blend the three age-class models: unweighted mean of their predict_proba outputs.
test_age_class_preds = sum(
    clf.predict_proba(test_pool)
    for clf in (model_cb_age_class, p1_model_cb_age_class, p2_model_cb_age_class)
) / 3.

# Submission

In [91]:
import pandas as pd

In [92]:
# Load the submission file; its "sex" and "age_class" columns are overwritten
# with model predictions in the cells that follow.
subm = pd.read_csv("./data/subm.csv")
subm

Unnamed: 0,viewer_uid,age,sex,age_class
0,14416,39,female,0
1,5190,12,male,1
2,8887,23,male,0
3,55417,18,female,3
4,8980,48,female,3
...,...,...,...,...
59999,16343,44,female,0
60000,47183,26,female,3
60001,23370,58,male,0
60002,12750,22,male,1


In [93]:
# The predictions carry no viewer id: row i of test_sex_preds belongs to row i
# of test_df. Writing them into subm is therefore only valid when subm lists
# the same viewers in the same order, so check that before touching the column.
assert np.array_equal(
    subm["viewer_uid"].to_numpy(), test_df["viewer_uid"].to_numpy()
), "subm and test_df are not aligned on viewer_uid; predictions would be misassigned"

# Most probable class per row, computed once and reused for both labels.
# NOTE(review): assumes class index 1 is "male" and 0 is "female" — confirm
# against the label encoding used when the sex models were trained.
test_sex_class_idx = test_sex_preds.argmax(axis=1)
subm.loc[test_sex_class_idx == 1, "sex"] = "male"
subm.loc[test_sex_class_idx == 0, "sex"] = "female"

In [94]:
# Scale each class probability by a fixed per-class multiplier (43 for class 0,
# 19 for the other three) and take the highest-scoring class per row.
# NOTE(review): the origin of these multipliers is not shown here — confirm.
subm.loc[:, "age_class"] = np.argmax(
    test_age_class_preds * np.array([43, 19, 19, 19]),
    axis=1,
)

In [95]:
# Write the updated submission to disk; index=False keeps the pandas row index
# out of the CSV.
subm.to_csv("./base_submission_w.csv", index=False)