# Setup

In [None]:
# Run configuration: module-level constants read by the cells below.
VERSION = 'pspfgp-47-xgb'  # names the ../models/<VERSION> directory and the collections pickle
DEV = False  # when True, keep only the first 1000 'comp' sessions
MODEL_TYPE = 'XGB'  # XGB, LGB, CATBOOST
DATA = 'ALL'  # COMP, COMPLETE_SESSIONS, ALL
BUILD = True  # when True, rebuild and re-pickle the per-level-group collections
N_BAGS = 10  # number of fold partitions (bags) built in the Folds section
N_FOLDS = 5  # folds per bag
N_SEEDS = 1  # models trained per (question, bag, fold)
EXPIT = True  # when True, blend predictions in logit space and map back with expit
SEED = 0  # seeds PYTHONHASHSEED, random and numpy
GPU = 1  # written to CUDA_VISIBLE_DEVICES
N_THREADS = 8  # written to POLARS_MAX_THREADS
VERBOSE = True  # NOTE(review): not referenced by any visible cell
LEVEL_GROUPS = ['0-4', '5-12', '13-22']  # level ranges, each engineered as its own feature group

In [None]:
from IPython.display import display, HTML
# Inject CSS so rendered table cells are never line-wrapped.
display(HTML('<style>td{white-space: nowrap !important;}</style>'))

In [None]:
import os
# Exported ahead of the library imports in the following cells.
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU)
os.environ['POLARS_MAX_THREADS'] = str(N_THREADS)

In [None]:
import warnings
# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
import gc
import itertools
import pickle
import re
import time

In [None]:
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import polars as pl
import sklearn.neighbors, sklearn.metrics, sklearn.preprocessing
from tqdm.notebook import tqdm
from xgboost import XGBClassifier

In [None]:
from IPython.display import clear_output

In [None]:
if EXPIT:
    from scipy.special import expit, logit

In [None]:
import random

# Seed the stdlib and numpy global generators for reproducibility.
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
# only affects processes launched afterwards, not the running kernel.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Data

## Dataset

In [None]:
# Extra (test) events and labels; they are appended to the main data below.
x_test = pd.read_parquet('../data/processed/x_test.parquet')
y_test = pd.read_parquet('../data/processed/y_test.parquet')

In [None]:
%%time
# Stack the main event log on top of the test events into a single frame.
X = pd.concat([pd.read_parquet('../data/processed/x.parquet'), x_test]).reset_index(drop=True)

In [None]:
# Order events by session and then by their 'index' column; the duration
# features computed later use shift() and therefore depend on this order.
X = X.sort_values(['session_id', 'index']).reset_index(drop=True)

In [None]:
# Build Y: one row per session with one column per question (q1..q18).
train_labels = pd.read_parquet('../data/processed/y.parquet')

# The raw session_id is '<session>_<question>'; split it into its two parts.
train_labels['question'] = train_labels.session_id.apply(lambda x: x.split('_')[1]).values
train_labels['session_id'] = train_labels.session_id.apply(lambda x: x.split('_')[0]).values
train_labels['session_id'] = train_labels['session_id'].astype(int)
train_labels['correct'] = train_labels.correct.astype(np.int8).values
group = ['session_id', 'question']
# Long -> wide; duplicate (session, question) pairs collapse to their max.
Y = pd.pivot_table(train_labels.groupby(group)['correct'].max().reset_index(), 
                   index='session_id', columns='question', 
                   values='correct', aggfunc='max').reset_index()
Y.columns.name = None
# Fix the column order to session_id, q1, ..., q18.
Y = Y[['session_id'] + [f'q{i + 1}' for i in range(18)]]
Y = pd.concat([Y, y_test])
Y = Y.reset_index(drop=True)
# Reorder the label rows to follow the session order of X (raises KeyError if
# a session in X has no label row).
Y = Y.set_index('session_id').loc[X['session_id'].drop_duplicates()].reset_index()

In [None]:
# Development mode: shrink the data to the first 1000 'comp' sessions.
if DEV:
    session_ids = X[X['data_source'] == 'comp']['session_id'].drop_duplicates()[:1000]
    X = X[X['session_id'].isin(session_ids)].reset_index(drop=True)
    Y = Y[Y['session_id'].isin(session_ids)].reset_index(drop=True)

### Concat event_name & name

In [None]:
# Composite key '<event_name>_<name>' used by the per-event aggregations.
X['event_name_name'] = X['event_name'] + '_' + X['name']

### Prepare

In [None]:
# Keep only events from levels 0..22 (the ranges covered by LEVEL_GROUPS).
X = X[X['level'] < 23]

In [None]:
%%time
# Sessions with a q18 label (presumably -1 marks a missing answer - TODO confirm).
complete_session_ids = Y[Y['q18'] != -1]['session_id']
if DATA == 'COMP':
    # Hold out the complete non-'comp' sessions as a test set and train on
    # 'comp' sessions only.
    x_test = X[(X['session_id'].isin(complete_session_ids)) & 
               (X['data_source'] != 'comp')].reset_index(drop=True)
    y_test = Y[Y['session_id'].isin(x_test['session_id'])].reset_index(drop=True)
    X = X[X['data_source'] == 'comp'].reset_index(drop=True)
    Y = Y[Y['session_id'].isin(X['session_id'])].reset_index(drop=True)
elif DATA == 'COMPLETE_SESSIONS':
    # Train on every session that has a q18 label, whatever its source.
    X = X[X['session_id'].isin(complete_session_ids)].reset_index(drop=True)
    Y = Y[Y['session_id'].isin(complete_session_ids)].reset_index(drop=True)
# DATA == 'ALL' falls through: X and Y are used unchanged.

## Folds

In [None]:
# Sorted unique ids of the 'comp' sessions; only these are split into folds.
COMP_SESSION_IDS = sorted(list(set(X[X['data_source'] == 'comp']['session_id'])))

In [None]:
# FOLDS[bag][fold] is the list of validation session ids for that fold.
# Bag 0 is a deterministic interleaved split of the sorted ids.
FOLDS = [[COMP_SESSION_IDS[i::N_FOLDS] for i in range(N_FOLDS)]]
if N_BAGS > 1:
    # Remaining bags assign each session to a uniformly random fold, so fold
    # sizes are only approximately equal. Re-seeding makes them reproducible.
    np.random.seed(SEED)
    for _ in range(N_BAGS - 1):
        index = np.random.randint(0, N_FOLDS, len(COMP_SESSION_IDS))
        FOLDS.append([np.array(COMP_SESSION_IDS)[index == f].tolist() for f in range(N_FOLDS)])

## Collections

In [None]:
if BUILD:
    # Hand-picked fqid substrings used for the per-activity features of each
    # level group.
    activities = {
        '0-4': ['tunic', 'report', 'plaque'],
        '5-12': ['businesscards', 'logbook', 'reader', 'wellsbadge', 'journals'],
        '13-22': ['directory', 'reader_flag', 'journals_flag'],
    }
    # collections[level_group] maps a category name to the list of values for
    # which AggsBuilder emits one feature each.
    collections = dict()
    for level_group in tqdm(LEVEL_GROUPS):
        x = X[X['level_group'] == level_group]
        event_names = ['cutscene_click', 'map_click', 'map_hover', 'navigate_click', 
                       'notebook_click', 'notification_click', 'object_click', 
                       'object_hover', 'observation_click', 'person_click']
        # Keep only fqids that account for more than 0.1% of the group's events.
        tmp = x['fqid'].dropna().value_counts()
        fqids = tmp[tmp > len(x) * 0.001].index.tolist()
        names = x['name'].dropna().drop_duplicates().tolist()
        # NOTE(review): this reads the full frame `X`, not the per-group `x`
        # used by every other collection - confirm that is intentional.
        event_name_names = X['event_name_name'].value_counts().index.tolist()
        room_fqids = x['room_fqid'].dropna().drop_duplicates().tolist()
        texts = x['text'].dropna().drop_duplicates().tolist()
        text_fqids = x['text_fqid'].dropna().drop_duplicates().tolist()
        pages = x['page'].dropna().drop_duplicates().tolist()
        
        collections[level_group] = {
            'event_names': event_names,
            'fqids': fqids,
            'names': names,
            'event_name_names': event_name_names,
            'room_fqids': room_fqids,
            'texts': texts,
            'text_fqids': text_fqids,
            'activities': activities[level_group],
            'pages': pages,
        }
    # Context manager guarantees the pickle is flushed and closed before
    # AggsBuilder re-opens it.
    with open(f'../data/processed/collections_{VERSION}.pkl', 'wb') as collections_file:
        pickle.dump(collections, collections_file)

# Functions and classes

## Utils

In [None]:
def get_levels(level_group):
    """Expand a 'min-max' level-group label into the inclusive list of levels."""
    bounds = level_group.split('-')
    first_level = int(bounds[0])
    last_level = int(bounds[1])
    return list(range(first_level, last_level + 1))

In [None]:
def get_questions(level_group):
    """Return the question numbers answered at the end of `level_group`.

    Any label other than '0-4' or '5-12' yields questions 14..18.
    """
    if level_group == '0-4':
        return [1, 2, 3]
    if level_group == '5-12':
        return list(range(4, 14))
    return list(range(14, 19))

In [None]:
def get_level_group(question):
    """Map a question number to the level group it belongs to."""
    # Checked in ascending order: (upper bound exclusive, label).
    for upper_bound, label in ((4, '0-4'), (14, '5-12')):
        if question < upper_bound:
            return label
    return '13-22'

In [None]:
def clean_feature_name(feature_name):
    """Stringify `feature_name` and replace every character outside
    [A-Za-z0-9_] with an underscore."""
    unsafe_characters = re.compile('[^A-Za-z0-9_]')
    return unsafe_characters.sub('_', str(feature_name))

In [None]:
def get_features(question):
    """Return the feature-name list saved for question number `question`.

    Reads ../models/<VERSION>/features.pkl, a dict keyed by 'q<number>'.
    The file is re-read on every call.
    """
    # Context manager so the handle is closed deterministically.
    with open(f"../models/{VERSION}/features.pkl", "rb") as features_file:
        features = pickle.load(features_file)
    return features[f"q{question}"]

## Evaluation

In [None]:
def fast_f1_score(labels, preds):
    """Macro-averaged F1 of binary predictions, in closed form.

    With p / n the fractions of samples where prediction and label are both
    1 / both 0, the mean of the two per-class F1 scores simplifies to
    1 - (1 - n - p) / (1 - (n - p) ** 2).

    `labels` and `preds` must be 0/1 arrays of equal shape that support
    element-wise addition. The result is NaN when every sample is an agreeing
    member of a single class (the denominator is then zero).
    """
    votes = preds + labels
    both_positive = (votes == 2).mean()
    both_negative = (votes == 0).mean()
    mismatch = 1 - both_negative - both_positive
    imbalance = both_negative - both_positive
    return 1 - mismatch / (1 - imbalance ** 2)

In [None]:
def score_questions(preds, labels, questions, thr):
    """Binarize probabilities and score them with fast_f1_score.

    Parameters
    ----------
    preds : DataFrame of probabilities with a 'q<number>' column per question.
    labels : DataFrame of 0/1 targets with the same columns and row order.
    questions : iterable of question numbers (1-based) to score.
    thr : a scalar threshold applied to every question, or a sequence (list,
        tuple, ndarray, Series) indexed by question number - 1.

    Returns
    -------
    dict mapping 'q<number>' to its score rounded to 5 decimals, plus an
    'overall' score over all columns when more than one question is scored.
    """
    # np.ndim tells scalars from sequences for any array-like, not only
    # `list`. A tuple or ndarray used to be treated as one scalar and was
    # silently broadcast against the rows.
    thresholds = list(thr) if np.ndim(thr) > 0 else [thr] * 18
    scores = dict()
    preds_binarized = preds.copy()
    for question in questions:
        preds_binarized[f'q{question}'] = (preds[f'q{question}'].values > thresholds[question - 1]).astype('int')
        score = fast_f1_score(labels[f'q{question}'].values, preds_binarized[f'q{question}'].values)
        scores[f'q{question}'] = np.round(score, decimals=5)
    if len(questions) > 1:
        # The overall score stacks every column of both frames, so columns of
        # `preds` that are not in `questions` enter it un-binarized.
        score = fast_f1_score(labels.melt().drop('variable', axis=1).values, 
                              preds_binarized.melt().drop('variable', axis=1).values)
        scores['overall'] = np.round(score, decimals=5)
    return scores

In [None]:
def optimize_threshold(preds, labels, step_size=0.005):
    """Grid-search one global threshold that maximizes fast_f1_score.

    All columns of `preds` and `labels` are stacked into single vectors and
    candidates np.arange(0.5, 0.81, step_size) are tried in ascending order.
    The first candidate reaching the best score wins; 0 is returned when no
    candidate scores above 0.
    """
    stacked_labels = labels.melt().drop('variable', axis=1).values
    stacked_preds = preds.melt().drop('variable', axis=1).values
    winner, winner_score = 0, 0
    for candidate in np.arange(0.5, 0.81, step_size):
        candidate_score = fast_f1_score(stacked_labels, (stacked_preds > candidate).astype('int'))
        if candidate_score > winner_score:
            winner, winner_score = candidate, candidate_score
    return winner

In [None]:
def optimize_thresholds(preds, labels, threshold):
    """Tune one threshold per question by coordinate search.

    Every question starts binarized at the global `threshold`. For each
    question in turn, candidates np.arange(0.5, 0.8, 0.005) replace that
    question's threshold while the others stay at `threshold`, and the
    candidate with the best overall score is kept.

    Returns a list of 18 thresholds ordered q1..q18. A question for which no
    candidate scores above 0 falls back to `threshold`.
    """
    questions = range(1, 18 + 1)
    baseline = 1 * (preds >= threshold)
    thrs = []
    for q in questions:
        best_score = 0
        # Explicit fallback: previously `best_thr` was unbound (first
        # question) or leaked from the previous question in this case.
        best_thr = threshold
        for thr in np.arange(0.5, 0.8, 0.005):
            oof_preds = baseline.copy()
            oof_preds[f'q{q}'] = 1 * (preds[f'q{q}'] >= thr)
            # The frame is already 0/1; score_questions re-thresholds it with
            # `> thr`, which is a no-op for thr in [0.5, 0.8).
            score = score_questions(oof_preds, labels, questions, thr=thr)['overall']
            if score > best_score:
                best_score = score
                best_thr = thr
        thrs.append(best_thr)
    return thrs

In [None]:
def compute_score(preds, labels):
    """Find the best single threshold (rounded to 3 decimals) and score all
    18 questions with it.

    Returns (scores, threshold).
    """
    raw_threshold = optimize_threshold(preds, labels)
    threshold = np.round(raw_threshold, 3)
    all_questions = range(1, 18 + 1)
    return score_questions(preds, labels, questions=all_questions, thr=threshold), threshold

In [None]:
def evaluate_test(data, targets, thr, n_bags, n_folds, n_seeds, n_sets=2, verbose=True):
    """Score the saved ensemble on a held-out data set.

    Parameters
    ----------
    data : object indexed by 'q<number>' that returns an (x, y) pair (a Data
        instance in this notebook).
    targets : DataFrame of labels q1..q18 indexed by session_id.
    thr : scalar threshold or per-question thresholds for score_questions.
    n_bags, n_folds, n_seeds : ensemble dimensions; one saved model per
        (question, bag, fold, seed) is loaded and blended.
    n_sets, verbose : accepted for compatibility, currently unused.

    Returns
    -------
    (score dict, one-row DataFrame with question scores rounded to 3
    decimals, DataFrame of blended probabilities).
    """
    questions = range(1, 18 + 1)
    # Load exactly the ensemble described by the arguments. The module-level
    # N_BAGS / N_FOLDS / N_SEEDS were used here before, which broke any call
    # with different dimensions.
    models = get_models(list(questions), n_bags=n_bags, n_folds=n_folds, n_seeds=n_seeds)
    predictions = pd.DataFrame(index=targets.index)
    for q in questions:
        x, _ = data[f'q{q}']
        feats = get_features(q)
        # Identical for every model of this question, so build it once.
        inputs = x[feats].fillna(-999999).astype(np.float32).values
        preds = []
        for b in range(n_bags):
            for f in range(n_folds):
                for s in range(n_seeds):
                    k = f'q{q}b{b}f{f}s{s}'
                    pred = (models[k].predict_proba(inputs)[:, 1] if MODEL_TYPE != 'LGB' 
                            else models[k].predict(inputs))
                    preds.append(pred)
        # Blend in logit space when EXPIT is set, otherwise plain mean.
        preds = expit(np.mean(logit(preds), axis=0)) if EXPIT else np.mean(preds, axis=0)
        # Positional assignment: assumes the rows of `x` are in the same order
        # as `targets.index` - TODO confirm for new data sets.
        predictions[f'q{q}'] = preds
    score = score_questions(predictions, targets, questions=questions, thr=thr)
    df = pd.DataFrame([score])
    question_columns = [f'q{q}' for q in questions]
    df.loc[:, question_columns] = np.round(df.loc[:, question_columns], 3)
    return score, df, predictions

In [None]:
def get_importances(features, question, n_bags, n_folds, n_seeds):
    """Collect per-model feature importances for one question.

    Loads every saved (bag, fold, seed) model of `question`, stores each
    model's importance vector as a column next to `features`, and adds their
    row-wise mean as column 'q<question>'.

    Returns {'df': importance DataFrame, 'plot': closed matplotlib Figure with
    a horizontal bar chart of the mean importances, sorted ascending}.

    Raises ValueError when MODEL_TYPE is not one of XGB / LGB / CATBOOST.
    """
    if MODEL_TYPE not in ('XGB', 'LGB', 'CATBOOST'):
        # Fail early: without this no importance column is created and the
        # mean below dies with an unrelated KeyError.
        raise ValueError(f'Unsupported MODEL_TYPE: {MODEL_TYPE!r}')
    models = get_models([question], n_bags, n_folds, n_seeds)
    importances = pd.DataFrame()
    importances['feature'] = features
    questions = []
    for b in range(n_bags):
        for s in range(n_seeds):
            for f in range(n_folds):
                name = f'q{question}b{b}f{f}s{s}'
                questions.append(name)
                if MODEL_TYPE == 'XGB':
                    importances[name] = models[name].feature_importances_
                elif MODEL_TYPE == 'LGB':
                    importances[name] = models[name].feature_importance(importance_type='split')
                elif MODEL_TYPE == 'CATBOOST':
                    importances[name] = models[name].get_feature_importance()
    importances[f'q{question}'] = importances[questions].mean(axis=1)
    
    n_features = len(importances['feature'])
    # Height grows with the number of bars; clamp to 1 so fewer than four
    # features do not request a zero-height figure.
    importances_plot = plt.figure(figsize=(6, max(1, n_features // 4)))
    index = importances.sort_values(f'q{question}').index.tolist()
    plt.barh(importances['feature'][index][-n_features:], 
             importances[f'q{question}'][index][-n_features:], alpha=0.25)
    plt.margins(y=0)
    plt.box(False)
    # Close so the figure is not rendered inline; callers show it on demand.
    plt.close(importances_plot)
    return {'df': importances, 'plot': importances_plot}

In [None]:
def explain(features, questions, n_bags, n_folds, n_seeds):
    """Run get_importances for each question and key the results by 'q<number>'."""
    return {
        f'q{question}': get_importances(features, question, n_bags, n_folds, n_seeds)
        for question in questions
    }

## Model

In [None]:
def get_models(questions, n_bags, n_folds, n_seeds):
    """Load the saved models of the given questions from ../models/<VERSION>/.

    One model per (question, bag, fold, seed) is read, using the file format
    of the global MODEL_TYPE ('.xgb', pickled '.lgb' or '.cbm').

    Returns a dict keyed by 'q{q}b{b}f{f}s{s}'.

    Raises ValueError when MODEL_TYPE is not one of XGB / LGB / CATBOOST.
    """
    if MODEL_TYPE not in ('XGB', 'LGB', 'CATBOOST'):
        # Previously an unknown type hit an unbound `model` variable.
        raise ValueError(f'Unsupported MODEL_TYPE: {MODEL_TYPE!r}')
    models = dict()
    for q in questions:
        for b in range(n_bags):
            for f in range(n_folds):
                for s in range(n_seeds):
                    model_name = f'q{q}b{b}f{f}s{s}'
                    if MODEL_TYPE == 'XGB':
                        model = XGBClassifier(n_jobs=8)
                        model.load_model(f'../models/{VERSION}/{model_name}.xgb')
                    elif MODEL_TYPE == 'LGB':
                        # NOTE: pickle executes code on load; only read model
                        # files produced by this notebook.
                        with open(f'../models/{VERSION}/{model_name}.lgb', 'rb') as model_file:
                            model = pickle.load(model_file)
                    else:  # 'CATBOOST'
                        model = CatBoostClassifier()
                        model.load_model(f'../models/{VERSION}/{model_name}.cbm')
                    models[model_name] = model
    return models

In [None]:
def train_xgb_model(x_train, y_train, x_val, y_val, model_name, seed, verbose=0):
    """Fit an XGBoost classifier with early stopping and save it.

    Both the training and the validation set are passed as evaluation sets;
    progress is logged every 20 rounds when `verbose` >= 2. The fitted model
    is written to ../models/<VERSION>/<model_name>.xgb and returned.
    """
    model = XGBClassifier(
        booster='gbtree',
        objective='binary:logistic',
        eval_metric='logloss',
        learning_rate=0.02,
        max_depth=4,
        alpha=4,
        n_estimators=10000,
        early_stopping_rounds=100,
        tree_method='gpu_hist',
        subsample=0.8,
        colsample_bytree=0.2,
        use_label_encoder=False,
        n_jobs=8,
        seed=seed,
    )
    evaluation_sets = [(x_train, y_train), (x_val, y_val)]
    log_period = 20 if verbose >= 2 else 0
    model.fit(x_train, y_train, eval_set=evaluation_sets, verbose=log_period)
    model.save_model(f'../models/{VERSION}/{model_name}.xgb')
    return model

In [None]:
def train_lgb_model(x_train, y_train, x_val, y_val, model_name, seed, verbose=0):        
    """Fit a LightGBM booster with early stopping and pickle it.

    Up to 10000 rounds are trained, stopping after 100 rounds without
    improvement on the evaluation sets. Progress is logged every 20 rounds
    when `verbose` >= 2, matching train_xgb_model. The booster is pickled to
    ../models/<VERSION>/<model_name>.lgb and returned.
    """
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'learning_rate': 0.01,
        'num_leaves': 2 ** 4,
        'min_data_in_leaf': 50,
        'max_depth': 5,
        'colsample_bytree': 0.2,
        'linear_lambda': 1,
        'verbose': -1,
        'seed': seed,
        'n_jobs': 8,
    }
    
    lgb_train = lgb.Dataset(x_train, label=y_train)
    lgb_val = lgb.Dataset(x_val, label=y_val)

    early_stopping_callback = lgb.early_stopping(100, first_metric_only=False, verbose=False)
    # `verbose` used to be ignored; a period of 0 keeps logging off as before.
    verbose_callback = lgb.log_evaluation(20 if verbose >= 2 else 0)

    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_val],
        num_boost_round=10000,
        callbacks=[early_stopping_callback, verbose_callback]
    )

    # Context manager so the model file is flushed and closed before it is
    # read back by get_models.
    with open(f'../models/{VERSION}/{model_name}.lgb', 'wb') as model_file:
        pickle.dump(model, model_file)
    
    return model

In [None]:
def train_catboost_model(x_train, y_train, x_val, y_val, model_name, seed, verbose=0):
    """Fit a CatBoost classifier with early stopping and save it.

    Inputs are cast to float32 and wrapped in Pools; the validation pool is
    the evaluation set. The model is written to
    ../models/<VERSION>/<model_name>.cbm and returned. `verbose` is accepted
    for signature parity with the other trainers but is not used.
    """
    hyperparameters = dict(
        iterations=10000,
        early_stopping_rounds=100,
        depth=4,
        learning_rate=0.05,
        loss_function='Logloss',
        subsample=0.8,
        # NOTE(review): 0.0 differs from the 0.2 column sampling used by the
        # XGB / LGB trainers - confirm this value is intentional.
        colsample_bylevel=0.0,
        verbose=0,
        random_seed=seed,
        thread_count=12,
    )
    fit_pool = Pool(x_train.astype(np.float32), y_train)
    eval_pool = Pool(x_val.astype(np.float32), y_val)
    model = CatBoostClassifier(**hyperparameters).fit(fit_pool, eval_set=eval_pool)
    model.save_model(f'../models/{VERSION}/{model_name}.cbm')
    return model

In [None]:
def train_gbdt(x_train, y_train, x_val, y_val, model_name, model_type, seed, verbose=0):        
    """Train one model of `model_type` and return its validation predictions.

    `model_type` is 'XGB', 'LGB' or 'CATBOOST'; the matching trainer fits the
    model and saves it under `model_name`. The returned array holds the
    positive-class probabilities for `x_val`.

    Raises ValueError for any other `model_type`.
    """
    if model_type == 'XGB':
        model = train_xgb_model(x_train, y_train, x_val, y_val, model_name, seed, verbose)
    elif model_type == 'LGB':
        model = train_lgb_model(x_train, y_train, x_val, y_val, model_name, seed, verbose)
    elif model_type == 'CATBOOST':
        model = train_catboost_model(x_train, y_train, x_val, y_val, model_name, seed, verbose)
    else:
        # Previously fell through to an unbound `model` variable.
        raise ValueError(f'Unsupported model_type: {model_type!r}')
    # The LightGBM booster's predict() already returns probabilities; the
    # sklearn-style classifiers need predict_proba.
    oof_predictions = (model.predict_proba(x_val)[:, 1] if model_type != 'LGB'
                       else model.predict(x_val))
    return oof_predictions

## Data

In [None]:
class AggsBuilder:
    """Accumulates polars aggregation expressions for a per-session group-by.

    Each ``add_*`` method appends expressions for one level group and returns
    ``self`` so calls can be chained; ``collect()`` hands the accumulated list
    over and resets the builder. The values to build features for come from
    the pickled collections file written by the Collections section.
    """

    def __init__(self):
        # Context manager so the file handle is closed deterministically.
        with open(f'../data/processed/collections_{VERSION}.pkl', 'rb') as collections_file:
            self.collections = pickle.load(collections_file)
        self.aggs = []
        
    def doc(self):
        """List attribute names that contain no double underscore, excluding
        the two data attributes."""
        to_exclude = ['collections', 'aggs']
        return [el for el in dir(self) if '__' not in el and el not in to_exclude]
        
    def collect(self):
        """Return the accumulated expressions and reset the builder."""
        aggs = self.aggs
        self.aggs = []
        return aggs
    
    def clear(self):
        """Discard the accumulated expressions."""
        self.aggs = []
        
    def add(self, *aggregations):
        """Append one or more expressions to the accumulated list."""
        self.aggs.extend([*aggregations])
        
    def add_durations(self, level_group):
        """Add duration sums per room, fqid, text, activity, event type and
        level, plus the elapsed-time span of each activity."""
        fqids = self.collections[level_group]['fqids']
        room_fqids = self.collections[level_group]['room_fqids']
        event_name_names = self.collections[level_group]['event_name_names']
        texts = self.collections[level_group]['texts']
        levels = get_levels(level_group)
        activities = self.collections[level_group]['activities']
        
        self.add(pl.col('duration').drop_nulls().sum()
                 .alias(f'level_group_{level_group}_duration_sum'))
        
        self.add(*[pl.col('duration').filter(pl.col('room_fqid') == r).sum()
                   .alias(f'level_group_{level_group}_room_fqid_{r}_duration_sum') for r in room_fqids])
        
        self.add(*[pl.col('duration').filter(pl.col('fqid') == f).sum()
                   .alias(f'level_group_{level_group}_fqid_{f}_duration_sum') for f in fqids])
        
        self.add(*[pl.col('duration').filter(pl.col('text') == t).sum()
                   .alias(f'level_group_{level_group}_text_{t}_duration_sum') for t in texts])
        
        # Activities are matched as substrings of fqid, not by equality.
        self.add(*[pl.col('duration').filter(pl.col('fqid').str.contains(f)).sum()
                   .alias(f'level_group_{level_group}_activity_{f}_duration_sum') for f in activities])
        
        # NOTE(review): fqids are matched by substring here but by equality in
        # the other fqid features - confirm this is intended.
        self.add(*[pl.col('hover_duration').filter(pl.col('fqid').str.contains(f)).sum()
                   .alias(f'level_group_{level_group}_fqid_{f}_hover_duration_sum') for f in fqids])

        self.add(*[pl.col('duration').filter(pl.col('event_name_name') == e).sum()
                   .alias(f'level_group_{level_group}_event_name_name_{e}_duration_sum') 
                   for e in event_name_names])

        self.add(*[pl.col('duration').filter((pl.col('level') == l) & (pl.col('event_name_name') == e)).sum()
                   .alias(f'level_{l}_event_name_name_{e}_duration_sum') 
                   for e in event_name_names for l in levels])

        self.add(*[pl.col('duration').filter((pl.col('room_fqid') == r) & (pl.col('event_name_name') == e)).sum()
                   .alias(f'level_group_{level_group}_room_fqid_{r}_event_name_name_{e}_duration_sum') 
                   for e in event_name_names for r in room_fqids])
        
        # Span between the first and last event touching each activity.
        self.add(*[(pl.col('elapsed_time').filter(pl.col('fqid').str.contains(f)).max() - 
                    pl.col('elapsed_time').filter(pl.col('fqid').str.contains(f)).min())
                   .alias(f'level_group_{level_group}_activity_{f}_elapsed_time_diff') for f in activities])
        
        self.add(*[pl.col('prev_duration').filter(pl.col('fqid') == f).sum()
                   .alias(f'level_group_{level_group}_fqid_{f}_prev_duration_sum') for f in fqids])
        
        self.add(*[pl.col('prev_duration').filter(pl.col('text') == t).sum()
                   .alias(f'level_group_{level_group}_text_{t}_prev_duration_sum') for t in texts])
        
        self.add(*[pl.col('prev_duration').filter(pl.col('fqid').str.contains(f)).sum()
                   .alias(f'level_group_{level_group}_activity_{f}_prev_duration_sum') for f in activities])

        return self
    
    def add_counts(self, level_group):
        """Add event counts per room, activity, level, fqid, text_fqid, page
        and event type."""
        fqids = self.collections[level_group]['fqids']
        room_fqids = self.collections[level_group]['room_fqids']
        event_name_names = self.collections[level_group]['event_name_names']
        # Read from this level group's collection. The name was previously
        # undefined in this method and resolved to a leftover notebook global
        # from the last iteration of the collections loop.
        text_fqids = self.collections[level_group]['text_fqids']
        levels = get_levels(level_group)
        activities = self.collections[level_group]['activities']
        pages = self.collections[level_group]['pages']
        
        self.add(pl.col('index').count()
                 .alias(f'level_group_{level_group}_cnt'))
        
        self.add(*[pl.col('index').filter(pl.col('room_fqid') == r).count()
                   .alias(f'level_group_{level_group}_room_fqid_{r}_cnt') for r in room_fqids])
        
        self.add(*[pl.col('index').filter(pl.col('fqid').str.contains(f)).count()
                   .alias(f'level_group_{level_group}_activity_{f}_cnt') for f in activities])
        
        self.add(*[pl.col('index').filter(pl.col('level') == l).count()
                   .alias(f'level_{l}_cnt') for l in levels])
        
        self.add(*[pl.col('index').filter(pl.col('fqid') == f).count()
                   .alias(f'level_group_{level_group}_fqid_{f}_cnt') for f in fqids])
        
        self.add(*[pl.col('index').filter(pl.col('text_fqid') == t).count()
                   .alias(f'level_group_{level_group}_text_fqid_{t}_cnt') for t in text_fqids])
        
        self.add(*[pl.col('index').filter(pl.col('page') == p).count()
                   .alias(f'level_group_{level_group}_page_{p}_cnt') for p in pages])
        
        self.add(*[pl.col('index').filter(pl.col('event_name_name') == e).count()
                   .alias(f'level_group_{level_group}_event_name_name_{e}_cnt') for e in event_name_names])
        
        self.add(*[pl.col('index').filter((pl.col('event_name_name') == e) & (pl.col('level') == l)).count()
                   .alias(f'level_group_{level_group}_level_{l}_event_name_name_{e}_cnt') 
                   for e in event_name_names for l in levels])
        
        self.add(*[pl.col('index').filter((pl.col('event_name_name') == e) & (pl.col('room_fqid') == r)).count()
                   .alias(f'level_group_{level_group}_room_fqid_{r}_event_name_name_{e}_cnt') 
                   for e in event_name_names for r in room_fqids])
        
        return self
    
    def add_mouse(self, level_group):
        """Add mean and std of the click coordinates of 'basic' object clicks
        on each activity."""
        filter_condition = (pl.col('event_name') == 'object_click') & (pl.col('name') == 'basic')
        activities = self.collections[level_group]['activities']
        
        self.add(*[pl.col('room_coor_x').filter(filter_condition & pl.col('fqid').str.contains(f)).mean()
                   .alias(f'level_group_{level_group}_{f}_room_coor_x_mean') for f in activities])
        self.add(*[pl.col('room_coor_x').filter(filter_condition & pl.col('fqid').str.contains(f)).std()
                   .alias(f'level_group_{level_group}_{f}_room_coor_x_std') for f in activities])
        # The y features read room_coor_y; they previously duplicated the x
        # statistics because of a copy-paste slip.
        self.add(*[pl.col('room_coor_y').filter(filter_condition & pl.col('fqid').str.contains(f)).mean()
                   .alias(f'level_group_{level_group}_{f}_room_coor_y_mean') for f in activities])
        self.add(*[pl.col('room_coor_y').filter(filter_condition & pl.col('fqid').str.contains(f)).std()
                   .alias(f'level_group_{level_group}_{f}_room_coor_y_std') for f in activities])
        
        return self
        
    def add_notebook(self, level_group):
        """Add, per level, the summed duration of notebook clicks other than
        'close'."""
        self.add(*[pl.col('duration').filter((pl.col('level') == l) & 
                                             ((pl.col('event_name') == 'notebook_click') & 
                                              (pl.col('name') != 'close'))).sum()
                   .alias(f'level_{l}_notebook_duration_sum') for l in get_levels(level_group)])
        return self

    def add_globals(self, level_group):
        """Placeholder: adds nothing and returns self."""
        return self

In [None]:
class Data:
    """Per-session feature matrices, engineered once per level group.

    ``Data(x, y)`` aggregates the event frame `x` into one row per session for
    each entry of LEVEL_GROUPS. ``data['q<number>']`` then returns the (x, y)
    pair for that question, where x joins the features of every level group up
    to and including the question's own.
    """

    def __init__(self, x, y, mode='train'):
        self.x = x
        self.y = y
        # Sessions usable per level group (label column != -1).
        # NOTE(review): '0-4' is keyed on q13, the same as '5-12' - confirm
        # this is intended rather than q3.
        self.session_ids = {
            '0-4': y[y['q13'] != -1]['session_id'].tolist(),
            '5-12': y[y['q13'] != -1]['session_id'].tolist(),
            '13-22': y[y['q18'] != -1]['session_id'].tolist(),
        }
        self.mode = mode
        self.aggs_builder = AggsBuilder()
        self.level_group_data = {}
        self.engineer()
        
    def engineer(self):
        """Build and cache the feature frame of every level group."""
        for level_group in LEVEL_GROUPS:
            # assign() returns a new frame, avoiding a chained assignment onto
            # the filtered slice of self.x.
            x = (self.x[self.x['level_group'] == level_group]
                 .assign(session_id=lambda frame: frame['session_id'].astype(int)))
            y = self.y.copy()
            x_lg = self.engineer_level_group_features(x, level_group)
            if self.mode == 'train':
                # Uninformative columns are pruned only when training.
                x_lg = self.delete_features(x_lg)
            x_lg.columns = [clean_feature_name(c) for c in x_lg.columns]
            self.level_group_data[level_group] = (x_lg, y)
            print(level_group, len(x_lg.columns))
            
    def __getitem__(self, question_str):
        """Return (features, labels) for a question key such as 'q7'.

        Features are left-joined across level groups on session_id. Labels are
        a one-column frame indexed by session_id, and both sides are limited
        to sessions whose label is > -1.
        """
        question = int(question_str[1:])
        if question < 4:
            x = self.level_group_data['0-4'][0]
            x = x[x['session_id'].isin(self.session_ids['0-4'])].reset_index(drop=True)
        elif question < 14:
            x = self.level_group_data['0-4'][0].merge(
                self.level_group_data['5-12'][0], on='session_id', how='left')
            x = x[x['session_id'].isin(self.session_ids['5-12'])].reset_index(drop=True)
        else:
            x = self.level_group_data['0-4'][0].merge(
                self.level_group_data['5-12'][0], on='session_id', how='left')
            x = x.merge(self.level_group_data['13-22'][0], on='session_id', how='left')
            x = x[x['session_id'].isin(self.session_ids['13-22'])].reset_index(drop=True)
        y = self.y[self.y['session_id'].isin(x['session_id']) & 
                   (self.y[question_str] > -1)][['session_id', question_str]].set_index('session_id')
        x = x[x['session_id'].isin(y.index)].reset_index(drop=True)
        return x, y
        
    def engineer_level_group_features(self, x, level_group):
        """Aggregate the events of one level group into one row per session.

        Adds two helper columns first: `duration` (time to the next event) and
        `prev_duration` (previous event's elapsed_time minus this one's, so it
        is <= 0 when elapsed_time is non-decreasing). Both are computed within
        (session_id, level_group) with missing neighbours filled as 0.
        """
        columns = [((pl.col('elapsed_time').shift(-1) - pl.col('elapsed_time')).fill_null(0)
                    .over(['session_id', 'level_group'])
                    .alias('duration')),
                   ((pl.col('elapsed_time').shift(1) - pl.col('elapsed_time')).fill_null(0)
                    .over(['session_id', 'level_group'])
                    .alias('prev_duration'))]
        aggs = (self.aggs_builder
                .add_durations(level_group)
                .add_counts(level_group)
                .add_mouse(level_group)
                .add_notebook(level_group)
                .collect())
        return (pl.from_pandas(x)
                .lazy()
                .drop(['fullscreen', 'hq', 'music'])
                .with_columns(columns)
                .groupby(['session_id'], maintain_order=True)
                .agg(aggs)
                .sort(['session_id'])
                .collect()
                .to_pandas())
    
    def delete_features(self, x):
        """Drop uninformative feature columns from `x` in place and return it.

        A column (the first one, session_id, is never considered) is dropped
        when it holds a single value after filling NaN, or when it has fewer
        than 10 non-null entries.
        """
        col_to_delete = []
        for c in x.columns[1:]:
            if len(set(x[c].fillna(-999999))) == 1:
                col_to_delete.append(c)
            else:
                # value_counts() skips NaN, so the sum is the non-null count.
                value_counts = x[c].value_counts()
                if value_counts.sum() < 10:
                    col_to_delete.append(c)
        x.drop(col_to_delete, axis=1, inplace=True)
        return x

# Training

In [None]:
def train(data, targets, bags, n_seeds, model_type=MODEL_TYPE, verbose=0):
    """Train the full ensemble and collect out-of-fold predictions.

    Parameters
    ----------
    data : object indexed by 'q<number>' that returns (x, y); x has
        session_id as its first column and the features after it.
    targets : DataFrame of labels q1..q18 indexed by session_id; its index
        defines the rows of the out-of-fold frame.
    bags : list of bags, each a list of folds, each a list of validation
        session ids.
    n_seeds : number of models trained per (question, bag, fold).
    model_type : 'XGB', 'LGB' or 'CATBOOST'.
    verbose : forwarded to the trainer.

    Returns
    -------
    (oof_predictions, features): the out-of-fold frame with columns
    'q{q}b{b}s{s}' (per model), 'q{q}b{b}' (per bag) and 'q{q}' (final
    blend), and a dict mapping 'q<number>' to its feature-name list.

    A per-fold and per-bag score table is re-rendered after every fold.
    """
    questions = range(1, 18 + 1)
    question_columns = [f'q{q}' for q in questions]
    features = {}
    oof_predictions = targets[[]].copy()
    outputs = []
    n_bags = 0
    for b, folds in enumerate(bags):
        n_bags = b + 1
        results = []
        for f, session_ids in enumerate(folds):
            for q in tqdm(questions):
                x, y = data[f'q{q}']
                y = y.reset_index()
                features[f'q{q}'] = x.columns[1:].tolist()
                # Every session outside the fold is used for training.
                x_in_fold = x['session_id'].isin(session_ids)
                y_in_fold = y['session_id'].isin(session_ids)
                x_train = x[~x_in_fold][features[f'q{q}']]
                x_train = x_train.fillna(-999999).astype(np.float32).values
                y_train = y[~y_in_fold][f'q{q}'].values
                x_val = x[x_in_fold][features[f'q{q}']]
                x_val = x_val.fillna(-999999).astype(np.float32).values
                y_val = y[y_in_fold][f'q{q}'].values
                for s in range(n_seeds):
                    preds = train_gbdt(x_train, y_train, x_val, y_val, 
                                       f'q{q}b{b}f{f}s{s}', model_type, seed=s, verbose=verbose)
                    # Positional assignment: relies on the fold's ids being in
                    # the same order as the validation rows of `x`.
                    oof_predictions.loc[session_ids, f'q{q}b{b}s{s}'] = preds
                # Blend the seeds of this bag (logit-space mean when EXPIT).
                preds = oof_predictions[[f'q{q}b{b}s{s}' for s in range(n_seeds)]]
                oof_predictions[f'q{q}b{b}'] = (expit(np.mean(logit(preds), axis=1)) if EXPIT 
                                                else np.mean(preds, axis=1))
            # Score the fold that was just finished.
            oof_preds = oof_predictions[[f'q{q}b{b}' for q in questions]].loc[session_ids]
            oof_preds.columns = question_columns
            y_oof = targets.loc[session_ids]
            threshold = optimize_threshold(oof_preds, y_oof, step_size=0.001)
            result = score_questions(oof_preds, y_oof, questions=questions, thr=threshold)
            for column in question_columns:
                result[column] = np.round(result[column], 3)
            result['thr'] = np.round(threshold, 3)
            results.append(result)
            clear_output()
            for df in outputs:
                display(df)
            display(pd.DataFrame(results, index=[f'b{b}f{i}' for i in range(len(results))]))
        # Score the whole bag on every session that received a prediction.
        session_ids = oof_predictions[oof_predictions[f'q1b{b}'].notnull()].index.tolist()
        oof_preds = oof_predictions[[f'q{q}b{b}' for q in questions]].loc[session_ids]
        oof_preds.columns = question_columns
        threshold = optimize_threshold(oof_preds, targets.loc[session_ids], step_size=0.001)
        result = score_questions(oof_preds, targets.loc[session_ids], questions=questions, thr=threshold)
        result['thr'] = np.round(threshold, 3)
        for column in question_columns:
            result[column] = np.round(result[column], 3)
        results.append(result)
        index = [f'b{b}f{i}' for i in range(len(folds))] + [f'b{b}']
        outputs.append(pd.DataFrame(results, index=index))
        clear_output()
        for df in outputs:
            display(df)
    # Final blend over every (seed, bag) actually trained. The global N_BAGS
    # was used here before, which broke when `bags` had a different length.
    for q in questions:
        preds = oof_predictions[[f'q{q}b{b}s{s}' for s in range(n_seeds) for b in range(n_bags)]]
        oof_predictions[f'q{q}'] = expit(np.mean(logit(preds), axis=1)) if EXPIT else np.mean(preds, axis=1)
    return oof_predictions, features

In [None]:
import shutil

# Start from an empty ../models/<VERSION> directory so models from an earlier
# run of the same VERSION are never picked up. Pure Python instead of shell:
# failures now raise, and missing parent directories are created.
model_dir = f'../models/{VERSION}'
shutil.rmtree(model_dir, ignore_errors=True)
os.makedirs(model_dir)

In [None]:
%%time
# Engineer features, train every (question, bag, fold, seed) model, then load
# the saved models back to compute feature importances per question.
train_data = Data(X, Y, mode='train')
# Only 'comp' sessions are scored out-of-fold.
targets = Y[Y['session_id'].isin(COMP_SESSION_IDS)].set_index('session_id')
oof_predictions, features = train(train_data, targets, bags=FOLDS, n_seeds=N_SEEDS)
importances = {f'q{q}': explain(features[f'q{q}'], [q], n_bags=N_BAGS, n_folds=N_FOLDS, n_seeds=N_SEEDS)[f'q{q}'] 
               for q in range(1, 18 + 1)}

In [None]:
# Persist the per-question feature lists; get_features() reads this file.
# Context manager so the file is flushed and closed before it is read back.
with open(f'../models/{VERSION}/features.pkl', 'wb') as features_file:
    pickle.dump(features, features_file)
# Feature counts for one question of each level group.
len(features['q1']), len(features['q4']), len(features['q14'])

# Scoring

In [None]:
# Final blended out-of-fold probabilities (q1..q18) and the matching labels.
oof_preds = oof_predictions[[f'q{q}' for q in range(1, 18 + 1)]]
y_oof = Y[Y['session_id'].isin(oof_predictions.index)].set_index('session_id')

In [None]:
# Persist the out-of-fold predictions and labels next to the models.
# Context managers so both files are flushed and closed deterministically.
with open(f'../models/{VERSION}/oof_preds.pkl', 'wb') as oof_preds_file:
    pickle.dump(oof_preds, oof_preds_file)
with open(f'../models/{VERSION}/y_oof.pkl', 'wb') as y_oof_file:
    pickle.dump(y_oof, y_oof_file)

In [None]:
%%time
# x_test / y_test are only re-derived as a hold-out set when DATA == 'COMP'.
# mode='test' skips the column pruning done for training data.
if DATA == 'COMP':
    test_data = Data(x_test, y_test, mode='test')

## 1 thr

In [None]:
# Best single threshold over all questions, searched on a 0.001 grid.
threshold = optimize_threshold(oof_preds, y_oof, step_size=0.001)
np.round(threshold, 3)

In [None]:
# Out-of-fold scores per question and overall at the single threshold.
df = pd.DataFrame([score_questions(oof_preds, y_oof, questions=range(1, 18 + 1), thr=threshold)])
df.loc[:, [f'q{q}' for q in range(1, 18 + 1)]] = np.round(df.loc[:, [f'q{q}' for q in range(1, 18 + 1)]], 3)
display(df)

In [None]:
# Hold-out evaluation with the single threshold; test_data only exists when
# DATA == 'COMP'.
if not DEV and DATA == 'COMP':
    score, df, predictions = evaluate_test(
        test_data, y_test.set_index('session_id'), thr=threshold, n_bags=N_BAGS, n_folds=N_FOLDS, n_seeds=N_SEEDS)
    display(df)

In [None]:
# Evaluate the saved ensemble on the complete sessions (q18 > -1) of the raw
# x_test / y_test files, prepared the same way as the training data.
# NOTE: when DATA != 'COMP' these sessions were also part of X and Y.
x = pd.read_parquet('../data/processed/x_test.parquet')
x['event_name_name'] = x['event_name'] + '_' + x['name']
y = pd.read_parquet('../data/processed/y_test.parquet')
y = y[y['q18'] > -1].reset_index(drop=True)
x = x[x['session_id'].isin(y['session_id'])].reset_index(drop=True)
x = x.sort_values(['session_id', 'index']).reset_index(drop=True)
# Align the label rows with the session order of x.
y = y.set_index('session_id').loc[x['session_id'].drop_duplicates()].reset_index()
test_data_202211 = Data(x, y, mode='test')
# NOTE(review): the threshold is hard-coded to 0.62 rather than the optimized
# `threshold` computed above - confirm this is deliberate.
score, df, predictions = evaluate_test(
    test_data_202211, y.set_index('session_id'), thr=0.62, n_bags=N_BAGS, n_folds=N_FOLDS, n_seeds=N_SEEDS)
display(df)

## 1 thr per question

In [None]:
# Tune one threshold per question, starting from the single global threshold.
thrs = optimize_thresholds(oof_preds, y_oof, threshold)
# Context manager so the file is flushed and closed deterministically.
with open(f'../models/{VERSION}/thresholds.pkl', 'wb') as thresholds_file:
    pickle.dump(thrs, thresholds_file)
print([np.round(t, 3) for t in thrs])

In [None]:
# Out-of-fold scores per question and overall at the per-question thresholds.
df = pd.DataFrame([score_questions(oof_preds, y_oof, questions=range(1, 18 + 1), thr=thrs)])
df.loc[:, [f'q{q}' for q in range(1, 18 + 1)]] = np.round(df.loc[:, [f'q{q}' for q in range(1, 18 + 1)]], 3)
display(df)

In [None]:
# Hold-out evaluation with the per-question thresholds; test_data only exists
# when DATA == 'COMP'.
if not DEV and DATA == 'COMP':
    score, df, predictions = evaluate_test(
        test_data, y_test.set_index('session_id'), thr=thrs, n_bags=N_BAGS, n_folds=N_FOLDS, n_seeds=N_SEEDS)
    display(df)

## Overall

In [None]:
def score_all(predictions, targets, n_bags, n_thresholds=1):
    """Display a score table with one row per bag plus the overall blend.

    Parameters
    ----------
    predictions : out-of-fold frame with per-bag columns 'q{q}b{b}' and final
        columns 'q{q}'.
    targets : DataFrame of labels q1..q18 aligned with `predictions`.
    n_bags : number of bags to score.
    n_thresholds : 1 scores with a single optimized threshold (also shown in
        a 'thr' column); any larger value tunes one threshold per question.
    """
    questions = range(1, 18 + 1)
    question_columns = [f'q{q}' for q in questions]

    def score_frame(oof_preds):
        # Shared by the per-bag rows and the overall row.
        thr = optimize_threshold(oof_preds, targets, step_size=0.001)
        if n_thresholds > 1:
            thr = optimize_thresholds(oof_preds, targets, thr)
        result = score_questions(oof_preds, targets, questions=questions, thr=thr)
        if n_thresholds == 1:
            result['thr'] = np.round(thr, 3)
        for column in question_columns:
            result[column] = np.round(result[column], 3)
        return result

    results = []
    for b in range(n_bags):
        oof_preds = predictions[[f'q{q}b{b}' for q in questions]]
        oof_preds.columns = question_columns
        results.append(score_frame(oof_preds))
    results.append(score_frame(predictions[question_columns]))
    # Row labels follow the `n_bags` argument. The global N_BAGS was used here
    # before, which raised a length mismatch whenever the two differed.
    index = [f'b{b}' for b in range(n_bags)] + ['overall']
    display(pd.DataFrame(results, index=index))

In [None]:
# Per-bag and overall scores with one global threshold.
score_all(oof_predictions, targets, N_BAGS)

In [None]:
# Per-bag and overall scores with one tuned threshold per question.
score_all(oof_predictions, targets, N_BAGS, n_thresholds=18)

## Bags and seeds

In [None]:
def score_subparts(predictions, targets, folds, bags, seeds, n_folds=N_FOLDS, n_thresholds=1):
    """Score the blend of a chosen subset of bags and seeds.

    For each question the 'q{q}b{b}s{s}' columns of the selected bags and
    seeds are averaged (in logit space when EXPIT is set). A single threshold
    is optimized and rounded to 3 decimals; with n_thresholds > 1 it seeds a
    per-question search. `folds` and `n_folds` are accepted but unused.

    Returns (overall score, threshold(s) rounded to 3 decimals).
    """
    blended = pd.DataFrame()
    for q in range(1, 18 + 1):
        members = predictions[[f'q{q}b{b}s{s}' for s in seeds for b in bags]]
        if EXPIT:
            blended[f'q{q}'] = expit(np.mean(logit(members), axis=1))
        else:
            blended[f'q{q}'] = np.mean(members, axis=1)
    single_threshold = np.round(optimize_threshold(blended, targets, step_size=0.001), 3)
    if n_thresholds > 1:
        thr = optimize_thresholds(blended, targets, single_threshold)
    else:
        thr = single_threshold
    overall = score_questions(blended, targets, questions=range(1, 18 + 1), thr=thr)['overall']
    return overall, np.round(thr, 3)

In [None]:
# Overall score of the full blend (all bags and seeds), single threshold.
score_subparts(oof_predictions, targets, folds=FOLDS, bags=range(N_BAGS), seeds=range(N_SEEDS), n_thresholds=1)

In [None]:
%%time
# Overall score of the full blend with one tuned threshold per question.
score_subparts(oof_predictions, targets, folds=FOLDS, bags=range(N_BAGS), seeds=range(N_SEEDS), n_thresholds=18)

# Explanation

In [None]:
# q1 features ranked by mean importance across all models, highest first.
importances['q1']['df'].sort_values('q1', ascending=False)

In [None]:
# The 50 least important q1 features.
importances['q1']['df'].sort_values('q1', ascending=False).tail(50)

In [None]:
# q8 features whose mean importance is exactly zero.
importances['q8']['df'][importances['q8']['df']['q8'] == 0]

In [None]:
# q15 features whose mean importance is exactly zero.
importances['q15']['df'][importances['q15']['df']['q15'] == 0]