In [None]:
# Install treelite 2.4.0 from a wheel file stored under /kaggle/input.
! pip install /kaggle/input/treelite-240/treelite-2.4.0-py3-none-manylinux2014_x86_64.whl --disable-pip-version-check

In [None]:
# Install treelite_runtime 2.4.0 from a wheel file stored under /kaggle/input.
! pip install /kaggle/input/treelite-240/treelite_runtime-2.4.0-py3-none-manylinux2014_x86_64.whl --disable-pip-version-check

In [None]:
# Root directory under which every model / dataset artifact is looked up.
PATH = "/kaggle/input"

# Blended scores strictly above this value are submitted as 1, the rest as 0.
THRESHOLD = 0.625

# When True, tree-model predictions are averaged in logit space and mapped
# back through expit; when False they are averaged directly.
EXPIT = True

# Event-sequence length fed to the NN models, per level group. The key order
# doubles as the list of level groups.
LENGTHS = {'0-4': 600, '5-12': 1400, '13-22': 2000}
LEVEL_GROUPS = list(LENGTHS)

# Columns that are tokenised through the tokenizer map ...
DISCRETE_FEATURES = [
    'room_fqid',
    'event_name_name',
    'text',
    'fqid',
    'coor_x',
    'coor_y',
    'page',
]
# ... and columns that are fed as scaled numbers.
CONTINUOUS_FEATURES = ['duration', 'hover_duration']

# Full NN feature layout, and the reduced five-feature layout.
FEATURES = [*DISCRETE_FEATURES, *CONTINUOUS_FEATURES]
FEATURES_5 = ['room_fqid', 'event_name_name', 'text', 'fqid', 'duration']

In [None]:
# Ensemble members. Each entry names the artifact location, how many
# bags / seeds / folds are stored there, and the member's weight in the blend.
MODELS = [
    dict(
        path=f'{PATH}/cpmp-predict-student-xgb-10-folds/pspfgp-46-xgb-bags-10-folds',
        n_bags=4,
        n_seeds=1,
        n_folds=10,
        weight=0.5,
    ),
    dict(
        path=f'{PATH}/pspfgp-49-dataset/pspfgp-46-nn-simple-head',
        n_bags=1,
        n_seeds=4,
        n_folds=5,
        weight=0.15,
    ),
    dict(
        path=f'{PATH}/pspfgp-49-dataset/pspfgp-47-nn-simple-head',
        n_bags=1,
        n_seeds=4,
        n_folds=5,
        weight=0.2,
    ),
    dict(
        path=f'{PATH}/pspfgp-49-dataset/pspfgp-48-nn-simple-head',
        n_bags=1,
        n_seeds=4,
        n_folds=5,
        weight=0.15,
    ),
]

In [None]:
import numpy as np
import pandas as pd
import pickle
import polars as pl
import re
if EXPIT:
    from scipy.special import expit, logit
import tensorflow as tf
import treelite
import treelite_runtime
from xgboost import XGBClassifier

In [None]:
# jo_wilder is not defined in this notebook; presumably the competition-provided
# submission API module -- TODO confirm.
import jo_wilder
env = jo_wilder.make_env()
# Iterated by the prediction loop, which unpacks each item as (test, sample_submission).
iter_test = env.iter_test()

In [None]:
def get_levels(level_group, within_level_group=True):
    """Expand a 'min-max' level-group label into the list of its levels.

    The upper bound is inclusive. With ``within_level_group=False`` the list
    starts at level 0 instead of the group's own minimum.
    """
    bounds = level_group.split("-")
    first_level = int(bounds[0]) if within_level_group else 0
    last_level = int(bounds[1])
    return list(range(first_level, last_level + 1))

In [None]:
def get_questions(level_group):
    """Return the question numbers answered at the end of ``level_group``.

    '0-4' maps to questions 1-3 and '5-12' to questions 4-13; any other value
    falls through to questions 14-18.
    """
    if level_group == '0-4':
        return [1, 2, 3]
    if level_group == '5-12':
        return list(range(4, 14))
    return list(range(14, 19))

In [None]:
def clean_feature_name(feature_name):
    """Return ``feature_name`` as a string with every character outside
    ``A-Z``, ``a-z``, ``0-9`` and ``_`` replaced by an underscore."""
    raw_name = str(feature_name)
    sanitized = re.sub('[^A-Za-z0-9_]', '_', raw_name)
    return sanitized

In [None]:
class AggsBuilder:
    """Accumulates polars aggregation expressions used to build per-session features.

    Each ``add_*`` method appends aliased expressions for one level group to an
    internal list and returns ``self`` so that calls can be chained; ``collect``
    hands the accumulated list over and empties it.
    """
    def __init__(self, collections):
        # collections is indexed as collections[level_group][kind]; the kinds read by
        # the add_* methods are 'fqids', 'room_fqids', 'event_name_names', 'texts',
        # 'activities', 'pages' and 'text_fqids'.
        self.collections = collections
        self.aggs = []
        
    def doc(self):
        """Return this object's attribute names, excluding any containing '__' and the two data attributes."""
        to_exclude = ['collections', 'aggs']
        return [el for el in dir(self) if '__' not in el and el not in to_exclude]
        
    def collect(self):
        """Return the expressions accumulated so far and reset the internal list."""
        aggs = self.aggs
        self.aggs = []
        return aggs
    
    def clear(self):
        """Discard all accumulated expressions."""
        self.aggs = []
        
    def add(self, *aggregations):
        """Append the given expressions to the internal list."""
        self.aggs.extend([*aggregations])
        
    def add_durations(self, level_group):
        """Add sums of 'duration', 'hover_duration' and 'prev_duration' (plus
        elapsed-time spans per activity) filtered by room, fqid, text, activity,
        event and level. Expects those columns to exist on the aggregated frame.
        Returns ``self``.
        """
        fqids = self.collections[level_group]['fqids']
        room_fqids = self.collections[level_group]['room_fqids']
        event_name_names = self.collections[level_group]['event_name_names']
        texts = self.collections[level_group]['texts']
        levels = get_levels(level_group)
        activities = self.collections[level_group]['activities']
        # NOTE(review): `pages` and `text_root` are assigned but never used in this method.
        pages = self.collections[level_group]['pages']
        
        text_root = pl.col('duration').filter(~pl.col('text').is_null())
        
        # Total duration over the whole level group.
        self.add(pl.col('duration').drop_nulls().sum()
                 .alias(f'level_group_{level_group}_duration_sum'))
        
        self.add(*[pl.col('duration').filter(pl.col('room_fqid') == r).sum()
                   .alias(f'level_group_{level_group}_room_fqid_{r}_duration_sum') for r in room_fqids])
        
        self.add(*[pl.col('duration').filter(pl.col('fqid') == f).sum()
                   .alias(f'level_group_{level_group}_fqid_{f}_duration_sum') for f in fqids])
        
        self.add(*[pl.col('duration').filter(pl.col('text') == t).sum()
                   .alias(f'level_group_{level_group}_text_{t}_duration_sum') for t in texts])
        
        # Activities are matched with str.contains on fqid rather than equality.
        self.add(*[pl.col('duration').filter(pl.col('fqid').str.contains(f)).sum()
                   .alias(f'level_group_{level_group}_activity_{f}_duration_sum') for f in activities])
        
        # NOTE(review): this iterates `fqids` and aliases the feature as fqid_*, yet matches
        # with str.contains instead of the equality used by the other fqid features.
        # Confirm this is intended before changing; models select features by these names.
        self.add(*[pl.col('hover_duration').filter(pl.col('fqid').str.contains(f)).sum()
                   .alias(f'level_group_{level_group}_fqid_{f}_hover_duration_sum') for f in fqids])

        self.add(*[pl.col('duration').filter(pl.col('event_name_name') == e).sum()
                   .alias(f'level_group_{level_group}_event_name_name_{e}_duration_sum') 
                   for e in event_name_names])

        # Per-level features: note the alias has no level_group_ prefix.
        self.add(*[pl.col('duration').filter((pl.col('level') == l) & (pl.col('event_name_name') == e)).sum()
                   .alias(f'level_{l}_event_name_name_{e}_duration_sum') 
                   for e in event_name_names for l in levels])

        self.add(*[pl.col('duration').filter((pl.col('room_fqid') == r) & (pl.col('event_name_name') == e)).sum()
                   .alias(f'level_group_{level_group}_room_fqid_{r}_event_name_name_{e}_duration_sum') 
                   for e in event_name_names for r in room_fqids])
        
        # Span between the first and last elapsed_time of the events matching each activity.
        self.add(*[(pl.col('elapsed_time').filter(pl.col('fqid').str.contains(f)).max() - 
                    pl.col('elapsed_time').filter(pl.col('fqid').str.contains(f)).min())
                   .alias(f'level_group_{level_group}_activity_{f}_elapsed_time_diff') for f in activities])
        
        self.add(*[pl.col('prev_duration').filter(pl.col('fqid') == f).sum()
                   .alias(f'level_group_{level_group}_fqid_{f}_prev_duration_sum') for f in fqids])
        
        self.add(*[pl.col('prev_duration').filter(pl.col('text') == t).sum()
                   .alias(f'level_group_{level_group}_text_{t}_prev_duration_sum') for t in texts])
        
        self.add(*[pl.col('prev_duration').filter(pl.col('fqid').str.contains(f)).sum()
                   .alias(f'level_group_{level_group}_activity_{f}_prev_duration_sum') for f in activities])

        return self
    
    def add_counts(self, level_group):
        """Add counts of the 'index' column filtered by room, activity, level, fqid,
        text_fqid, page and event (alone and crossed with level / room).
        Returns ``self``.
        """
        fqids = self.collections[level_group]['fqids']
        room_fqids = self.collections[level_group]['room_fqids']
        event_name_names = self.collections[level_group]['event_name_names']
        # NOTE(review): `texts` is assigned but never used in this method.
        texts = self.collections[level_group]['texts']
        levels = get_levels(level_group)
        activities = self.collections[level_group]['activities']
        pages = self.collections[level_group]['pages']
        text_fqids = self.collections[level_group]['text_fqids']
        
        # Count over the whole level group.
        self.add(pl.col('index').count()
                 .alias(f'level_group_{level_group}_cnt'))
        
        self.add(*[pl.col('index').filter(pl.col('room_fqid') == r).count()
                   .alias(f'level_group_{level_group}_room_fqid_{r}_cnt') for r in room_fqids])
        
        self.add(*[pl.col('index').filter(pl.col('fqid').str.contains(f)).count()
                   .alias(f'level_group_{level_group}_activity_{f}_cnt') for f in activities])
        
        self.add(*[pl.col('index').filter(pl.col('level') == l).count()
                   .alias(f'level_{l}_cnt') for l in levels])
        
        self.add(*[pl.col('index').filter(pl.col('fqid') == f).count()
                   .alias(f'level_group_{level_group}_fqid_{f}_cnt') for f in fqids])
        
        self.add(*[pl.col('index').filter(pl.col('text_fqid') == t).count()
                   .alias(f'level_group_{level_group}_text_fqid_{t}_cnt') for t in text_fqids])
        
        self.add(*[pl.col('index').filter(pl.col('page') == p).count()
                   .alias(f'level_group_{level_group}_page_{p}_cnt') for p in pages])
        
        self.add(*[pl.col('index').filter(pl.col('event_name_name') == e).count()
                   .alias(f'level_group_{level_group}_event_name_name_{e}_cnt') for e in event_name_names])
        
        self.add(*[pl.col('index').filter((pl.col('event_name_name') == e) & (pl.col('level') == l)).count()
                   .alias(f'level_group_{level_group}_level_{l}_event_name_name_{e}_cnt') 
                   for e in event_name_names for l in levels])
        
        self.add(*[pl.col('index').filter((pl.col('event_name_name') == e) & (pl.col('room_fqid') == r)).count()
                   .alias(f'level_group_{level_group}_room_fqid_{r}_event_name_name_{e}_cnt') 
                   for e in event_name_names for r in room_fqids])
        
        return self
    
    def add_mouse(self, level_group):
        """Add mean / std coordinate features per activity, restricted to rows where
        event_name is 'object_click' and name is 'basic'. Returns ``self``.
        """
        filter_condition = (pl.col('event_name') == 'object_click') & (pl.col('name') == 'basic')
        activities = self.collections[level_group]['activities']
        
        self.add(*[pl.col('room_coor_x').filter(filter_condition & pl.col('fqid').str.contains(f)).mean()
                   .alias(f'level_group_{level_group}_{f}_room_coor_x_mean') for f in activities])
        self.add(*[pl.col('room_coor_x').filter(filter_condition & pl.col('fqid').str.contains(f)).std()
                   .alias(f'level_group_{level_group}_{f}_room_coor_x_std') for f in activities])
        # NOTE(review): the two *_room_coor_y_* features below are computed from
        # 'room_coor_x', not 'room_coor_y'. This looks like a copy-paste slip, but models
        # select features by these column names, so confirm how the training pipeline
        # computed them before changing anything here.
        self.add(*[pl.col('room_coor_x').filter(filter_condition & pl.col('fqid').str.contains(f)).mean()
                   .alias(f'level_group_{level_group}_{f}_room_coor_y_mean') for f in activities])
        self.add(*[pl.col('room_coor_x').filter(filter_condition & pl.col('fqid').str.contains(f)).std()
                   .alias(f'level_group_{level_group}_{f}_room_coor_y_std') for f in activities])
        
        return self
        
    def add_notebook(self, level_group):
        """Add, for every level of the group, the summed 'duration' of 'notebook_click'
        events whose name is not 'close'. Returns ``self``.
        """
        self.add(*[pl.col('duration').filter((pl.col('level') == l) & 
                                             ((pl.col('event_name') == 'notebook_click') & 
                                              (pl.col('name') != 'close'))).sum()
                   .alias(f'level_{l}_notebook_duration_sum') for l in get_levels(level_group)])
        return self

    def add_globals(self, level_group):
        """No-op placeholder: adds nothing and returns ``self``."""
        return self

In [None]:
def engineer_gbdt(x_raw, session_data, level_group, aggs_builder):
    """Build the one-row-per-session feature frame used by the tree models.

    x_raw is the pandas frame of raw events for one level group; it must carry
    the 'fullscreen', 'hq' and 'music' columns (dropped here) plus whatever the
    aggregation expressions reference. session_data holds the features already
    computed for earlier level groups of the same session and is prepended
    column-wise; it is not used for '0-4'. Missing values become -999999.
    """
    expressions = (
        aggs_builder
            .add_durations(level_group)
            .add_counts(level_group)
            .add_mouse(level_group)
            .add_notebook(level_group)
            .collect()
    )

    # 'duration' is next-row elapsed_time minus this row's; 'prev_duration' is
    # previous-row elapsed_time minus this row's. Both are evaluated over
    # (session_id, level_group) with nulls filled by 0.
    elapsed = pl.col('elapsed_time')
    partition = ['session_id', 'level_group']
    to_next = (elapsed.shift(-1) - elapsed).fill_null(0).over(partition).alias('duration')
    from_prev = (elapsed.shift(1) - elapsed).fill_null(0).over(partition).alias('prev_duration')

    lazy_events = pl.from_pandas(x_raw).lazy().drop(['fullscreen', 'hq', 'music'])
    lazy_events = lazy_events.with_columns([to_next, from_prev])
    per_session = (lazy_events
                   .groupby('session_id', maintain_order=True).agg(expressions)
                   .sort('session_id'))

    x_lg = per_session.collect().to_pandas()
    x_lg.columns = list(map(clean_feature_name, x_lg.columns))
    x_lg = x_lg.fillna(-999999)

    if level_group != '0-4':
        # Later groups extend the features gathered so far; session_id is already in them.
        x_lg = pd.concat([session_data, x_lg.drop(['session_id'], axis=1)], axis=1)
    return x_lg

In [None]:
def build_sequence(df, n_features, length):
    """Return the values of ``df`` as an array with exactly ``length`` rows.

    Frames with ``length`` rows or more are truncated to the first ``length``
    rows; shorter ones are padded at the bottom with rows of ``n_features``
    zeros.
    """
    n_rows = len(df)
    if n_rows >= length:
        return df[:length].values
    padding = np.zeros((length - n_rows, n_features))
    return np.vstack([df.values, padding])

In [None]:
def engineer_nn(x, length, tokenizer_map, features):
    """Turn one chunk of raw events into the fixed-length array fed to the NN models.

    Parameters
    ----------
    x : pandas.DataFrame
        Raw events. Must contain 'room_coor_x', 'room_coor_y', 'elapsed_time',
        'hover_duration' and the raw column of every discrete feature requested
        ('coor_x' / 'coor_y' are derived here). The frame is not modified.
    length : int
        Number of time steps in the output; shorter inputs are zero-padded and
        longer ones truncated (see build_sequence).
    tokenizer_map : dict
        ``tokenizer_map[feature]['encode']`` is the vocabulary of a discrete
        feature; a value's token is its position in that sequence and values
        outside the vocabulary become 0.
    features : list of str
        Feature columns to emit, in output order.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(features), 1, length)``.
    """
    # Work on a private copy. The steps below overwrite raw columns (the discrete
    # columns are replaced by tokens, hover_duration is rescaled). Without the copy
    # a second call on the same frame would see tokens instead of raw values and
    # map every discrete feature to 0.
    x = x.copy()

    # Bucket room coordinates into integer bins 0..50 after clipping to a fixed window.
    x['coor_x'] = (((np.clip(x['room_coor_x'], -2000, 2000) + 2000) / 4000).fillna(0) * 50).astype(int)
    x['coor_y'] = (((np.clip(x['room_coor_y'], -1000, 1000) + 1000) / 2000).fillna(0) * 50).astype(int)

    # Time until the next event, capped at 60 s and scaled to [0, 1]; the last
    # event has no successor and gets the cap.
    next_elapsed_time = x['elapsed_time'].shift(-1)
    x['duration'] = next_elapsed_time - x['elapsed_time']
    x['duration'] = x['duration'].fillna(60000)
    x['duration'] = np.clip(x['duration'], 0, 60000) / 60000

    x['hover_duration'] = x['hover_duration'].fillna(0)
    x['hover_duration'] = np.clip(x['hover_duration'], 0, 60000) / 60000

    for feature in DISCRETE_FEATURES:
        if feature in features:
            vocabulary = tokenizer_map[feature]['encode']
            encoder = {value: token for token, value in enumerate(vocabulary)}
            # Out-of-vocabulary values are blanked first so that they end up as token 0.
            x[feature] = np.where(~x[feature].isin(vocabulary), np.nan, x[feature])
            x[feature] = x[feature].map(encoder)
            x[feature] = x[feature].fillna(0)

    # (length, n_features) -> (n_features, length) -> (n_features, 1, length)
    sequence = build_sequence(x[features], len(features), length)
    sequence = sequence.T
    sequence = sequence.astype(np.float32)
    sequence = np.expand_dims(sequence, axis=1)

    return sequence

In [None]:
class Model:
    """One member of the blend: a family of bag/fold/seed models stored under ``path``.

    The model family is the third '-'-separated token of the last path
    component (e.g. 'pspfgp-46-xgb-bags-10-folds' -> 'xgb'). 'xgb' / 'lgb'
    members are tree models loaded through treelite; 'nn' / 'convnet' members
    are pairs of TFLite interpreters (a convnet followed by a head). Any other
    family loads nothing and ``predict`` returns None for it.
    """

    def __init__(self, path, n_bags, n_seeds, n_folds, weight):
        self.path = path
        self.type = path.split('/')[-1].split('-')[2]
        self.n_bags = n_bags
        self.n_seeds = n_seeds
        self.n_folds = n_folds
        self.weight = weight
        self.models = {}
        self.build()

    def build(self):
        """Load every artifact of this member into ``self.models``.

        Tree members: ``self.features`` is read from 'features.pkl' and
        ``self.models`` maps 'q{q}b{b}f{f}s{s}' to a treelite model for
        questions 1..18. NN members: ``self.models[level_group][seed]['f{fold}']``
        holds the 'convnet' and 'head' interpreters and
        ``self.info[level_group][seed]`` the tensor details and the feature
        names parsed from the convnet input names.
        """
        if self.type == 'xgb' or self.type == 'lgb':
            # NOTE: pickle.load can execute arbitrary code; only load trusted files.
            with open(f'{self.path}/features.pkl', 'rb') as features_file:
                self.features = pickle.load(features_file)
            for q in range(1, 18 + 1):
                for b in range(self.n_bags):
                    for f in range(self.n_folds):
                        for s in range(self.n_seeds):
                            path = f'{self.path}/q{q}b{b}f{f}s{s}'
                            if self.type == 'xgb':
                                model = treelite.Model.load(f'{path}.xgb', model_format='xgboost')
                            elif self.type == 'lgb':
                                model = treelite.Model.load(f'{path}.lgb', model_format='lightgbm')
                            self.models[path.split('/')[-1]] = model

        elif self.type == 'convnet' or self.type == 'nn':
            # Convnet outputs per session, kept so later level groups can reuse them.
            self.embeddings = {}
            self.info = {}
            for level_group in LEVEL_GROUPS:
                self.models[level_group] = {}
                self.info[level_group] = {}
                for s in range(self.n_seeds):
                    self.models[level_group][s] = {}
                    self.info[level_group][s] = {}
                    for f in range(self.n_folds):
                        # Only bag 0 files exist in this naming scheme.
                        suffix = f'_{level_group.replace("-", "_")}_b0f{f}'
                        convnet_interpreter = tf.lite.Interpreter(model_path=f'{self.path}-s{s}/convnet{suffix}.tflite')
                        convnet_interpreter.allocate_tensors()
                        head_interpreter = tf.lite.Interpreter(model_path=f'{self.path}-s{s}/head{suffix}.tflite')
                        head_interpreter.allocate_tensors()
                        self.models[level_group][s][f'f{f}'] = {}
                        # Tensor details are stored per seed, not per fold: every fold
                        # overwrites them, so the last fold's details are the ones kept.
                        self.info[level_group][s]['input_details'] = {}
                        self.info[level_group][s]['output_details'] = {}
                        self.models[level_group][s][f'f{f}']['convnet'] = convnet_interpreter
                        self.models[level_group][s][f'f{f}']['head'] = head_interpreter
                        input_details = convnet_interpreter.get_input_details()
                        output_details = convnet_interpreter.get_output_details()
                        self.info[level_group][s]['input_details']['convnet'] = input_details
                        self.info[level_group][s]['output_details']['convnet'] = output_details
                        # Feature name = input tensor name without its prefix and level-group suffix.
                        self.info[level_group][s]['features'] = [
                            (input_detail['name']
                             .replace('serving_default_input_', '')
                             .replace(f"_{level_group.replace('-', '_')}:0", ''))
                            for input_detail in self.info[level_group][s]['input_details']['convnet']]
                        self.info[level_group][s]['input_details']['head'] = head_interpreter.get_input_details()
                        self.info[level_group][s]['output_details']['head'] = head_interpreter.get_output_details()

    def predict(self, session_id, level_group, x_gbdt, x_nn):
        """Predict the questions of ``level_group`` for one session.

        Tree members read ``x_gbdt`` (a frame indexed by feature name) and
        return one value per question from get_questions(level_group), averaged
        over bags, folds and seeds (in logit space when EXPIT is set).

        NN members read ``x_nn`` (indexed by feature position on its first
        axis), run convnet then head for every fold and seed, and return the
        head output averaged over seeds and then over folds. Convnet outputs
        of '0-4' and '5-12' are cached per session and concatenated in front of
        the current output for later level groups, so a session must be
        predicted in level-group order.
        """
        if self.type == 'xgb' or self.type == 'lgb':
            preds = []
            for q in get_questions(level_group):
                # The feature matrix depends only on the question: build it once.
                data = x_gbdt[self.features[f'q{q}']].astype(np.float32).values
                question_preds = [
                    treelite.gtil.predict(self.models[f'q{q}b{b}f{f}s{s}'], data=data)
                    for b in range(self.n_bags)
                    for f in range(self.n_folds)
                    for s in range(self.n_seeds)
                ]
                question_preds = expit(np.mean(logit(question_preds))) if EXPIT else np.mean(question_preds)
                preds.append(question_preds)
            return np.array(preds)

        elif self.type == 'convnet' or self.type == 'nn':
            # Position of each feature in x_nn. The layout is chosen from this
            # instance's own path rather than from a notebook-level variable.
            feature_map = {f: i for i, f in enumerate(FEATURES_5 if '48' in self.path else FEATURES)}

            # First level group of a session: start a fresh embedding cache.
            if level_group == '0-4':
                self.embeddings[session_id] = {}
                for l in LEVEL_GROUPS:
                    self.embeddings[session_id][l] = {}
                    for s in range(self.n_seeds):
                        self.embeddings[session_id][l][s] = {}

            preds = []
            # Only bag 0 artifacts are loaded, so every bag iteration reuses them.
            for b in range(self.n_bags):
                for f in range(self.n_folds):
                    fold_preds = []
                    for s in range(self.n_seeds):
                        x = x_nn.astype(np.float32)
                        convnet_interpreter = self.models[level_group][s][f'f{f}']['convnet']
                        features = self.info[level_group][s]['features']
                        input_details = self.info[level_group][s]['input_details']['convnet']
                        output_details = self.info[level_group][s]['output_details']['convnet']
                        for i, feature in enumerate(features):
                            convnet_interpreter.set_tensor(input_details[i]['index'], x[feature_map[feature]])
                        convnet_interpreter.invoke()
                        x = convnet_interpreter.get_tensor(output_details[0]['index'])

                        if level_group != '13-22':
                            self.embeddings[session_id][level_group][s][f'f{f}'] = x
                        if level_group == '5-12':
                            x = np.concatenate([self.embeddings[session_id]['0-4'][s][f'f{f}'], x], axis=1)
                        elif level_group == '13-22':
                            emb_04 = self.embeddings[session_id]['0-4'][s][f'f{f}']
                            emb_512 = self.embeddings[session_id]['5-12'][s][f'f{f}']
                            x = np.concatenate([emb_04, emb_512, x], axis=1)

                        head_interpreter = self.models[level_group][s][f'f{f}']['head']
                        input_details = self.info[level_group][s]['input_details']['head']
                        output_details = self.info[level_group][s]['output_details']['head']
                        head_interpreter.set_tensor(input_details[0]['index'], x)
                        head_interpreter.invoke()
                        p = head_interpreter.get_tensor(output_details[0]['index'])
                        fold_preds.append(p)
                    fold_preds = np.mean(fold_preds, axis=0)[0]
                    preds.append(fold_preds)
            # Average once every bag and fold has been collected.
            preds = np.mean(preds, axis=0)
            return preds

In [None]:
# Instantiate every ensemble member; each constructor loads its artifacts from disk.
models = [Model(m['path'], m['n_bags'], m['n_seeds'], m['n_folds'], m['weight']) for m in MODELS]

# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# Files are opened through `with` so the handles are closed right after loading.
with open(f'{PATH}/pspfgp-49-dataset/collections.pkl', 'rb') as collections_file:
    aggs_builder = AggsBuilder(pickle.load(collections_file))
with open(f'{PATH}/pspfgp-49-dataset/tokenizer_map.pkl', 'rb') as tokenizer_file:
    tokenizer_map = pickle.load(tokenizer_file)

# Tree-model features computed so far, keyed by session_id.
gbdt_data = {}

In [None]:
for test, sample_submission in iter_test:
    # The question number is parsed from the second '_'-separated token of each row
    # label (minus its first character); rows are then ordered by question so that
    # they line up with the prediction vector.
    sample_submission['question'] = [int(label.split('_')[1][1:]) for label in sample_submission['session_id']]
    sample_submission = sample_submission.sort_values('question').reset_index(drop=True)

    session_id = test.iloc[0]['session_id']
    level_group = test.iloc[0]['level_group']

    # Level group '0-4' opens a session: there are no earlier tree features to carry over.
    if level_group == '0-4':
        gbdt_data[session_id] = None

    test = test.sort_values('index').reset_index(drop=True)
    test['event_name_name'] = test['event_name'] + '_' + test['name']

    x_gbdt = engineer_gbdt(test, gbdt_data[session_id], level_group, aggs_builder)
    # Give each NN feature pass its own copy of the events, so that one pass can
    # never read columns already rewritten by the other.
    x_nn = engineer_nn(test.copy(), LENGTHS[level_group], tokenizer_map, FEATURES)
    x_nn_5 = engineer_nn(test.copy(), LENGTHS[level_group], tokenizer_map, FEATURES_5)

    # Keep the accumulated tree features for the session's next level group.
    if level_group == '0-4' or level_group == '5-12':
        gbdt_data[session_id] = x_gbdt

    # Weighted sum of the members' predictions; members whose path contains '48'
    # take the five-feature input.
    preds = []
    for model in models:
        p = model.predict(session_id, level_group, x_gbdt, x_nn_5 if '48' in model.path else x_nn)
        w = model.weight
        preds.append(w * p)
    preds = np.sum(preds, axis=0)

    sample_submission['correct'] = 1 * (preds > THRESHOLD)

    env.predict(sample_submission[['session_id', 'correct']])

In [None]:
# Show the first 55 lines of submission.csv as a quick sanity check.
! head -n 55 submission.csv