# Solution

In [134]:
KAGGLE_KERNEL = False

## Setup

In [135]:
import pathlib
import pandas as pd

data_dir = pathlib.Path('../input/riiid-test-answer-prediction/' if KAGGLE_KERNEL else 'data')

if data_dir.joinpath('train.pkl').exists():
    print('Loading .pkl')
    train = pd.read_pickle(data_dir.joinpath('train.pkl'))
    
else:

    print('Loading .csv')
    dtypes = {
        'row_id': 'int64',
        'timestamp': 'int64',
        'user_id': 'int32',
        'content_id': 'int16',
        'content_type_id': 'int8',
        'task_container_id': 'int16',
        'user_answer': 'int8',
        'answered_correctly': 'int8',
        'prior_question_elapsed_time': 'float32',
        'prior_question_had_explanation': 'boolean'
    }
    train = pd.read_csv(
        'data/train.csv',
        index_col='row_id',
        dtype=dtypes
    )
    train.to_pickle('data/train.pkl')

train.head(5)

Loading .pkl


Unnamed: 0_level_0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,115,5692,0,1,3,1,,
1,56943,115,5716,0,2,2,1,37000.0,False
2,118363,115,128,0,0,0,1,55000.0,False
3,131167,115,7860,0,3,0,1,19000.0,False
4,137965,115,7922,0,4,1,1,11000.0,False


The `task_container_id` variable is supposed to be monotonically increasing for each user. But that doesn't seem to be the case. For instance, see user 115. Therefore, I renumber the tasks to make sure they're monotonically increasing for each user.

In [136]:
train['task_container_id'] = train.groupby('user_id')['task_container_id'].transform(lambda x: pd.factorize(x)[0]).astype('int16')
train.head()

Unnamed: 0_level_0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,115,5692,0,0,3,1,,
1,56943,115,5716,0,1,2,1,37000.0,False
2,118363,115,128,0,2,0,1,55000.0,False
3,131167,115,7860,0,3,0,1,19000.0,False
4,137965,115,7922,0,4,1,1,11000.0,False


We can now iterate over batches of the training data. The idea is that each batch is going to behave like the data that the `env.iter_test` function will yield in the Kaggle kernel. We will thus call each batch a "group" to adopt the same terminology.

In [228]:
def iter_groups(train):
    
    correct = {}
    
    for _, group in iter(train.groupby('task_container_id')):
    
        yield (
            group.drop(columns=['user_answer', 'answered_correctly']),
            correct
        )
        
        users = group.groupby('user_id', sort=False)
        correct = users['answered_correctly'].apply(list).to_dict()
        
groups = iter_groups(train[:10_000])
group, prev_correct = next(groups)
group.head()

Unnamed: 0_level_0,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,115,5692,0,0,,
46,0,124,7900,0,0,,
76,0,2746,5273,0,0,,
96,0,5382,5000,0,0,,
224,0,8623,3915,0,0,,


In [229]:
next_group, next_prev_correct = next(groups)
next_group.head()

Unnamed: 0_level_0,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,56943,115,5716,0,1,37000.0,False
47,32683,124,7876,0,1,26000.0,False
77,21592,2746,758,0,1,28000.0,False
97,39828,5382,3944,0,1,24000.0,False
225,38769,8623,4750,0,1,16000.0,False


In [230]:
import itertools

for user in itertools.islice(next_prev_correct, 5):
    print(user, next_prev_correct[user])

115 [1]
124 [1]
2746 [0]
5382 [1]
8623 [1]


As you can see, this first group contains the first interaction of each user. The next group contains the second interaction, along with the correctness information for the first group.

The goal is now to build stateful feature extractors. Each such feature extractor should provide the ability to produce features for each row in a group. The feature extractor should then be able to update itself with the new information provided by the group. Here is the interface:

In [240]:
import abc
import re

class Extractor(abc.ABC):
    
    @abc.abstractmethod
    def transform(self, group):
        pass
    
class SupervisedExtractor(Extractor):
    
    @abc.abstractmethod
    def partial_fit(self, group, prev_correct):
        pass

By doing things this way, we'll be able to apply the same code for both training and testing. More code reuse means less bugs, at least in my book.

## Feature extraction

In [241]:
import statistics

def update_avg(avg, n, new):
    m = len(new)
    n += m
    return avg + m * (statistics.mean(new) - avg) / n, n

In [242]:
import collections

class AvgCorrect(SupervisedExtractor):
    
    def __init__(self):
        self.avgs = collections.defaultdict(lambda: (.6, 20))
    
    def partial_fit(self, group, prev_correct):
        for user_id, correct in prev_correct.items():
             self.avgs[user_id] = update_avg(*self.avgs[user_id], correct)
    
    def transform(self, group):
        return pd.Series(
            (self.avgs[user_id][0] for user_id in group.user_id),
            name='avg_correct',
            index=group.index
        )
    
extractor = AvgCorrect()
extractor.partial_fit(next_group, next_prev_correct)
extractor.transform(next_group).head()

row_id
1      0.619048
47     0.619048
77     0.571429
97     0.619048
225    0.619048
Name: avg_correct, dtype: float64

In [260]:
prev_correct

{24418: [1]}

In [None]:
class AvgCorrect(SupervisedExtractor):
    
    def __init__(self):
        self.avgs = collections.defaultdict(lambda: (.6, 20))
    
    def partial_fit(self, group, prev_correct):
        for user_id, correct in prev_correct.items():
             self.avgs[user_id] = update_avg(*self.avgs[user_id], correct)
    
    def transform(self, group):
        return pd.Series(
            (self.avgs[user_id][0] for user_id in group.user_id),
            name='avg_correct',
            index=group.index
        )

Now let's extract features for the training set.

In [259]:
%%prun

import chime
import tqdm

extractors = ExtractorUnion(
    AvgCorrect()
)

with open('data/train_features.csv', 'w') as out:

    for i, (group, prev_correct) in tqdm.tqdm(enumerate(iter_groups(train[:10000])), total=10_000, position=0):    
        features = extractors.transform(group)
        extractors.partial_fit(group, prev_correct)
        
        if i == 0:
            features.to_csv(out)
        else:
            features.to_csv(out, mode='a', header=False)
            
chime.success()

 49%|████▉     | 4890/10000 [00:16<00:17, 296.38it/s]


 

         25334458 function calls (25079487 primitive calls) in 16.543 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
4892130/4887240    0.953    0.000    1.583    0.000 {built-in method builtins.isinstance}
48901/24451    0.497    0.000    3.036    0.000 base.py:289(__new__)
  1711577    0.416    0.000    0.579    0.000 generic.py:10(_check)
     9781    0.292    0.000    0.516    0.000 managers.py:228(_rebuild_blknos_and_blklocs)
234736/220066    0.290    0.000    0.343    0.000 {built-in method numpy.array}
  2596733    0.283    0.000    0.312    0.000 {built-in method builtins.getattr}
    63582    0.277    0.000    0.277    0.000 {method 'reduce' of 'numpy.ufunc' objects}
     4890    0.251    0.000    2.411    0.000 managers.py:1267(_slice_take_blocks_ax0)
    39129    0.240    0.000    1.512    0.000 algorithms.py:1586(take_nd)
   303221    0.238    0.000    0.593    0.000 common.py:1460(is_extension_array_dtype)
   