# Feature extraction

## Setup

Let's start by loading the training data.

In [13]:
import pathlib
import pandas as pd

data_dir = pathlib.Path('data')

if data_dir.joinpath('train.pkl').exists():
    print('Loading .pkl')
    train = pd.read_pickle(data_dir.joinpath('train.pkl'))
    
else:

    print('Loading .csv')
    dtypes = {
        'row_id': 'int64',
        'timestamp': 'int64',
        'user_id': 'int32',
        'content_id': 'int16',
        'content_type_id': 'int8',
        'task_container_id': 'int16',
        'user_answer': 'int8',
        'answered_correctly': 'int8',
        'prior_question_elapsed_time': 'float32',
        'prior_question_had_explanation': 'boolean'
    }
    train = pd.read_csv(
        data_dir.joinpath('train.csv'),
        index_col='row_id',
        dtype=dtypes
    )
    
    # The `task_container_id` variable is supposed to be monotonically increasing for each user.
    # But that doesn't seem to be the case. For instance, see user 115.
    # Therefore, I renumber the tasks to make sure they're monotonically increasing for each user.
    train['task_container_id'] = train.groupby('user_id')['task_container_id'].transform(lambda x: pd.factorize(x)[0]).astype('int16')
    
    train.to_pickle(data_dir.joinpath('train.pkl'))

train.head(5)

Loading .pkl


Unnamed: 0_level_0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,115,5692,0,0,3,1,,
1,56943,115,5716,0,1,2,1,37000.0,False
2,118363,115,128,0,2,0,1,55000.0,False
3,131167,115,7860,0,3,0,1,19000.0,False
4,137965,115,7922,0,4,1,1,11000.0,False


We can now iterate over batches of the training data. The idea is that each batch is going to behave like the data that the `env.iter_test` function will yield in the Kaggle kernel. We will thus call each batch a "group" to adopt the same terminology.

In [2]:
def iter_groups(train):
    
    prev_group = pd.DataFrame()
    
    for _, group in iter(train.groupby('task_container_id')):
        yield group.query('content_type_id == 0'), prev_group
        prev_group = group
        
groups = iter_groups(train[:10_000])
questions, prev_group = next(groups)
questions.head()

Unnamed: 0_level_0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,118363,115,128,0,0,0,1,55000.0,False
46,0,124,7900,0,0,0,1,,
76,0,2746,5273,0,0,1,0,,
96,0,5382,5000,0,0,0,1,,
224,0,8623,3915,0,0,3,1,,


In [3]:
prev_group

In [4]:
next_questions, next_prev_group = next(groups)
next_questions.head()

Unnamed: 0_level_0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,115,5692,0,1,3,1,,
47,32683,124,7876,0,1,0,0,26000.0,False
77,21592,2746,758,0,1,0,0,28000.0,False
97,39828,5382,3944,0,1,1,0,24000.0,False
225,38769,8623,4750,0,1,1,1,16000.0,False


In [5]:
next_prev_group.head()

Unnamed: 0_level_0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,118363,115,128,0,0,0,1,55000.0,False
46,0,124,7900,0,0,0,1,,
76,0,2746,5273,0,0,1,0,,
96,0,5382,5000,0,0,0,1,,
224,0,8623,3915,0,0,3,1,,


As you can see, this first group contains the first interaction of each user. The next group contains the second interaction, along with the correctness information for the first group.

The goal is now to build stateful feature extractors. Each such feature extractor should provide the ability to produce features for each row in a group. The feature extractor should then be able to update itself with the new information provided by the group. Here is the interface:

In [106]:
import abc

class Extractor(abc.ABC):
    
    def __str__(self):
        return self.__class__.__name__
    
    @abc.abstractmethod
    def transform(self, questions):
        pass
    
class StatefulExtractor(Extractor):
    
    @abc.abstractmethod
    def update(self, questions, prev_group):
        pass

## Average correctness in the past

In [56]:
class AvgCorrect(StatefulExtractor):
    
    def __init__(self, prior_mean, prior_size):
        self.prior_mean = prior_mean
        self.prior_size = prior_size
        self.stats = pd.DataFrame(columns=['mean', 'size'])
        
    def __str__(self):
        return f'{self.__class__.__name__}_prior_mean={self.prior_mean}_prior_size={self.prior_size}'
    
    def update(self, questions, prev_group):
        
        # Initialize statistics for new users
        new = pd.Index(questions['user_id']).difference(self.stats.index)
        if len(new) > 0:
            prior = pd.DataFrame(
                {'mean': self.prior_mean, 'size': self.prior_size},
                index=new
            )
            self.stats = self.stats.append(prior)
        
        # Nothing to do if nothing happened before
        if len(prev_group) == 0:
            return
        
        # Compute the new statistics
        stats = (
            prev_group
            .query('content_type_id == 0')
            .groupby('user_id')['answered_correctly']
            .agg(['mean', 'size'])
        )
        
        # Update the old statistics with the new statistics
        users = stats.index
        m = stats.loc[users, 'size']
        self.stats.loc[users, 'size'] += m
        n = self.stats.loc[users, 'size']
        avg = self.stats.loc[users, 'mean']
        new_avg = stats.loc[users, 'mean']
        self.stats.loc[users, 'mean'] += m * (new_avg - avg) / n
    
    def transform(self, questions):
        avgs = self.stats.loc[questions['user_id'], 'mean'].rename('avg_correct')
        avgs.index = questions.index
        return avgs
    
extractor = AvgCorrect(.6, 10)
extractor.update(next_questions, next_prev_group)
extractor.transform(next_questions).head()

row_id
0      0.636364
47     0.636364
77     0.545455
97     0.636364
225    0.636364
Name: avg_correct, dtype: object

## Question difficulty

In [8]:
stats = train[:1000].query('content_type_id == 0').groupby('content_id')['answered_correctly'].agg(['mean', 'size'])
stats.head()

Unnamed: 0_level_0,mean,size
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,0.0,1
4,0.0,1
6,1.0,1
9,0.0,1
18,1.0,2


In [9]:
bayes_mean = stats.eval('(mean * size + .6 * 100) / (size + 100)')
bayes_mean.head()

content_id
2     0.594059
4     0.594059
6     0.603960
9     0.594059
18    0.607843
dtype: float64

In [71]:
class QuestionDifficulty(Extractor):
    
    def __init__(self, train):
        stats = train.query('content_type_id == 0').groupby('content_id')['answered_correctly'].agg(['mean', 'size'])
        self.bayes_avg = stats.eval('(mean * size + .6 * 100) / (size + 100)').rename('question_difficulty')
    
    def transform(self, questions):
        avgs = self.bayes_avg.loc[questions['content_id']]
        avgs.index = questions.index
        return avgs

## Extracting features for the training set

In [11]:
import pickle
import chime
import tqdm

extractors = [
    AvgCorrect(.6, 20),
    QuestionDifficulty(train)
]

# We filter out the extractors that have already been run
extractors = [
    extractor
    for extractor in extractors
    if not pathlib.Path(f'train_features/{extractor}.csv').exists()
]

for i, (questions, prev_group) in tqdm.tqdm(enumerate(iter_groups(train)), total=10_000, position=0):
    
    for extractor in extractors:

        if isinstance(extractor, StatefulExtractor):
            extractor.update(questions, prev_group)
        features = extractor.transform(questions)

        path = f'train_features/{extractor}.csv'
        if i == 0:
            features.to_csv(path)
        else:
            features.to_csv(path, mode='a', header=False)

# We save the extractors so that we can reuse them during the testing phase
for extractor in extractors:
    with open(f'extractors/{extractor}.pkl', 'wb') as f:
        pickle.dump(extractor, f)
            
chime.success()

100%|██████████| 10000/10000 [15:42<00:00, 10.61it/s]
