https://www.kaggle.com/riteshkrjha/riiid-quick-tabnet-vs-xgboost

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, roc_curve, classification_report
import tensorflow as tf

from LorisBallsBasedModel import *

In [2]:
train = pd.read_csv('./data/riid/train.csv',
                   usecols=[1, 2, 3, 4, 5, 7, 8, 9],
                   dtype={'timestamp': 'int64',
                          'user_id': 'int32',
                          'content_id': 'int16',
                          'content_type_id': 'int8',
                          'task_container_id': 'int16',
                          'answered_correctly':'int8',
                          'prior_question_elapsed_time': 'float32',
                          'prior_question_had_explanation': 'boolean'}
                   )

In [3]:
# Remove lectures and additional processing
train = train[train.content_type_id == False]

train = train.sort_values(['timestamp'],
                          ascending=True)
train.drop(['timestamp', 'content_type_id'],
           axis=1,
           inplace=True)

In [4]:
# Read Questions and Lectures
questions = pd.read_csv('./data/riid/questions.csv')
lectures = pd.read_csv('./data/riid/lectures.csv')

In [5]:
# Merge train with Questions
train = pd.merge(train,
                 questions,
                 left_on='content_id',
                 right_on='question_id',
                 how='left')

In [6]:
#Indicator for first question in a batch
train['firstQindicator'] = np.where(train['prior_question_elapsed_time'].isnull(),
                                    1,
                                    0)
train['prior_question_elapsed_time'] = np.where(train['prior_question_elapsed_time'].isnull(),
                                                0,
                                                train['prior_question_elapsed_time'])

In [7]:
train.head()

Unnamed: 0,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,bundle_id,correct_answer,part,tags,firstQindicator
0,115,5692,1,1,0.0,,5692,5692,3,5,151,1
1,1805962620,5547,0,0,0.0,,5547,5547,0,5,8,1
2,2015251289,4024,0,1,0.0,,4024,4024,0,5,173,1
3,867941388,6659,0,1,0.0,,6659,6659,3,5,53,1
4,867946278,3977,0,1,0.0,,3977,3977,2,5,177,1


In [8]:
# Remove unused columns
del train['question_id']
del train['bundle_id']
del train['correct_answer']
del train['tags']

In [9]:
import gc
gc.collect()

0

In [10]:
train.prior_question_had_explanation = train.prior_question_had_explanation.astype(object)
train.prior_question_had_explanation = np.where(train.prior_question_had_explanation == 'True',
                                                1,
                                                0)

In [11]:
# Sample 5M records
train = train.sample(n=5000000)

In [12]:
# train test split
xtrain, xvalid, ytrain, yvalid = train_test_split(train.drop(['answered_correctly'], axis=1), 
                                                  train['answered_correctly'],
                                                  random_state=42, 
                                                  test_size=0.2, 
                                                  shuffle=True)

In [13]:
# Train LorisBallsBasedModel
LBBM = LorisBallsBasedModel(nbr_steps=5,
                            first_step_args={'attentive_transformer': FirstAttentiveTransformer,
                                             'attentive_transformer_params_dict': {'dropout_rate': 0.,
                                                                                   'regularizer': tf.keras.regularizers.L1(0.),
                                                                                   'entropy_weight': 0.,
                                                                                  },
                                             'features_outputs_units': 16,
                                             'features_pass_next_step_units': 4,
                                            },
                            step_args={'attentive_transformer': AttentiveTransformer,
                                       'attentive_transformer_params_dict': {'gamma': 1.,
                                                                             'dropout_rate': 0.,
                                                                             'regularizer': tf.keras.regularizers.L1(0.),
                                                                             'entropy_weight': 0.,
                                                                            },
                                       'features_outputs_units': 16,
                                       'features_pass_next_step_units': 4,
                                       'prior_outputs_units': 4,
                                      },
                            output_layer=tf.keras.layers.Dense(1, 'sigmoid'),
                            input_processing_layer=tf.keras.layers.BatchNormalization(),
                           )

LBBM.compile(loss=tf.keras.losses.binary_crossentropy,
             optimizer=tf.keras.optimizers.Adam(),
             metrics=['acc'])
train_tensor = tf.data.Dataset.from_tensor_slices((xtrain.values,
                                                   ytrain.values)).batch(10000)

import math
def step_decay(epoch):
    initial_lrate = 0.02
    drop = 0.5
    epochs_drop = 2.
    lrate = initial_lrate * math.pow(drop,  
                                     math.floor((1+epoch)/epochs_drop))
    return lrate
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(step_decay)
history = LBBM.fit(train_tensor,
                   epochs=12,
                   callbacks=[lr_scheduler],
                   verbose=1)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [14]:
LBBM.masks_explain(tf.convert_to_tensor(xvalid.values[:1]))

[<tf.Tensor: shape=(1, 7), dtype=float32, numpy=array([[0., 0., 1., 0., 0., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(1, 7), dtype=float32, numpy=
 array([[0.        , 0.        , 0.        , 0.56752956, 0.        ,
         0.4324704 , 0.        ]], dtype=float32)>,
 <tf.Tensor: shape=(1, 7), dtype=float32, numpy=array([[0., 0., 0., 0., 0., 0., 1.]], dtype=float32)>,
 <tf.Tensor: shape=(1, 7), dtype=float32, numpy=array([[0., 0., 0., 0., 0., 0., 1.]], dtype=float32)>,
 <tf.Tensor: shape=(1, 7), dtype=float32, numpy=array([[0., 0., 0., 0., 0., 1., 0.]], dtype=float32)>]

In [15]:
p = LBBM.predict(xvalid.values).flatten().round().astype(int)
print('\t\t\tCLASSIFICATIION METRICS: LBBM\n')
print(classification_report(yvalid, p))
score = roc_auc_score(yvalid, p)
print('ROC value is: {}'.format(score))

			CLASSIFICATIION METRICS: LBBM

              precision    recall  f1-score   support

           0       0.59      0.10      0.17    342084
           1       0.67      0.96      0.79    657916

    accuracy                           0.67   1000000
   macro avg       0.63      0.53      0.48   1000000
weighted avg       0.64      0.67      0.58   1000000

ROC value is: 0.53193064220985
