In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score,roc_curve,classification_report
import tensorflow as tf

#tf.compat.v1.disable_eager_execution()

from LorisBallsBasedModel import *

https://www.kaggle.com/riteshkrjha/riiid-quick-tabnet-vs-xgboost

In [2]:
train = pd.read_csv('./data/riid/train.csv',
                   usecols=[1, 2, 3, 4, 5, 7, 8, 9],
                   dtype={'timestamp': 'int64',
                          'user_id': 'int32',
                          'content_id': 'int16',
                          'content_type_id': 'int8',
                          'task_container_id': 'int16',
                          'answered_correctly':'int8',
                          'prior_question_elapsed_time': 'float32',
                          'prior_question_had_explanation': 'boolean'}
                   )

In [3]:
# Remove lectures and additional processing
train = train[train.content_type_id == False]

train = train.sort_values(['timestamp'], ascending=True)
train.drop(['timestamp','content_type_id'], axis=1, inplace=True)

In [4]:
# Read Questions and Lectures
questions = pd.read_csv('./data/riid/questions.csv')
lectures = pd.read_csv('./data/riid/lectures.csv')

In [5]:
# Merge train with Questions
train = pd.merge(train, questions, left_on = 'content_id', right_on = 'question_id', how = 'left')

In [6]:
#Indicator for first question in a batch
train['firstQindicator'] = np.where(train['prior_question_elapsed_time'].isnull(),1,0)
train['prior_question_elapsed_time'] = np.where(train['prior_question_elapsed_time'].isnull(),
                                                0,train['prior_question_elapsed_time'])

In [7]:
train.head()

Unnamed: 0,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,bundle_id,correct_answer,part,tags,firstQindicator
0,115,5692,1,1,0.0,,5692,5692,3,5,151,1
1,1805962620,5547,0,0,0.0,,5547,5547,0,5,8,1
2,2015251289,4024,0,1,0.0,,4024,4024,0,5,173,1
3,867941388,6659,0,1,0.0,,6659,6659,3,5,53,1
4,867946278,3977,0,1,0.0,,3977,3977,2,5,177,1


In [8]:
# Remove unused columns
del train['question_id']
del train['bundle_id']
del train['correct_answer']
del train['tags']

In [9]:
import gc
gc.collect()

0

In [10]:
train.prior_question_had_explanation = train.prior_question_had_explanation.astype(object)
train.prior_question_had_explanation = np.where(train.prior_question_had_explanation=='True',1,0)

In [11]:
# Sample 5M records
train = train.sample(n=5000000)

In [12]:
# train test split
xtrain, xvalid, ytrain, yvalid = train_test_split(train.drop(['answered_correctly'],axis=1), 
                                                  train['answered_correctly'],
                                                  random_state=42, 
                                                  test_size=0.2, 
                                                  shuffle=True)

In [44]:
"""@tf.function
def graph():
    return LorisBallsBasedModel(nbr_steps=4,
                                first_step_args={'attentive_transformer': FirstAttentiveTransformer,
                                                 'attentive_transformer_params_dict': {'dropout_rate': 0.05},
                                                 'features_outputs_units': 6,
                                                 'features_pass_next_step_units': 3,
                                                },
                                step_args={'attentive_transformer': AttentiveTransformer,
                                           'attentive_transformer_params_dict': {'gamma': 0.1,
                                                                                 'dropout_rate': 0.05},
                                           'features_outputs_units': 6,
                                           'features_pass_next_step_units': 3,
                                           'prior_outputs_units': 3,
                                          },
                                output_layer=tf.keras.layers.Dense(1, 'sigmoid'),
                                input_processing_layer=tf.keras.layers.BatchNormalization(),
                               )
LBBM = graph()"""
LBBM = LorisBallsBasedModel(nbr_steps=4,
                            first_step_args={'attentive_transformer': FirstAttentiveTransformer,
                                             'attentive_transformer_params_dict': {'dropout_rate': .05,
                                                                                   'regularizer': tf.keras.regularizers.L1(0.00),
                                                                                   'entropy_weight': 0.,
                                                                                   'activation': tf.keras.activations.get('sigmoid')
                                                                                  },
                                             'features_outputs_units': 8,
                                             'features_pass_next_step_units': 3,
                                            },
                            step_args={'attentive_transformer': AttentiveTransformer,
                                       'attentive_transformer_params_dict': {'gamma': 1.5,
                                                                             'dropout_rate': .05,
                                                                             'regularizer': tf.keras.regularizers.L1(0.00),
                                                                             'entropy_weight': 0.,
                                                                             'activation': tf.keras.activations.get('sigmoid')
                                                                            },
                                       'features_outputs_units': 8,
                                       'features_pass_next_step_units': 3,
                                       'prior_outputs_units': 3,
                                      },
                            output_layer=tf.keras.layers.Dense(1, 'sigmoid'),
                            input_processing_layer=tf.keras.layers.BatchNormalization(),
                           )

In [87]:
LBBM.compile(loss=tf.keras.losses.binary_crossentropy, optimizer=tf.keras.optimizers.Adam(learning_rate=0.0000001), metrics=['acc'])

In [88]:
xtrain.values.shape

(4000000, 7)

In [89]:
ytrain.values.shape

(4000000,)

In [90]:
train_tensor = tf.data.Dataset.from_tensor_slices((xtrain.values,
                                                   ytrain.values)).batch(10000)

In [91]:
history = LBBM.fit(train_tensor, epochs=3, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [19]:
gamma=0.1 -> 400/400 [==============================] - 65s 156ms/step - batch: 199.5000 - size: 1.0000 - loss: 0.6398 - acc: 0.6527
gamma=0.9 -> 400/400 [==============================] - 62s 149ms/step - batch: 199.5000 - size: 1.0000 - loss: 0.6379 - acc: 0.6569
gamma=1 -> 400/400 [==============================] - 76s 182ms/step - batch: 199.5000 - size: 1.0000 - loss: 0.6383 - acc: 0.6561
gamma=1.1 -> 400/400 [==============================] - 91s 216ms/step - batch: 199.5000 - size: 1.0000 - loss: 0.6380 - acc: 0.6570
gamma=2 -> 400/400 [==============================] - 93s 220ms/step - batch: 199.5000 - size: 1.0000 - loss: 0.6546 - acc: 0.6354
=> gamma=1.1

entropy_weight=0. -> 400/400 [==============================] - 57s 136ms/step - batch: 199.5000 - size: 1.0000 - loss: 0.6395 - acc: 0.6573
entropy_weight=0.001 -> 400/400 [==============================] - 68s 162ms/step - batch: 199.5000 - size: 1.0000 - loss: 0.6452 - acc: 0.6468
entropy_weight=0.01 -> 400/400 [==============================] - 80s 191ms/step - batch: 199.5000 - size: 1.0000 - loss: 0.6583 - acc: 0.6470
entropy_weight=0.1 -> 400/400 [==============================] - 99s 237ms/step - batch: 199.5000 - size: 1.0000 - loss: 0.6841 - acc: 0.6506
=> entropy_weight=0.

drop_rate=0 -> 400/400 [==============================] - 98s 228ms/step - batch: 199.5000 - size: 1.0000 - loss: 0.6388 - acc: 0.6558
drop_rate=0.05 -> 400/400 [==============================] - 57s 138ms/step - batch: 199.5000 - size: 1.0000 - loss: 0.6377 - acc: 0.6568
drop_rate=0.1 -> 400/400 [==============================] - 75s 182ms/step - batch: 199.5000 - size: 1.0000 - loss: 0.6381 - acc: 0.6570
drop_rate=0.2 -> 400/400 [==============================] - 90s 215ms/step - batch: 199.5000 - size: 1.0000 - loss: 0.6396 - acc: 0.6557
drop_rate=0.5 -> 400/400 [==============================] - 103s 248ms/step - batch: 199.5000 - size: 1.0000 - loss: 0.6401 - acc: 0.6560
=> drop_rate=0.05

WITH LBB layers:
    No batchNorm:
400/400 [==============================] - 75s 181ms/step - batch: 199.5000 - size: 1.0000 - loss: 0.6387 - acc: 0.6553
    With batchNorm:
400/400 [==============================] - 236s 536ms/step - loss: 0.6557 - acc: 0.6457

SyntaxError: invalid syntax (Temp/ipykernel_2228/1882533749.py, line 1)

In [92]:
xvalid.values[:9]

array([[1.80850321e+09, 2.79000000e+02, 2.25000000e+02, 1.90000000e+04,
        0.00000000e+00, 2.00000000e+00, 0.00000000e+00],
       [1.33861174e+09, 5.26500000e+03, 1.60800000e+03, 9.20000000e+04,
        0.00000000e+00, 5.00000000e+00, 0.00000000e+00],
       [1.94689503e+09, 1.06080000e+04, 3.93500000e+03, 1.20000000e+04,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [4.99402949e+08, 6.44000000e+03, 7.17000000e+02, 1.00000000e+04,
        0.00000000e+00, 5.00000000e+00, 0.00000000e+00],
       [1.04603138e+09, 9.98200000e+03, 8.30000000e+01, 5.00000000e+03,
        0.00000000e+00, 5.00000000e+00, 0.00000000e+00],
       [1.95211298e+09, 3.09800000e+03, 6.60000000e+01, 3.46670000e+04,
        0.00000000e+00, 4.00000000e+00, 0.00000000e+00],
       [1.03253439e+09, 4.17100000e+03, 3.74800000e+03, 1.60000000e+04,
        0.00000000e+00, 5.00000000e+00, 0.00000000e+00],
       [1.03814973e+08, 4.80000000e+01, 2.99000000e+02, 1.60000000e+04,
        0.00000000e+00, 1

In [93]:
yvalid.values[:9]

array([0, 1, 1, 1, 1, 1, 1, 1, 0], dtype=int8)

In [94]:
LBBM.predict(xvalid.values[:9])

array([[0.6977075 ],
       [0.62691635],
       [0.8743913 ],
       [0.61857045],
       [0.60645056],
       [0.6197764 ],
       [0.6250372 ],
       [0.7724538 ],
       [0.64981365]], dtype=float32)

In [95]:
LBBM.masks_explain(tf.convert_to_tensor(xvalid.values[:1]))

[<tf.Tensor: shape=(1, 7), dtype=float32, numpy=
 array([[0.5029239 , 0.42061815, 0.5069419 , 0.64440614, 0.4996157 ,
         0.48026425, 0.5029309 ]], dtype=float32)>,
 <tf.Tensor: shape=(1, 7), dtype=float32, numpy=
 array([[0.00263649, 0.3100928 , 0.37382498, 0.9999065 , 0.11081848,
         0.48007822, 0.9813574 ]], dtype=float32)>,
 <tf.Tensor: shape=(1, 7), dtype=float32, numpy=
 array([[0.07820195, 0.50327915, 0.5153227 , 0.14034256, 0.40772247,
         0.4254523 , 0.0109823 ]], dtype=float32)>,
 <tf.Tensor: shape=(1, 7), dtype=float32, numpy=
 array([[0.09239924, 0.49534506, 0.52488846, 0.80234516, 0.3327977 ,
         0.99961275, 0.01491126]], dtype=float32)>]

In [103]:
p = LBBM.predict(xvalid.values).flatten().round().astype(int)
print('\t\t\tCLASSIFICATIION METRICS: LBBM\n')
print(metrics.classification_report(yvalid, p))
score = roc_auc_score(yvalid, p)
print('ROC value is: {}'.format(score))

			CLASSIFICATIION METRICS: LBBM

              precision    recall  f1-score   support

           0       0.61      0.08      0.14    343463
           1       0.67      0.97      0.79    656537

    accuracy                           0.67   1000000
   macro avg       0.64      0.53      0.47   1000000
weighted avg       0.65      0.67      0.57   1000000

ROC value is: 0.5265651853551788
