[https://www.kaggle.com/riteshkrjha/riiid-quick-tabnet-vs-xgboost](https://www.kaggle.com/riteshkrjha/riiid-quick-tabnet-vs-xgboost)

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, roc_curve, classification_report
import tensorflow as tf

from LorisNet import *

In [2]:
train = pd.read_csv('./data/riid/train.csv',
                   usecols=[1, 2, 3, 4, 5, 7, 8, 9],
                   dtype={'timestamp': 'int64',
                          'user_id': 'int32',
                          'content_id': 'int16',
                          'content_type_id': 'int8',
                          'task_container_id': 'int16',
                          'answered_correctly':'int8',
                          'prior_question_elapsed_time': 'float32',
                          'prior_question_had_explanation': 'boolean'}
                   )

In [3]:
# Remove lectures and additional processing
train = train[train.content_type_id == False]

train = train.sort_values(['timestamp'],
                          ascending=True)
train.drop(['timestamp', 'content_type_id'],
           axis=1,
           inplace=True)

In [4]:
# Read Questions and Lectures
questions = pd.read_csv('./data/riid/questions.csv')
lectures = pd.read_csv('./data/riid/lectures.csv')

In [5]:
# Merge train with Questions
train = pd.merge(train,
                 questions,
                 left_on='content_id',
                 right_on='question_id',
                 how='left')

In [6]:
#Indicator for first question in a batch
train['firstQindicator'] = np.where(train['prior_question_elapsed_time'].isnull(),
                                    1,
                                    0)
train['prior_question_elapsed_time'] = np.where(train['prior_question_elapsed_time'].isnull(),
                                                0,
                                                train['prior_question_elapsed_time'])

In [7]:
train.head()

Unnamed: 0,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,bundle_id,correct_answer,part,tags,firstQindicator
0,115,5692,1,1,0.0,,5692,5692,3,5,151,1
1,1805962620,5547,0,0,0.0,,5547,5547,0,5,8,1
2,2015251289,4024,0,1,0.0,,4024,4024,0,5,173,1
3,867941388,6659,0,1,0.0,,6659,6659,3,5,53,1
4,867946278,3977,0,1,0.0,,3977,3977,2,5,177,1


In [8]:
# Remove unused columns
del train['question_id']
del train['bundle_id']
del train['correct_answer']
del train['tags']

In [9]:
import gc
gc.collect()

0

In [10]:
train.prior_question_had_explanation = train.prior_question_had_explanation.astype(object)
train.prior_question_had_explanation = np.where(train.prior_question_had_explanation == 'True',
                                                1,
                                                0)

In [11]:
# Sample 5M records
train = train.sample(n=5000000)

In [129]:
# train test split
xtrain, xvalid, ytrain, yvalid = train_test_split(train.drop(['answered_correctly'], axis=1), 
                                                  train['answered_correctly'],
                                                  random_state=42, 
                                                  test_size=0.2, 
                                                  shuffle=True)

In [130]:
data = tf.data.Dataset.from_tensor_slices((xtrain.values,
                                           ytrain.values)).batch(100000)

In [131]:
# Train LorisNet
model = tf.keras.Sequential()

reg1 = .1
nbr_masks = 5
steps = [[StepNoFeedback(AllOnesMaskedInputNoFeedback(nbr_masks),
                         LinearSeparators(bias_regularizer=tf.keras.regularizers.L2(reg1)),
                         PredictionNeurons(units=1))]]
for i in range(10):
    steps.append([StepWithFeedback(AllOnesMaskedInputWithFeedback(nbr_masks),
                                   LinearSeparators(bias_regularizer=tf.keras.regularizers.L2(reg1)),
                                   PredictionNeurons(units=1))])
steps = [item for sublist in steps for item in sublist]

model.add(LorisNetLayer(steps=steps,
                        weighted_addition=NormalizedWeightedAdd(),
                        activation='sigmoid'
                       ))

model.compile(loss=tf.keras.losses.binary_crossentropy,
              optimizer=tf.keras.optimizers.Adam(learning_rate=.05),
              metrics=['accuracy']
             )

history = model.fit(data,
                    epochs=1,
                    verbose=1)



In [132]:
model.compile(loss=tf.keras.losses.binary_crossentropy,
              optimizer=tf.keras.optimizers.Adam(learning_rate=.01),
              metrics=['accuracy']
             )
history = model.fit(data,
                    epochs=1,
                    verbose=1)



In [133]:
model.predict(xtrain.values[:20])



array([[0.7035527 ],
       [0.7121538 ],
       [0.6951014 ],
       [0.72549   ],
       [0.5595709 ],
       [0.6556799 ],
       [0.66519415],
       [0.73769385],
       [0.54684764],
       [0.73502547],
       [0.6346817 ],
       [0.7519738 ],
       [0.6552086 ],
       [0.6318636 ],
       [0.59619856],
       [0.6771714 ],
       [0.68270916],
       [0.6549076 ],
       [0.6854434 ],
       [0.66511256]], dtype=float32)

In [134]:
ytrain[:20]

13595083    1
45569082    1
37440363    1
93119698    1
81433823    1
77163944    0
71713307    1
46984142    1
1865578     0
98234157    0
73687529    1
96957632    1
73410585    0
58515061    0
10093544    1
42222471    0
63330353    0
8832242     1
46180318    1
28323868    1
Name: answered_correctly, dtype: int8

In [135]:
p = model.predict(tf.data.Dataset.from_tensor_slices(xvalid.values).batch(100000)).flatten().round().astype(int)
print('\t\t\tCLASSIFICATIION METRICS: LBBM\n')
print(classification_report(yvalid, p))
score = roc_auc_score(yvalid, p)
print('ROC value is: {}'.format(score))

			CLASSIFICATIION METRICS: LBBM



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00    342223
           1       0.66      1.00      0.79    657777

    accuracy                           0.66   1000000
   macro avg       0.33      0.50      0.40   1000000
weighted avg       0.43      0.66      0.52   1000000

ROC value is: 0.5


In [136]:
model.predict(xvalid.values[:20])



array([[0.62341654],
       [0.69408244],
       [0.6538562 ],
       [0.6602304 ],
       [0.7067075 ],
       [0.67072743],
       [0.6897935 ],
       [0.65889984],
       [0.6632564 ],
       [0.6940458 ],
       [0.64580053],
       [0.7070672 ],
       [0.5460276 ],
       [0.74241453],
       [0.7094348 ],
       [0.67945886],
       [0.69767433],
       [0.6136378 ],
       [0.62348825],
       [0.7499299 ]], dtype=float32)

In [137]:
yvalid[:20]

55946176    1
50927278    1
47264040    1
17765604    0
82342456    1
24232254    1
36885350    0
64858554    1
33751180    1
57737698    0
91152908    1
55760771    1
1257252     1
43416822    0
41706295    1
37473961    1
35514488    1
87989056    1
59645676    0
87095933    1
Name: answered_correctly, dtype: int8