In [None]:
import pandas as pd
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, TFDistilBertModel
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np


In [None]:
# Read the preprocessed dataset into a DataFrame.
# The path below is a placeholder and must be replaced with the real CSV location.
preprocessed_data = pd.read_csv(
    filepath_or_buffer='<<preprocessed data file path>>',
)

In [None]:
# Load the pretrained uncased DistilBERT tokenizer.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize every row of the 'mouse_data' column in a single call.
# padding='max_length' (instead of padding=True, which only pads to the longest
# sequence in the batch) guarantees a fixed width of 512 tokens. The recurrent
# models later in this notebook declare Input(shape=(512,)), so a narrower
# batch would fail with a shape mismatch.
# .tolist() hands the tokenizer a plain Python list rather than a pandas Series.
encoded_data = tokenizer.batch_encode_plus(
    preprocessed_data['mouse_data'].tolist(),
    padding='max_length',
    truncation=True,
    max_length=512,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='tf'
)

# Labels are taken as-is from the 'label' column as a NumPy array.
encoded_labels = preprocessed_data['label'].values

In [None]:
# Split token ids, attention masks and labels in ONE call so all three arrays
# are partitioned with the same row permutation by construction, instead of
# relying on two separate calls sharing an identical random_state.
# 80 % of the rows go to training and 20 % to the held-out set.
(
    train_inputs, test_inputs,
    train_masks, test_masks,
    train_labels, test_labels,
) = train_test_split(
    encoded_data['input_ids'].numpy(),
    encoded_data['attention_mask'].numpy(),
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

Sequence Classification

In [None]:
# DistilBERT with a classification head configured for two labels.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

# Training configuration. from_logits=True tells the loss to treat the model
# outputs as unnormalized scores; labels are expected as integer class ids.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=[metric],
)


In [None]:
# Print the layer/parameter summary of whatever `model` currently refers to
# (the sequence-classification model when the notebook is run top to bottom).
model.summary()

In [None]:
# Fit the sequence-classification model and keep the returned History object.
# No `epochs` argument is given, so the Keras default applies.
# The held-out split is used as validation data.
history = model.fit(
    x=dict(input_ids=train_inputs, attention_mask=train_masks),
    y=train_labels,
    batch_size=16,
    validation_data=(
        dict(input_ids=test_inputs, attention_mask=test_masks),
        test_labels,
    ),
)

GRU

In [None]:
# Pretrained DistilBERT encoder feeding a GRU classification head.
distilbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# Two fixed-width (512 token) integer inputs: token ids and the attention mask.
input_layer = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name='input_layer')
mask_layer = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name='mask_layer')

# [0] selects the encoder's first output, the per-token hidden-state sequence.
distilbert_output = distilbert_model({'input_ids': input_layer, 'attention_mask': mask_layer})[0]

# Forward the attention mask to the GRU as its timestep mask. Without it the
# GRU also steps over [PAD] positions, so the final state it returns would be
# computed from padding rather than from the last real token.
gru_output = tf.keras.layers.GRU(units=32)(
    distilbert_output,
    mask=tf.cast(mask_layer, tf.bool),
)

# Single sigmoid unit -> probability for the binary label.
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(gru_output)
model = tf.keras.Model(inputs=[input_layer, mask_layer], outputs=output_layer)

# NOTE(review): DistilBERT is left trainable here and 'adam' uses the Keras
# default learning rate, whereas the sequence-classification section uses 2e-5.
# Confirm this is intended.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer/parameter summary of whatever `model` currently refers to
# (the DistilBERT + GRU model when the notebook is run top to bottom).
model.summary()

In [None]:
# Train the DistilBERT + GRU model. Inputs are passed positionally as a list,
# in the order [token ids, attention mask].
# No `epochs` argument is given, so the Keras default applies.
model.fit(
    x=[train_inputs, train_masks],
    y=train_labels,
    batch_size=16,
    validation_data=(
        [test_inputs, test_masks],
        test_labels,
    ),
)

LSTM

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense

In [None]:
# Pretrained DistilBERT encoder feeding an LSTM classification head.
distilbert = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# Two fixed-width (512 token) integer inputs: token ids and the attention mask.
input_layer = Input(shape=(512,), dtype=tf.int32, name='input_layer')
mask_layer = Input(shape=(512,), dtype=tf.int32, name='mask_layer')

# [0] selects the encoder's first output, the per-token hidden-state sequence.
distilbert_output = distilbert(input_layer, attention_mask=mask_layer)[0]

# Forward the attention mask to the LSTM as its timestep mask. Without it the
# LSTM also steps over [PAD] positions, so the final state it returns would be
# computed from padding rather than from the last real token.
lstm_output = LSTM(128)(
    distilbert_output,
    mask=tf.cast(mask_layer, tf.bool),
)

# Single sigmoid unit -> probability for the binary label.
output_layer = Dense(1, activation='sigmoid')(lstm_output)

model_inputs = [input_layer, mask_layer]
model_outputs = output_layer

model = Model(inputs=model_inputs, outputs=model_outputs)

In [None]:
# Binary classification setup: sigmoid output paired with binary cross-entropy,
# optimized by Adam with its default settings and tracked with accuracy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer/parameter summary of whatever `model` currently refers to
# (the DistilBERT + LSTM model when the notebook is run top to bottom).
model.summary()

In [None]:
# Train the DistilBERT + LSTM model. The dict keys match the `name=` given to
# the two Input layers, so Keras routes each array to the right input.
# No `epochs` argument is given, so the Keras default applies.
model.fit(
    x=dict(input_layer=train_inputs, mask_layer=train_masks),
    y=train_labels,
    validation_data=(
        dict(input_layer=test_inputs, mask_layer=test_masks),
        test_labels,
    ),
    batch_size=16,
)

BiLSTM

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, LSTM, Bidirectional, concatenate
from transformers import TFDistilBertModel, DistilBertTokenizer


In [None]:
from tensorflow.keras.layers import Input, Dense, Concatenate, TimeDistributed, Bidirectional, LSTM

In [None]:
# Pretrained DistilBERT encoder used as a frozen feature extractor.
distilbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# Two fixed-width (512 token) integer inputs: token ids and the attention mask.
input_layer = Input(shape=(512,), dtype=tf.int32, name='input_layer')
mask_layer = Input(shape=(512,), dtype=tf.int32, name='mask_layer')

# Freeze the encoder so only the layers added below are trained.
distilbert_model.trainable = False

# [0] selects the encoder's first output, the per-token hidden-state sequence.
distilbert_output = distilbert_model(input_layer, attention_mask=mask_layer)[0]

# Forward the attention mask to the BiLSTM as its timestep mask. Without it
# the forward pass ends on [PAD] positions and the backward pass starts on
# them, so both final states would be contaminated by padding.
lstm_layer = Bidirectional(LSTM(64, return_sequences=False))(
    distilbert_output,
    mask=tf.cast(mask_layer, tf.bool),
)

# Concatenate the first-token vector (position 0) with the BiLSTM summary.
combined_layer = Concatenate()([distilbert_output[:, 0, :], lstm_layer])

# Small dense head with dropout, ending in a single sigmoid unit for the
# binary label.
dense_layer = Dense(128, activation='relu')(combined_layer)
dropout_layer = Dropout(0.5)(dense_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

In [None]:
# Assemble the DistilBERT + BiLSTM graph into a Keras model, configure it for
# binary classification, then print its layer/parameter summary.
model = tf.keras.models.Model(
    inputs=[input_layer, mask_layer],
    outputs=output_layer,
)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train the DistilBERT + BiLSTM model. The dict keys match the `name=` given
# to the two Input layers. This section uses a batch size of 8.
# No `epochs` argument is given, so the Keras default applies.
model.fit(
    x=dict(input_layer=train_inputs, mask_layer=train_masks),
    y=train_labels,
    validation_data=(
        dict(input_layer=test_inputs, mask_layer=test_masks),
        test_labels,
    ),
    batch_size=8,
)