In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import spacy
from spacy.lang.en import English

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from ast import literal_eval

In [21]:
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer
from sklearn.preprocessing import LabelEncoder

In [22]:
data_train = pd.read_csv("../../data/data_with_features/data_train_with_features.csv").drop(["Unnamed: 0"], axis=1)
data_test = pd.read_csv("../../data/data_with_features/data_test_with_features.csv").drop(["Unnamed: 0"], axis=1)

data_train = data_train.sample(frac=1).reset_index(drop=True)
data_test = data_test.sample(frac=1).reset_index(drop=True)

In [24]:
texts = data_train['text'].tolist()
labels = data_train['classification'].tolist()

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, encoded_labels, test_size=0.2, random_state=42)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='tf')
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors='tf')

model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=4)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'activation_13', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_115']
You should probably TRAIN this model on a down-stream task to be able to use 

In [27]:
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels)).batch(16)
model.fit(train_dataset, epochs=10)

_, accuracy = model.evaluate(test_encodings, test_labels)
print('Test Accuracy:', accuracy)

Epoch 1/10


ValueError: Creating variables on a non-first call to a function decorated with tf.function.

In [28]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        logits = model(inputs)[0]
        current_loss = loss(labels, logits)
    gradients = tape.gradient(current_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(current_loss)
    train_accuracy(labels, logits)

epochs = 10
batch_size = 16
steps_per_epoch = len(train_texts) // batch_size

for epoch in range(epochs):
    train_loss.reset_states()
    train_accuracy.reset_states()

    for step in range(steps_per_epoch):
        batch_inputs = {key: value[step * batch_size:(step + 1) * batch_size] for key, value in train_encodings.items()}
        batch_labels = train_labels[step * batch_size:(step + 1) * batch_size]
        train_step(batch_inputs, batch_labels)

    print(f'Epoch {epoch + 1}: Loss {train_loss.result()}, Accuracy {train_accuracy.result()}')

Epoch 1: Loss 0.4459410607814789, Accuracy 0.8762500286102295
Epoch 2: Loss 0.024845415726304054, Accuracy 0.9981250166893005
Epoch 3: Loss 0.010500618256628513, Accuracy 0.9993749856948853
Epoch 4: Loss 0.007588009350001812, Accuracy 0.9993749856948853
Epoch 5: Loss 0.006095245014876127, Accuracy 0.9993749856948853
Epoch 6: Loss 0.0016077675390988588, Accuracy 1.0
Epoch 7: Loss 0.0011488060699775815, Accuracy 1.0
Epoch 8: Loss 0.0008655996061861515, Accuracy 1.0
Epoch 9: Loss 0.0006730896420776844, Accuracy 1.0
Epoch 10: Loss 0.000535606755875051, Accuracy 1.0


ValueError: Cannot generate a hashable key for IteratorSpec(({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None)),) because the _serialize() method returned an unsupproted value of type <class 'transformers.tokenization_utils_base.BatchEncoding'>

In [None]:
test_loss, test_accuracy = model.evaluate(test_encodings, test_labels)
print('Test Accuracy:', test_accuracy)