In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import spacy
from spacy.lang.en import English

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from ast import literal_eval

In [2]:
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer
from sklearn.preprocessing import LabelEncoder

In [3]:
data_train = pd.read_csv("../../data/data_with_features/data_train_with_features.csv").drop(["Unnamed: 0"], axis=1)
data_test = pd.read_csv("../../data/data_with_features/data_test_with_features.csv").drop(["Unnamed: 0"], axis=1)

data_train = data_train.sample(frac=1).reset_index(drop=True)
data_test = data_test.sample(frac=1).reset_index(drop=True)

In [4]:
texts = data_train['text'].tolist()
labels = data_train['classification'].tolist()

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, encoded_labels, test_size=0.2, random_state=42)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='tf')
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors='tf')

model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=4)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Dauer: ca. 1 Minute

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_projector', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use i

In [5]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        logits = model(inputs)[0]
        current_loss = loss(labels, logits)
    gradients = tape.gradient(current_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(current_loss)
    train_accuracy(labels, logits)

epochs = 10
batch_size = 16
steps_per_epoch = len(train_texts) // batch_size

for epoch in range(epochs):
    train_loss.reset_states()
    train_accuracy.reset_states()

    for step in range(steps_per_epoch):
        batch_inputs = {key: value[step * batch_size:(step + 1) * batch_size] for key, value in train_encodings.items()}
        batch_labels = train_labels[step * batch_size:(step + 1) * batch_size]
        train_step(batch_inputs, batch_labels)

    print(f'Epoch {epoch + 1}: Loss {train_loss.result()}, Accuracy {train_accuracy.result()}')

try:
    model.save("../../models/classification/model")
except:
    print("Save did not work!")
    
# Dauer: etwa 2h

Epoch 1: Loss 0.2991816997528076, Accuracy 0.9537500143051147
Epoch 2: Loss 0.024501144886016846, Accuracy 0.9962499737739563
Epoch 3: Loss 0.006143177859485149, Accuracy 1.0
Epoch 4: Loss 0.0032362458296120167, Accuracy 1.0
Epoch 5: Loss 0.0020876352209597826, Accuracy 1.0
Epoch 6: Loss 0.0014592695515602827, Accuracy 1.0
Epoch 7: Loss 0.001074157189577818, Accuracy 1.0
Epoch 8: Loss 0.0008195683476515114, Accuracy 1.0
Epoch 9: Loss 0.0006421889993362129, Accuracy 1.0
Epoch 10: Loss 0.0005138112464919686, Accuracy 1.0




INFO:tensorflow:Assets written to: ../../models/classification/model\assets


INFO:tensorflow:Assets written to: ../../models/classification/model\assets


In [7]:
test_inputs = {
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask']
}

test_loss, test_accuracy = model.evaluate(test_inputs, test_labels)
print('Test Accuracy:', test_accuracy)

ValueError: Creating variables on a non-first call to a function decorated with tf.function.