In [1]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
import pandas as pd
import json

def load_data(file_path):
    with open(file_path, "r", encoding='utf-8') as file:
        data = json.load(file)
    return data

def extract_dialog(dialog, start_percentage, end_percentage):
    if isinstance(dialog, list):
        seeker_contents = [item['content'] for item in dialog if item['speaker'] == 'seeker']
        start_index = int(start_percentage * len(seeker_contents))
        end_index = int(end_percentage * len(seeker_contents))
        return ' '.join(seeker_contents[start_index:end_index])
    elif isinstance(dialog, str):
        sentences = dialog.split('.')
        seeker_contents = [sentence for sentence in sentences] #[str, str, ..., str]
        start_index = int(start_percentage * len(seeker_contents))
        end_index = int(end_percentage * len(seeker_contents))
        return seeker_contents[start_index:end_index]
    else:
        return None

def train_test_split_part_dialog(dialog, labels, start_percentage, end_percentage, test_start, test_end):
    train_data = []
    test_data = []

    if isinstance(labels, pd.Series):
        labels = labels.tolist()

    # Ensure labels are within the range of 0 to n_classes - 1
    min_label = min(labels)
    train_labels = labels
    test_labels = labels

    if isinstance(dialog, list):
        for conv in dialog:
            seeker_contents = [item['content'] for item in conv if item['speaker'] == 'seeker']
            start_index = int(start_percentage * len(seeker_contents))
            end_index = int(end_percentage * len(seeker_contents))
            test_start_index = int(test_start * len(seeker_contents))
            test_end_index = int(test_end * len(seeker_contents))

            train_data.extend(seeker_contents[start_index:end_index])
            test_data.extend(seeker_contents[test_start_index:test_end_index])

    elif isinstance(dialog, str):
        sentences = dialog.split('.')
        seeker_contents = [sentence.strip() for sentence in sentences]
        start_index = int(start_percentage * len(seeker_contents))
        end_index = int(end_percentage * len(seeker_contents))
        test_start_index = int(test_start * len(seeker_contents))
        test_end_index = int(test_end * len(seeker_contents))

        train_data = seeker_contents[start_index:end_index]
        test_data = seeker_contents[test_start_index:test_end_index]

    elif isinstance(dialog, pd.Series):
        for conv in dialog:
            seeker_contents = conv
            start_index = int(start_percentage * len(seeker_contents))
            end_index = int(end_percentage * len(seeker_contents))
            test_start_index = int(test_start * len(seeker_contents))
            test_end_index = int(test_end * len(seeker_contents))

            # train_data.append((seeker_contents[start_index:end_index],))  # Append as tuple
            # test_data.append((seeker_contents[test_start_index:test_end_index],))  # Append as tuple
            train_data.append(' '.join(seeker_contents[start_index:end_index]))
            test_data.append(' '.join(seeker_contents[test_start_index:test_end_index]))

    return tuple(train_data), tuple(test_data), train_labels, test_labels


In [2]:
dataset = load_data("ESConv.json")
dataframe = pd.DataFrame(dataset)

df = pd.DataFrame(dataset)

df['dialog'] = dataframe['dialog'].apply(lambda x: extract_dialog(x, 0, 1)) #take whole dialog from seeker
df['dialog'] = df['dialog'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

df['initial_emotion_intensity'] = dataframe['survey_score'].apply(
    lambda x: x['seeker']['initial_emotion_intensity'])
df['initial_emotion_intensity'].dropna(inplace=True)
df['initial_emotion_intensity'] = df['initial_emotion_intensity'].astype(int)
df['dialog'].dropna(inplace=True)

train_data, test_data, train_labels, test_labels = train_test_split_part_dialog(
    df['dialog'], df['initial_emotion_intensity'],
    start_percentage=0, end_percentage=0.2,
    test_start=0.2, test_end=0.4)

# Podział na dane treningowe, testowe i walidacyjne
train_texts, test_texts, train_labels, test_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

In [5]:
# Inicjalizacja modelu BERT i tokenizer'a
from transformers import RobertaTokenizerFast, TFRobertaForSequenceClassification, pipeline
checkpoint_EmoRoberta = "arpanghoshal/EmoRoBERTa"
# checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_EmoRoberta)
#model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model = TFRobertaForSequenceClassification.from_pretrained(checkpoint_EmoRoberta)

# Tokenizacja danych
train_encodings = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")
val_encodings = tokenizer(val_texts, padding=True, truncation=True, return_tensors="pt")
test_encodings = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")

# Dodanie etykiet do danych treningowych
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
test_labels = torch.tensor(test_labels)

tf_model.h5:   0%|          | 0.00/501M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at arpanghoshal/EmoRoBERTa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
  train_labels = torch.tensor(train_labels)
  val_labels = torch.tensor(val_labels)
  test_labels = torch.tensor(test_labels)


In [8]:
# Trening modelu "arpanghoshal/EmoRoBERTa"
import tensorflow as tf

# Definicja hiperparametrów
epochs = 3
batch_size = 32
learning_rate = 1e-5

# Definicja optymalizatora
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Przygotowanie danych treningowych w formacie TensorFlow Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(len(train_encodings)).batch(batch_size)

# Definicja funkcji straty
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Pętla treningowa
for epoch in range(epochs):
    for batch in train_dataset:
        inputs, labels = batch
        with tf.GradientTape() as tape:
            logits = model(inputs)[0]
            loss = loss_fn(labels, logits)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # Obliczenie straty dla danych treningowych na końcu epoki
    train_loss = 0
    for batch in train_dataset:
        inputs, labels = batch
        logits = model(inputs)[0]
        train_loss += loss_fn(labels, logits)
    train_loss /= len(train_dataset)

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss}")


ResourceExhaustedError: Exception encountered when calling layer 'self' (type TFRobertaSelfAttention).

{{function_node __wrapped__RealDiv_device_/job:localhost/replica:0/task:0/device:CPU:0}} OOM when allocating tensor with shape[32,12,512,512] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu [Op:RealDiv] name: 

Call arguments received by layer 'self' (type TFRobertaSelfAttention):
  • hidden_states=tf.Tensor(shape=(32, 512, 768), dtype=float32)
  • attention_mask=tf.Tensor(shape=(32, 1, 1, 512), dtype=float32)
  • head_mask=None
  • encoder_hidden_states=None
  • encoder_attention_mask=None
  • past_key_value=None
  • output_attentions=False
  • training=False

In [7]:
# Trening modelu "bert-base-uncased"
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 3
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    train_loss = model(input_ids=train_encodings['input_ids'], attention_mask=train_encodings['attention_mask'], labels=train_labels).loss
    train_loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}/{epochs}: Train loss {train_loss}")

AttributeError: 'TFRobertaForSequenceClassification' object has no attribute 'train'

In [ ]:
# Testowanie modelu
model.eval()
with torch.no_grad():
    val_loss = model(input_ids=val_encodings['input_ids'], attention_mask=val_encodings['attention_mask'], labels=val_labels).loss
    print(f"Validation loss: {val_loss}")

In [ ]:
# Ocena na zbiorze testowym
with torch.no_grad():
    outputs = model(input_ids=test_encodings['input_ids'], attention_mask=test_encodings['attention_mask'])
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1)
    accuracy = (predictions == test_labels).float().mean()
    print(f"Accuracy on test set: {accuracy.item()}")
