In [33]:
import json
import pandas as pd

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import ElectraForPreTraining, ElectraTokenizerFast
from transformers import AdamW

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
# Define hyperparameters
num_epochs = 10
batch_size = 16
learning_rate = 2e-5

In [34]:
# Load tokenizer and model
model = ElectraForPreTraining.from_pretrained("google/electra-base-discriminator")
tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-base-discriminator")
num_classes = 5
#model = ElectraForSequenceClassification.from_pretrained('google/bio-electra-base-discriminator', num_labels=num_classes)
model.to(device)

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForPreTraining: ['electra.embeddings_project.bias', 'electra.embeddings_project.weight']
- This IS expected if you are initializing ElectraForPreTraining from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForPreTraining from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

ElectraForPreTraining(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

# Prepare data

In [18]:
def load_data(file_path):
    with open(file_path, "r", encoding='utf-8') as file:
        data = json.load(file)
    return data

dataset = load_data("ESConv.json")
dataframe = pd.DataFrame(dataset)

In [20]:
def extract_dialog(dialog, start_percentage, end_percentage):
    if isinstance(dialog, list):
        seeker_contents = [item['content'] for item in dialog if item['speaker'] == 'seeker']
        start_index = int(start_percentage * len(seeker_contents))
        end_index = int(end_percentage * len(seeker_contents))
        return ' '.join(seeker_contents[start_index:end_index])
    elif isinstance(dialog, str):
        sentences = dialog.split('.')
        seeker_contents = [sentence for sentence in sentences] #[str, str, ..., str]
        start_index = int(start_percentage * len(seeker_contents))
        end_index = int(end_percentage * len(seeker_contents))
        return seeker_contents[start_index:end_index]
    else:
        return None

In [22]:
df = pd.DataFrame()
df['dialog'] = dataframe['dialog'].apply(lambda x: extract_dialog(x, 0, 1)) #take whole dialog from seeker
df['dialog'] = df['dialog'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

df['initial_emotion_intensity'] = dataframe['survey_score'].apply(
    lambda x: x['seeker']['initial_emotion_intensity'])
df['initial_emotion_intensity'].dropna(inplace=True)
df['initial_emotion_intensity'] = df['initial_emotion_intensity'].astype(int)
df['dialog'].dropna(inplace=True)
#train_loader = DataLoader(df, batch_size=32, shuffle=True)

In [23]:
df

Unnamed: 0,dialog,initial_emotion_intensity
0,Hello\n I am having a lot of anxiety about qui...,5
1,hello im looking for someone to talk to im fi...,5
2,Hello I'm concerned about my job. I have been ...,4
3,I am dong good. You?\n I have been staying hom...,4
4,Infinitely complicated.\n Too many decisions. ...,5
...,...,...
1295,I feel sleepy but can not sleep It has always ...,5
1296,I am fine. thanks. how about you ?\n I lost my...,4
1297,"HI how are you today\n Doing well, thanks.\n N...",3
1298,Hello\n I am a little down today. How are you...,3


In [24]:
def train_test_split_part_dialog(dialog, labels, start_percentage, end_percentage, test_start, test_end):
    train_data = []
    test_data = []

    if isinstance(labels, pd.Series):
        labels = labels.tolist()

    # Ensure labels are within the range of 0 to n_classes - 1
    min_label = min(labels)
    train_labels = labels
    test_labels = labels

    if isinstance(dialog, list):
        for conv in dialog:
            seeker_contents = [item['content'] for item in conv if item['speaker'] == 'seeker']
            start_index = int(start_percentage * len(seeker_contents))
            end_index = int(end_percentage * len(seeker_contents))
            test_start_index = int(test_start * len(seeker_contents))
            test_end_index = int(test_end * len(seeker_contents))

            train_data.extend(seeker_contents[start_index:end_index])
            test_data.extend(seeker_contents[test_start_index:test_end_index])

    elif isinstance(dialog, str):
        sentences = dialog.split('.')
        seeker_contents = [sentence.strip() for sentence in sentences]
        start_index = int(start_percentage * len(seeker_contents))
        end_index = int(end_percentage * len(seeker_contents))
        test_start_index = int(test_start * len(seeker_contents))
        test_end_index = int(test_end * len(seeker_contents))

        train_data = seeker_contents[start_index:end_index]
        test_data = seeker_contents[test_start_index:test_end_index]

    elif isinstance(dialog, pd.Series):
        for conv in dialog:
            seeker_contents = conv
            start_index = int(start_percentage * len(seeker_contents))
            end_index = int(end_percentage * len(seeker_contents))
            test_start_index = int(test_start * len(seeker_contents))
            test_end_index = int(test_end * len(seeker_contents))

            # train_data.append((seeker_contents[start_index:end_index],))  # Append as tuple
            # test_data.append((seeker_contents[test_start_index:test_end_index],))  # Append as tuple
            train_data.append(' '.join(seeker_contents[start_index:end_index]))
            test_data.append(' '.join(seeker_contents[test_start_index:test_end_index]))

    return tuple(train_data), tuple(test_data), train_labels, test_labels

In [25]:
train_data, test_data, train_labels, test_labels = train_test_split_part_dialog(
    df['dialog'], df['initial_emotion_intensity'],
    start_percentage=0, end_percentage=0.2,
    test_start=0.2, test_end=0.4)

In [28]:
# standard division on test and train
train_dataset, val_dataset, train_labels, val_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=1)

In [29]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [36]:
# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate, no_deprecation_warning=True)
loss_fn = torch.nn.CrossEntropyLoss()

In [37]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    for batch in train_dataloader:
        inputs = tokenizer(batch['text'], padding=True, truncation=True, return_tensors='pt')
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(**inputs)
        logits = outputs.logits
        loss = loss_fn(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted_labels = torch.max(logits, dim=1)
        total_correct += (predicted_labels == labels).sum().item()
        total_samples += labels.size(0)

    train_loss = total_loss / len(train_dataloader)
    train_accuracy = total_correct / total_samples

    # Evaluation on validation set
    model.eval()
    with torch.no_grad():
        total_val_loss = 0
        total_val_correct = 0
        total_val_samples = 0

        for batch in val_dataloader:
            inputs = tokenizer(batch['text'], padding=True, truncation=True, return_tensors='pt')
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = batch['label'].to(device)

            outputs = model(**inputs)
            logits = outputs.logits
            loss = loss_fn(logits, labels)

            total_val_loss += loss.item()
            _, predicted_labels = torch.max(logits, dim=1)
            total_val_correct += (predicted_labels == labels).sum().item()
            total_val_samples += labels.size(0)

        val_loss = total_val_loss / len(val_dataloader)
        val_accuracy = total_val_correct / total_val_samples

    # Print epoch results
    print(f"Epoch {epoch+1}/{num_epochs}:")
    print(f"Train Loss: {train_loss:.4f} - Train Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {val_loss:.4f} - Validation Accuracy: {val_accuracy:.4f}")

TypeError: list indices must be integers or slices, not str