In [1]:
import json
import pandas as pd

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import ElectraForPreTraining, ElectraTokenizerFast
from transformers import AdamW

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Define hyperparameters
num_epochs = 10
batch_size = 16
learning_rate = 2e-5

In [4]:
# Load tokenizer and model
model = ElectraForPreTraining.from_pretrained("google/electra-base-discriminator")
tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-base-discriminator")
num_classes = 5
#model = ElectraForSequenceClassification.from_pretrained('google/bio-electra-base-discriminator', num_labels=num_classes)
model.to(device)

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForPreTraining: ['electra.embeddings_project.bias', 'electra.embeddings_project.weight']
- This IS expected if you are initializing ElectraForPreTraining from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForPreTraining from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ElectraForPreTraining(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

# Prepare data

In [5]:
def load_data(file_path):
    with open(file_path, "r", encoding='utf-8') as file:
        data = json.load(file)
    return data

dataset = load_data("ESConv.json")
dataframe = pd.DataFrame(dataset)

In [6]:
def extract_dialog(dialog, start_percentage, end_percentage):
    if isinstance(dialog, list):
        seeker_contents = [item['content'] for item in dialog if item['speaker'] == 'seeker']
        start_index = int(start_percentage * len(seeker_contents))
        end_index = int(end_percentage * len(seeker_contents))
        return ' '.join(seeker_contents[start_index:end_index])
    elif isinstance(dialog, str):
        sentences = dialog.split('.')
        seeker_contents = [sentence for sentence in sentences] #[str, str, ..., str]
        start_index = int(start_percentage * len(seeker_contents))
        end_index = int(end_percentage * len(seeker_contents))
        return seeker_contents[start_index:end_index]
    else:
        return None

In [7]:
df = pd.DataFrame()
df['dialog'] = dataframe['dialog'].apply(lambda x: extract_dialog(x, 0, 1)) #take whole dialog from seeker
df['dialog'] = df['dialog'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

df['initial_emotion_intensity'] = dataframe['survey_score'].apply(
    lambda x: x['seeker']['initial_emotion_intensity'])
df['initial_emotion_intensity'].dropna(inplace=True)
df['initial_emotion_intensity'] = df['initial_emotion_intensity'].astype(int)
df['dialog'].dropna(inplace=True)
#train_loader = DataLoader(df, batch_size=32, shuffle=True)

In [8]:
df

Unnamed: 0,dialog,initial_emotion_intensity
0,Hello\n I am having a lot of anxiety about qui...,5
1,hello im looking for someone to talk to im fi...,5
2,Hello I'm concerned about my job. I have been ...,4
3,I am dong good. You?\n I have been staying hom...,4
4,Infinitely complicated.\n Too many decisions. ...,5
...,...,...
1295,I feel sleepy but can not sleep It has always ...,5
1296,I am fine. thanks. how about you ?\n I lost my...,4
1297,"HI how are you today\n Doing well, thanks.\n N...",3
1298,Hello\n I am a little down today. How are you...,3


In [9]:
def train_test_split_part_dialog(dialog, labels, start_percentage, end_percentage, test_start, test_end):
    train_data = []
    test_data = []

    if isinstance(labels, pd.Series):
        labels = labels.tolist()

    # Ensure labels are within the range of 0 to n_classes - 1
    min_label = min(labels)
    train_labels = labels
    test_labels = labels

    if isinstance(dialog, list):
        for conv in dialog:
            seeker_contents = [item['content'] for item in conv if item['speaker'] == 'seeker']
            start_index = int(start_percentage * len(seeker_contents))
            end_index = int(end_percentage * len(seeker_contents))
            test_start_index = int(test_start * len(seeker_contents))
            test_end_index = int(test_end * len(seeker_contents))

            train_data.extend(seeker_contents[start_index:end_index])
            test_data.extend(seeker_contents[test_start_index:test_end_index])

    elif isinstance(dialog, str):
        sentences = dialog.split('.')
        seeker_contents = [sentence.strip() for sentence in sentences]
        start_index = int(start_percentage * len(seeker_contents))
        end_index = int(end_percentage * len(seeker_contents))
        test_start_index = int(test_start * len(seeker_contents))
        test_end_index = int(test_end * len(seeker_contents))

        train_data = seeker_contents[start_index:end_index]
        test_data = seeker_contents[test_start_index:test_end_index]

    elif isinstance(dialog, pd.Series):
        for conv in dialog:
            seeker_contents = conv
            start_index = int(start_percentage * len(seeker_contents))
            end_index = int(end_percentage * len(seeker_contents))
            test_start_index = int(test_start * len(seeker_contents))
            test_end_index = int(test_end * len(seeker_contents))

            # train_data.append((seeker_contents[start_index:end_index],))  # Append as tuple
            # test_data.append((seeker_contents[test_start_index:test_end_index],))  # Append as tuple
            train_data.append(' '.join(seeker_contents[start_index:end_index]))
            test_data.append(' '.join(seeker_contents[test_start_index:test_end_index]))

    return tuple(train_data), tuple(test_data), train_labels, test_labels

In [10]:
train_data, test_data, train_labels, test_labels = train_test_split_part_dialog(
    df['dialog'], df['initial_emotion_intensity'],
    start_percentage=0, end_percentage=0.2,
    test_start=0.2, test_end=0.4)

In [11]:
# standard division on test and train
train_dataset, val_dataset, train_labels, val_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=1)

In [12]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.labels[idx], self.data[idx]

train_dataset = CustomDataset(train_dataset, train_labels)
test_dataset = CustomDataset(val_dataset, val_labels)

In [13]:
for i in train_dataset:
    print(i)

(5, "H i ,   I ' m   p r e t t y   m u c h   d o w n \n   t r y i n g   t o   k e e p   m y   h e a d   a f l o a t \n   I   l o s t   m y   j o b   s o m e   m o n t h s   b a c k   a n d   i t   h a s n ' t   r e a l l y   b e e n   e a s y   f o r   m e   S o m e t i m e s   w h e n   I   e n t e r   a   p l a c e   w h e r e   m y   f r i e n d s  ")
(4, "G o o d   m o r n i n g \n   N o t   t o o   b a d   j u s t   w a n t e d   t o   t a l k   w i t h   y o u   a b o u t   m y   a l c o h o l   a d d i c t i o n   i f   y o u   d o n ' t   m i n d . \n   T h e   t h i n g   i s   I   a m   d r i n k i n g   b e e r   a l m o s t   e v e r y   d a y   s i n c e   I   s t a r t e d   t o   w o r k   f r o m   h o m e .   A n d   w i t h   w e a t h e r   s o   n i")
(4, "H e l l o   t h e r e . \n   C o u l d   b e   b e t t e r ,   I   f e e l   a n x i e t y   a b o u t   m y   U n i   e x a m s .   I   f e e l   I   w i l l   f a i l   i t   a l l   t h i s   y e a r \n   I   j

In [14]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [15]:
for i, batch in train_dataloader:
    print("Texts:", batch)
    print("Labels:", i)

Texts: ("I ' m   g o o d .   H o w   a r e   y o u ? \n   M y   b o a r d   p r e s i d e n t   i s   g e t t i n g   t o o   i n v o l v e d   w i t h   t h e   s t a f f   a n d   i t   i s   d r i v i n g   m e   n u t s ! \n   I t   i s .   H e   j u s t   s a i d   y e s t e r d a y   t h a t   h e   w o u l d   n o t   g o   t o   m y   s t a f f   b u t   h e   i s .   R i g h t   a f t e r   t h e   m e e t i n g .   T h a n k", "I   f e e l   h e a v y   p r e s s u r e s   f r o m   m y   p a r e n t s ,   t h e y   a l s o   w a n t   m e   t o   b e   t h e   t o p   o f   t h e   c l a s s   b u t   I   f i n d   i t   t o u g h   b e c a u s e   t h e y   a r e   o t h e r   t o p   s t u d e n t s   P l e a s e   w h a t   d o   I   d o ? \n   H e l l o ? \n   N o t   s o   g o o d ,   I ' m   f e e l i n g   d o w n \n   M y   p a r e n t s   k e e p   p u t t i n g   p r e s s u r e s   o n   m e   t o   b e   t h e   b e s t   i n   t h e   c l a s s   a n d   w i t h

In [16]:
# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate, no_deprecation_warning=True)
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    for i, batch in train_dataloader:
        print('Training...')
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = i.to(device)

        optimizer.zero_grad()
        outputs = model(**inputs)
        logits = outputs.logits
        loss = loss_fn(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted_labels = torch.max(logits, dim=1)
        total_correct += (predicted_labels == labels).sum().item()
        total_samples += labels.size(0)

    train_loss = total_loss / len(train_dataloader)
    train_accuracy = total_correct / total_samples

    # Evaluation on validation set
    model.eval()
    with torch.no_grad():
        total_val_loss = 0
        total_val_correct = 0
        total_val_samples = 0

        for i, batch in val_dataloader:
            print('Validation...')
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = i.to(device)

            outputs = model(**inputs)
            logits = outputs.logits
            loss = loss_fn(logits, labels)

            total_val_loss += loss.item()
            _, predicted_labels = torch.max(logits, dim=1)
            total_val_correct += (predicted_labels == labels).sum().item()
            total_val_samples += labels.size(0)

        val_loss = total_val_loss / len(val_dataloader)
        val_accuracy = total_val_correct / total_val_samples

    # Print epoch results
    print(f"Epoch {epoch+1}/{num_epochs}:")
    print(f"Train Loss: {train_loss:.4f} - Train Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {val_loss:.4f} - Validation Accuracy: {val_accuracy:.4f}")