In [1]:
import json
import pandas as pd
import numpy as np
import tqdm.notebook as tq
from collections import defaultdict

import torch
import torch.nn as nn
from transformers import DistilBertTokenizer
from transformers import DistilBertForSequenceClassification
from transformers import DistilBertModel


from sklearn.metrics import f1_score
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
NUM_CLASSES = 4
MAX_LEN = 100
BATCH = 8
DROPOUT_RATE = 0.4
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased"

In [3]:
def load_data(file_path):
    with open(file_path, "r", encoding='utf-8') as file:
        data = json.load(file)
    return data

def extract_seeker_data(data, key):
    result = []

    for entry in data:
        dialog = entry['dialog']
        seeker_dialog = [item['content'].strip() for item in dialog if item['speaker'] == 'seeker']

        quarter_length = max(1, len(seeker_dialog) // 4)

        if key == 'initial_emotion_intensity':
            selected_dialog = seeker_dialog[:quarter_length]
        elif key == 'final_emotion_intensity':
            selected_dialog = seeker_dialog[-quarter_length:]
        else:
            continue

        result.append({
            key: entry['survey_score']['seeker'][key],
            'dialog': selected_dialog
        })

    return result

dataset = load_data('C:/Users/juwieczo/DataspellProjects/meisd_project/pipeline/ESConv.json')

first_25_percent = extract_seeker_data(dataset, 'initial_emotion_intensity')
first_25_df = pd.DataFrame(first_25_percent)
first_25_df.head()

Unnamed: 0,initial_emotion_intensity,dialog
0,5,"[Hello, I am having a lot of anxiety about qui..."
1,5,"[hello im looking for someone to talk to, im f..."
2,4,"[Hello, I'm concerned about my job. I have bee..."
3,4,"[I am dong good. You?, I have been staying hom..."
4,5,"[Infinitely complicated., Too many decisions. ..."


In [4]:
label_counts = first_25_df['initial_emotion_intensity'].value_counts()
least_common_label = label_counts.idxmin()
first_25_df = first_25_df[first_25_df['initial_emotion_intensity'] != least_common_label]
first_25_df['initial_emotion_intensity'] = pd.to_numeric(first_25_df['initial_emotion_intensity'], errors='coerce')
first_25_df['initial_emotion_intensity'] = first_25_df['initial_emotion_intensity'] - 2

In [5]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.utterances = df['dialog']
        self.targets = self.df['initial_emotion_intensity'].astype(int).values
        self.max_len = max_len

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, index):
        utterances = str(self.utterances[index])

        inputs = self.tokenizer.encode_plus(
            utterances,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        target = torch.tensor(self.targets[index], dtype=torch.long)

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.long),
            'utterances': utterances
        }

In [6]:
class DistilBERT_IntensityClass(torch.nn.Module):
    def __init__(self, distilbert_model, dropout_rate=DROPOUT_RATE, num_classes=NUM_CLASSES):
        super(DistilBERT_IntensityClass, self).__init__()
        self.distilbert_model = distilbert_model
        self.dropout = torch.nn.Dropout(p=DROPOUT_RATE)
        self.linear = torch.nn.Linear(self.distilbert_model.config.hidden_size, num_classes)

    def forward(self, input_ids, attn_mask):
        # DistilBERT model processing
        output = self.distilbert_model(input_ids, attention_mask=attn_mask)

        # Use the last hidden state (the embedding for [CLS] token is at index 0)
        cls_output = output.last_hidden_state[:, 0, :]  # Shape: [batch_size, hidden_size]
        # Apply dropout
        dropout_output = self.dropout(cls_output)
        # Get final class logits
        linear_output = self.linear(dropout_output)
        return linear_output


In [8]:
distilbert_model = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
model = DistilBERT_IntensityClass(distilbert_model)
model.load_state_dict(torch.load("best_model_state.bin"))
model.to(device)
model.eval()

tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
valid_dataset = CustomDataset(first_25_df, tokenizer, MAX_LEN)
val_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH, shuffle=False, num_workers=0)

  model.load_state_dict(torch.load("best_model_state.bin"))


In [10]:
def validate_model(data_loader, model):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['targets'].to(device)

            outputs = model(input_ids=input_ids, attn_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return predictions, true_labels

# Run validation
predictions, true_labels = validate_model(val_data_loader, model)

# Calculate metrics
from sklearn.metrics import classification_report
print(classification_report(true_labels, predictions))


KeyError: 411