In [1]:
import json
import pandas as pd
import numpy as np
import tqdm.notebook as tq
from collections import defaultdict

import torch
import torch.nn as nn
from transformers import DistilBertTokenizer
from transformers import DistilBertForSequenceClassification


from sklearn.metrics import f1_score
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
NUM_CLASSES = 4
MAX_LEN = 100
BATCH = 8
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased"

In [3]:
def load_data(file_path):
    with open(file_path, "r", encoding='utf-8') as file:
        data = json.load(file)
    return data

def extract_seeker_data(data, key):
    result = []

    for entry in data:
        dialog = entry['dialog']
        seeker_dialog = [item['content'].strip() for item in dialog if item['speaker'] == 'seeker']

        quarter_length = max(1, len(seeker_dialog) // 4)

        if key == 'initial_emotion_intensity':
            selected_dialog = seeker_dialog[:quarter_length]
        elif key == 'final_emotion_intensity':
            selected_dialog = seeker_dialog[-quarter_length:]
        else:
            continue

        result.append({
            key: entry['survey_score']['seeker'][key],
            'dialog': selected_dialog
        })

    return result

dataset = load_data('C:/Users/juwieczo/DataspellProjects/meisd_project/pipeline/ESConv.json')

first_25_percent = extract_seeker_data(dataset, 'initial_emotion_intensity')
first_25_df = pd.DataFrame(first_25_percent)
first_25_df.head()

Unnamed: 0,initial_emotion_intensity,dialog
0,5,"[Hello, I am having a lot of anxiety about qui..."
1,5,"[hello im looking for someone to talk to, im f..."
2,4,"[Hello, I'm concerned about my job. I have bee..."
3,4,"[I am dong good. You?, I have been staying hom..."
4,5,"[Infinitely complicated., Too many decisions. ..."


In [15]:
label_counts = first_25_df['initial_emotion_intensity'].value_counts()
least_common_label = label_counts.idxmin()
first_25_df = first_25_df[first_25_df['initial_emotion_intensity'] != least_common_label]
first_25_df['initial_emotion_intensity'] = pd.to_numeric(first_25_df['initial_emotion_intensity'], errors='coerce')
first_25_df['initial_emotion_intensity'] = first_25_df['initial_emotion_intensity'] - 2

In [19]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.utterances = df['dialog']
        self.targets = self.df['initial_emotion_intensity'].astype(int).values
        self.max_len = max_len

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, index):
        utterances = str(self.utterances[index])

        inputs = self.tokenizer.encode_plus(
            utterances,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        target = torch.tensor(self.targets[index], dtype=torch.long)

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.long),
            'utterances': utterances
        }

In [20]:
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
valid_dataset = CustomDataset(first_25_df, tokenizer, MAX_LEN)
val_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH, shuffle=False, num_workers=0)

In [26]:
model = DistilBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels=NUM_CLASSES)
model.to(device)

model.load_state_dict(torch.load('C:/Users/juwieczo/DataspellProjects/meisd_project/pipeline/best_model_state.bin'))
model.eval()

def validate_model(model, dataloader, criterion):
    model.eval() 
    val_loss = 0
    correct = 0
    total = 0
    all_targets = []
    all_preds = []

    with torch.no_grad(): 
        for batch in tq.tqdm(dataloader, desc="Validation", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = criterion(logits, targets)
            val_loss += loss.item()

            _, preds = torch.max(logits, dim=1)

            correct += (preds == targets).sum().item()
            total += targets.size(0)
            all_targets.extend(targets.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    avg_loss = val_loss / len(dataloader)
    accuracy = correct / total
    f1 = f1_score(all_targets, all_preds, average='weighted')

    return avg_loss, accuracy, f1

criterion = nn.CrossEntropyLoss()
val_loss, val_accuracy, val_f1 = validate_model(model, val_data_loader, criterion)

print(f"Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}, F1 Score: {val_f1:.4f}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load('C:/Users/juwieczo/DataspellProjects/meisd_project/pipeline/best_model_state.bin'))


RuntimeError: Error(s) in loading state_dict for DistilBertForSequenceClassification:
	Missing key(s) in state_dict: "distilbert.embeddings.word_embeddings.weight", "distilbert.embeddings.position_embeddings.weight", "distilbert.embeddings.LayerNorm.weight", "distilbert.embeddings.LayerNorm.bias", "distilbert.transformer.layer.0.attention.q_lin.weight", "distilbert.transformer.layer.0.attention.q_lin.bias", "distilbert.transformer.layer.0.attention.k_lin.weight", "distilbert.transformer.layer.0.attention.k_lin.bias", "distilbert.transformer.layer.0.attention.v_lin.weight", "distilbert.transformer.layer.0.attention.v_lin.bias", "distilbert.transformer.layer.0.attention.out_lin.weight", "distilbert.transformer.layer.0.attention.out_lin.bias", "distilbert.transformer.layer.0.sa_layer_norm.weight", "distilbert.transformer.layer.0.sa_layer_norm.bias", "distilbert.transformer.layer.0.ffn.lin1.weight", "distilbert.transformer.layer.0.ffn.lin1.bias", "distilbert.transformer.layer.0.ffn.lin2.weight", "distilbert.transformer.layer.0.ffn.lin2.bias", "distilbert.transformer.layer.0.output_layer_norm.weight", "distilbert.transformer.layer.0.output_layer_norm.bias", "distilbert.transformer.layer.1.attention.q_lin.weight", "distilbert.transformer.layer.1.attention.q_lin.bias", "distilbert.transformer.layer.1.attention.k_lin.weight", "distilbert.transformer.layer.1.attention.k_lin.bias", "distilbert.transformer.layer.1.attention.v_lin.weight", "distilbert.transformer.layer.1.attention.v_lin.bias", "distilbert.transformer.layer.1.attention.out_lin.weight", "distilbert.transformer.layer.1.attention.out_lin.bias", "distilbert.transformer.layer.1.sa_layer_norm.weight", "distilbert.transformer.layer.1.sa_layer_norm.bias", "distilbert.transformer.layer.1.ffn.lin1.weight", "distilbert.transformer.layer.1.ffn.lin1.bias", "distilbert.transformer.layer.1.ffn.lin2.weight", "distilbert.transformer.layer.1.ffn.lin2.bias", "distilbert.transformer.layer.1.output_layer_norm.weight", "distilbert.transformer.layer.1.output_layer_norm.bias", "distilbert.transformer.layer.2.attention.q_lin.weight", "distilbert.transformer.layer.2.attention.q_lin.bias", "distilbert.transformer.layer.2.attention.k_lin.weight", "distilbert.transformer.layer.2.attention.k_lin.bias", "distilbert.transformer.layer.2.attention.v_lin.weight", "distilbert.transformer.layer.2.attention.v_lin.bias", "distilbert.transformer.layer.2.attention.out_lin.weight", "distilbert.transformer.layer.2.attention.out_lin.bias", "distilbert.transformer.layer.2.sa_layer_norm.weight", "distilbert.transformer.layer.2.sa_layer_norm.bias", "distilbert.transformer.layer.2.ffn.lin1.weight", "distilbert.transformer.layer.2.ffn.lin1.bias", "distilbert.transformer.layer.2.ffn.lin2.weight", "distilbert.transformer.layer.2.ffn.lin2.bias", "distilbert.transformer.layer.2.output_layer_norm.weight", "distilbert.transformer.layer.2.output_layer_norm.bias", "distilbert.transformer.layer.3.attention.q_lin.weight", "distilbert.transformer.layer.3.attention.q_lin.bias", "distilbert.transformer.layer.3.attention.k_lin.weight", "distilbert.transformer.layer.3.attention.k_lin.bias", "distilbert.transformer.layer.3.attention.v_lin.weight", "distilbert.transformer.layer.3.attention.v_lin.bias", "distilbert.transformer.layer.3.attention.out_lin.weight", "distilbert.transformer.layer.3.attention.out_lin.bias", "distilbert.transformer.layer.3.sa_layer_norm.weight", "distilbert.transformer.layer.3.sa_layer_norm.bias", "distilbert.transformer.layer.3.ffn.lin1.weight", "distilbert.transformer.layer.3.ffn.lin1.bias", "distilbert.transformer.layer.3.ffn.lin2.weight", "distilbert.transformer.layer.3.ffn.lin2.bias", "distilbert.transformer.layer.3.output_layer_norm.weight", "distilbert.transformer.layer.3.output_layer_norm.bias", "distilbert.transformer.layer.4.attention.q_lin.weight", "distilbert.transformer.layer.4.attention.q_lin.bias", "distilbert.transformer.layer.4.attention.k_lin.weight", "distilbert.transformer.layer.4.attention.k_lin.bias", "distilbert.transformer.layer.4.attention.v_lin.weight", "distilbert.transformer.layer.4.attention.v_lin.bias", "distilbert.transformer.layer.4.attention.out_lin.weight", "distilbert.transformer.layer.4.attention.out_lin.bias", "distilbert.transformer.layer.4.sa_layer_norm.weight", "distilbert.transformer.layer.4.sa_layer_norm.bias", "distilbert.transformer.layer.4.ffn.lin1.weight", "distilbert.transformer.layer.4.ffn.lin1.bias", "distilbert.transformer.layer.4.ffn.lin2.weight", "distilbert.transformer.layer.4.ffn.lin2.bias", "distilbert.transformer.layer.4.output_layer_norm.weight", "distilbert.transformer.layer.4.output_layer_norm.bias", "distilbert.transformer.layer.5.attention.q_lin.weight", "distilbert.transformer.layer.5.attention.q_lin.bias", "distilbert.transformer.layer.5.attention.k_lin.weight", "distilbert.transformer.layer.5.attention.k_lin.bias", "distilbert.transformer.layer.5.attention.v_lin.weight", "distilbert.transformer.layer.5.attention.v_lin.bias", "distilbert.transformer.layer.5.attention.out_lin.weight", "distilbert.transformer.layer.5.attention.out_lin.bias", "distilbert.transformer.layer.5.sa_layer_norm.weight", "distilbert.transformer.layer.5.sa_layer_norm.bias", "distilbert.transformer.layer.5.ffn.lin1.weight", "distilbert.transformer.layer.5.ffn.lin1.bias", "distilbert.transformer.layer.5.ffn.lin2.weight", "distilbert.transformer.layer.5.ffn.lin2.bias", "distilbert.transformer.layer.5.output_layer_norm.weight", "distilbert.transformer.layer.5.output_layer_norm.bias", "pre_classifier.weight", "pre_classifier.bias", "classifier.weight", "classifier.bias". 
	Unexpected key(s) in state_dict: "distilbert_model.embeddings.word_embeddings.weight", "distilbert_model.embeddings.position_embeddings.weight", "distilbert_model.embeddings.LayerNorm.weight", "distilbert_model.embeddings.LayerNorm.bias", "distilbert_model.transformer.layer.0.attention.q_lin.weight", "distilbert_model.transformer.layer.0.attention.q_lin.bias", "distilbert_model.transformer.layer.0.attention.k_lin.weight", "distilbert_model.transformer.layer.0.attention.k_lin.bias", "distilbert_model.transformer.layer.0.attention.v_lin.weight", "distilbert_model.transformer.layer.0.attention.v_lin.bias", "distilbert_model.transformer.layer.0.attention.out_lin.weight", "distilbert_model.transformer.layer.0.attention.out_lin.bias", "distilbert_model.transformer.layer.0.sa_layer_norm.weight", "distilbert_model.transformer.layer.0.sa_layer_norm.bias", "distilbert_model.transformer.layer.0.ffn.lin1.weight", "distilbert_model.transformer.layer.0.ffn.lin1.bias", "distilbert_model.transformer.layer.0.ffn.lin2.weight", "distilbert_model.transformer.layer.0.ffn.lin2.bias", "distilbert_model.transformer.layer.0.output_layer_norm.weight", "distilbert_model.transformer.layer.0.output_layer_norm.bias", "distilbert_model.transformer.layer.1.attention.q_lin.weight", "distilbert_model.transformer.layer.1.attention.q_lin.bias", "distilbert_model.transformer.layer.1.attention.k_lin.weight", "distilbert_model.transformer.layer.1.attention.k_lin.bias", "distilbert_model.transformer.layer.1.attention.v_lin.weight", "distilbert_model.transformer.layer.1.attention.v_lin.bias", "distilbert_model.transformer.layer.1.attention.out_lin.weight", "distilbert_model.transformer.layer.1.attention.out_lin.bias", "distilbert_model.transformer.layer.1.sa_layer_norm.weight", "distilbert_model.transformer.layer.1.sa_layer_norm.bias", "distilbert_model.transformer.layer.1.ffn.lin1.weight", "distilbert_model.transformer.layer.1.ffn.lin1.bias", "distilbert_model.transformer.layer.1.ffn.lin2.weight", "distilbert_model.transformer.layer.1.ffn.lin2.bias", "distilbert_model.transformer.layer.1.output_layer_norm.weight", "distilbert_model.transformer.layer.1.output_layer_norm.bias", "distilbert_model.transformer.layer.2.attention.q_lin.weight", "distilbert_model.transformer.layer.2.attention.q_lin.bias", "distilbert_model.transformer.layer.2.attention.k_lin.weight", "distilbert_model.transformer.layer.2.attention.k_lin.bias", "distilbert_model.transformer.layer.2.attention.v_lin.weight", "distilbert_model.transformer.layer.2.attention.v_lin.bias", "distilbert_model.transformer.layer.2.attention.out_lin.weight", "distilbert_model.transformer.layer.2.attention.out_lin.bias", "distilbert_model.transformer.layer.2.sa_layer_norm.weight", "distilbert_model.transformer.layer.2.sa_layer_norm.bias", "distilbert_model.transformer.layer.2.ffn.lin1.weight", "distilbert_model.transformer.layer.2.ffn.lin1.bias", "distilbert_model.transformer.layer.2.ffn.lin2.weight", "distilbert_model.transformer.layer.2.ffn.lin2.bias", "distilbert_model.transformer.layer.2.output_layer_norm.weight", "distilbert_model.transformer.layer.2.output_layer_norm.bias", "distilbert_model.transformer.layer.3.attention.q_lin.weight", "distilbert_model.transformer.layer.3.attention.q_lin.bias", "distilbert_model.transformer.layer.3.attention.k_lin.weight", "distilbert_model.transformer.layer.3.attention.k_lin.bias", "distilbert_model.transformer.layer.3.attention.v_lin.weight", "distilbert_model.transformer.layer.3.attention.v_lin.bias", "distilbert_model.transformer.layer.3.attention.out_lin.weight", "distilbert_model.transformer.layer.3.attention.out_lin.bias", "distilbert_model.transformer.layer.3.sa_layer_norm.weight", "distilbert_model.transformer.layer.3.sa_layer_norm.bias", "distilbert_model.transformer.layer.3.ffn.lin1.weight", "distilbert_model.transformer.layer.3.ffn.lin1.bias", "distilbert_model.transformer.layer.3.ffn.lin2.weight", "distilbert_model.transformer.layer.3.ffn.lin2.bias", "distilbert_model.transformer.layer.3.output_layer_norm.weight", "distilbert_model.transformer.layer.3.output_layer_norm.bias", "distilbert_model.transformer.layer.4.attention.q_lin.weight", "distilbert_model.transformer.layer.4.attention.q_lin.bias", "distilbert_model.transformer.layer.4.attention.k_lin.weight", "distilbert_model.transformer.layer.4.attention.k_lin.bias", "distilbert_model.transformer.layer.4.attention.v_lin.weight", "distilbert_model.transformer.layer.4.attention.v_lin.bias", "distilbert_model.transformer.layer.4.attention.out_lin.weight", "distilbert_model.transformer.layer.4.attention.out_lin.bias", "distilbert_model.transformer.layer.4.sa_layer_norm.weight", "distilbert_model.transformer.layer.4.sa_layer_norm.bias", "distilbert_model.transformer.layer.4.ffn.lin1.weight", "distilbert_model.transformer.layer.4.ffn.lin1.bias", "distilbert_model.transformer.layer.4.ffn.lin2.weight", "distilbert_model.transformer.layer.4.ffn.lin2.bias", "distilbert_model.transformer.layer.4.output_layer_norm.weight", "distilbert_model.transformer.layer.4.output_layer_norm.bias", "distilbert_model.transformer.layer.5.attention.q_lin.weight", "distilbert_model.transformer.layer.5.attention.q_lin.bias", "distilbert_model.transformer.layer.5.attention.k_lin.weight", "distilbert_model.transformer.layer.5.attention.k_lin.bias", "distilbert_model.transformer.layer.5.attention.v_lin.weight", "distilbert_model.transformer.layer.5.attention.v_lin.bias", "distilbert_model.transformer.layer.5.attention.out_lin.weight", "distilbert_model.transformer.layer.5.attention.out_lin.bias", "distilbert_model.transformer.layer.5.sa_layer_norm.weight", "distilbert_model.transformer.layer.5.sa_layer_norm.bias", "distilbert_model.transformer.layer.5.ffn.lin1.weight", "distilbert_model.transformer.layer.5.ffn.lin1.bias", "distilbert_model.transformer.layer.5.ffn.lin2.weight", "distilbert_model.transformer.layer.5.ffn.lin2.bias", "distilbert_model.transformer.layer.5.output_layer_norm.weight", "distilbert_model.transformer.layer.5.output_layer_norm.bias", "linear.weight", "linear.bias". 