In [1]:
import os
import json
import pandas as pd
import sys
import torch
from transformers import BertTokenizer, BertModel


module_path = os.path.abspath(os.path.join('D:/julixus/MEISD/meisd_project/reprocessing.py'))
if module_path not in sys.path:
    sys.path.append(module_path)

import preprocessing

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
MAX_LEN = 50
BATCH = 8
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

In [3]:
def load_data(file_path):
    with open(file_path, "r", encoding='utf-8') as file:
        data = json.load(file)
    return data

dataset = load_data("MEISD/ESConv.json")
dataframe = pd.DataFrame(dataset)

In [4]:
emotion_df = preprocessing.dataset_to_df(dataset)
emotion_df['Sentiment'] = emotion_df['Emotion Type'].apply(preprocessing.emotionToSentiment)

Unnamed: 0,Conversation ID,Emotion Type,Sentiment
0,1,anxiety,negative
1,2,anger,negative
2,3,fear,negative
3,4,depression,negative
4,5,depression,negative


In [5]:
seeker_dataframe = preprocessing.seeker_df(dataset)

final_df = pd.merge(
    seeker_dataframe,
    emotion_df[['Conversation ID', 'Sentiment']],
    left_on='ConversationID',
    right_on='Conversation ID',
    how='inner'
)

final_df.drop(columns=['Conversation ID'], inplace=True)

final_df.head()

Unnamed: 0,ConversationID,Seeker Dialog,Sentiment
0,1,hello im looking for someone to talk to,negative
1,1,im fine how are you\n,negative
2,1,thats great and no its not snowing its very co...,negative
3,1,merry christmas to you also \n,negative
4,1,im having some issues with friends not actuall...,negative


In [6]:
sentiment_map = {
    'positive': 1,
    'negative': 0,
    #'neutral': 2
}
final_df.loc[:, 'Sentiment'] = pd.to_numeric(final_df['Sentiment'].map(sentiment_map).fillna(2).astype(int))

In [7]:
df_data = final_df[['Seeker Dialog', 'Sentiment']].copy()
df_data.rename(columns={'Seeker Dialog': 'Utterances', 'Sentiment': 'label'}, inplace=True)
df_data.head()

Unnamed: 0,Utterances,label
0,hello im looking for someone to talk to,0
1,im fine how are you\n,0
2,thats great and no its not snowing its very co...,0
3,merry christmas to you also \n,0
4,im having some issues with friends not actuall...,0


In [8]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.utterances = list(df['Utterances'])
        self.targets = self.df['label'].astype(int).values
        self.max_len = max_len

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, index):
        utterances = str(self.utterances[index]) 

        inputs = self.tokenizer.encode_plus(
            utterances,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        target = torch.tensor(self.targets[index], dtype=torch.long)  

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.long),  
            'utterances': utterances
        }

In [9]:
from sklearn.model_selection import train_test_split

# split into train and test
df_train, df_test = train_test_split(df_data, random_state=77, test_size=0.30, shuffle=True)
# split test into test and validation datasets
df_test, df_valid = train_test_split(df_test, random_state=88, test_size=0.50, shuffle=True)

In [10]:
class BERTSentimentClass(torch.nn.Module):
    def __init__(self):
        super(BERTSentimentClass, self).__init__()
        self.bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=True)
        self.dropout = torch.nn.Dropout(p=0.3) #0.5
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, 3)


    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(input_ids, attention_mask=attn_mask, token_type_ids=token_type_ids)
        #pooler_output = self.pooler_output
        dropout_output = self.dropout(output.pooler_output)
        linear_output = self.linear(dropout_output)
        #output = self.dropout(linear_output)
        # output = self.softmax(linear_output)
        return linear_output


In [11]:
# Load the model
model_path = 'best_model_state.bin'
model = BERTSentimentClass()
model.load_state_dict(torch.load(model_path))
model.to(device)
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [12]:
escov_dataset = CustomDataset(df_data, tokenizer, MAX_LEN)
escov_data_loader = torch.utils.data.DataLoader(escov_dataset, batch_size=BATCH, shuffle=False)

In [13]:
# Function to predict
def predict(model, data_loader):
    model.eval() 
    predictions = []

    with torch.no_grad():  
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)

            outputs = model(input_ids, attention_mask, token_type_ids)

            preds = torch.argmax(outputs, dim=1)  
            predictions.extend(preds.cpu().numpy())  

    return predictions

# Get predictions for ESCov dataset
predictions = predict(model, escov_data_loader)


In [14]:
from sklearn.metrics import f1_score

true_labels = df_data['label'].values

# Calculate F1 score
f1 = f1_score(true_labels, predictions, average='weighted')
print(f"F1 Score: {f1}")


ValueError: Classification metrics can't handle a mix of unknown and multiclass targets

In [16]:
import numpy as np
print("True labels unique:", np.unique(true_labels))

print("Predicted labels unique:", np.unique(true_labels))


True labels unique: [0]
Predicted labels unique: [0]


In [17]:
from sklearn.metrics import classification_report

report = classification_report(true_labels, predictions)
print(report)


ValueError: Classification metrics can't handle a mix of unknown and multiclass targets

In [19]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(true_labels, predictions)


ValueError: Classification metrics can't handle a mix of unknown and multiclass targets

In [20]:
y_true = np.argmax(true_labels, axis=1)  # Jeśli y_true jest w formacie One-Hot
y_pred = np.argmax(predictions, axis=1)  # Jeśli y_pred jest w formacie One-Hot


AxisError: axis 1 is out of bounds for array of dimension 1