In [1]:
import ray
from ray import tune

import torch.optim as optim
import pandas as pd
import numpy as np
import shutil

import tqdm.notebook as tq
from collections import defaultdict

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn

from transformers import BertTokenizer, BertModel
from transformers import AdamW, get_linear_schedule_with_warmup

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
# Hyperparameters
MAX_LEN = 30  #50 #128  # wiekszosc tokenow zdaje sie byc ponizej 40, klasycznie wklada sie tu 256, my przystaniemy na 30
TRAIN_BATCH_SIZE = 32  #8 #16 #32 
#Czasami, przy bardzo niskim tempie uczenia i zbyt dużych batchach, model może wolniej konwergować. Spróbuj zmniejszyć wielkość batcha, np. z 16 do 8.
VALID_BATCH_SIZE = 32  #8 #16 #32
TEST_BATCH_SIZE = 32  #8 #16 #32
EPOCHS = 10
#LEARNING_RATE = 1e-05  #1e-05
THRESHOLD = 0.5  # threshold for the sigmoid

In [3]:
df_data = pd.read_csv('multi_label_binarizer_MEISD.csv')
df_data.head()
# For the multilabel classification we use:
columns = ['Utterances', 'sentiment_0', 'sentiment_1', 'sentiment_2']
multi_columns = df_data[columns].copy()
multi_columns
df_data['label'] = multi_columns[['sentiment_0', 'sentiment_1', 'sentiment_2']].idxmax(axis=1)
df_data['label'] = df_data['label'].apply(lambda x: int(x.split('_')[1]))
df_data = df_data[['Utterances', 'label']]

In [4]:
print(df_data.isnull().sum())

Utterances    0
label         0
dtype: int64


In [5]:
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

if tokenizer is None:
    raise ValueError("Failed to load tokenizer. Ensure the model name is correct and Hugging Face's transformers library is properly installed.")


test_text = "We are testing BERT tokenizer."
encodings = tokenizer.encode_plus(test_text,
                                  add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
                                  max_length=50,
                                  truncation=True,
                                  padding="max_length",
                                  return_attention_mask=True,
                                  return_tensors="pt")
token_lens = []

for txt in df_data['Utterances']:
    tokens = tokenizer.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))

In [6]:
df_train, df_test = train_test_split(df_data, random_state=77, test_size=0.30, shuffle=True)
df_test, df_valid = train_test_split(df_test, random_state=88, test_size=0.50, shuffle=True)

columns = multi_columns.columns

categor_freq = multi_columns[columns[1:]].sum() / multi_columns.shape[0]
categor_freq
class_distribution = multi_columns[['sentiment_0', 'sentiment_1', 'sentiment_2']].sum()

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.utterances = list(df['Utterances'])
        self.targets = self.df['label'].astype(int).values
        self.max_len = max_len

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, index):
        utterance = self.utterances[index]

        # Convert the utterance to a string to avoid any non-string types
        utterance = str(utterance)

        try:
            inputs = self.tokenizer.encode_plus(
                utterance,
                add_special_tokens=True,
                max_length=self.max_len,
                padding='max_length',
                return_token_type_ids=True,
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt'
            )
        except Exception as e:
            print(f"Tokenization error at index {index} for utterance: '{utterance}'")
            print(f"Exception: {e}")
            raise

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.long),
        }


target_list = list(df_data.columns)
target_list = target_list[1:]

train_dataset = CustomDataset(df_train, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(df_valid, tokenizer, MAX_LEN)
test_dataset = CustomDataset(df_test, tokenizer, MAX_LEN)


train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0  # 0 means no parallel loading
)

val_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,  # Validation data should not be shuffled
    num_workers=0
)

test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

data = next(iter(train_data_loader))

In [7]:
bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
last_hidden_state, pooled_output = bert_model(
    input_ids=encodings['input_ids'],
    attention_mask=encodings['attention_mask']
)
bert_model.config.hidden_size

class BERTSentimentClass(torch.nn.Module):
    def __init__(self):
        super(BERTSentimentClass, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
        self.dropout = torch.nn.Dropout(p=0.3)
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, 3)

    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = output.pooler_output  # Corrected here
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        return linear_output

model = BERTSentimentClass()

# # Freezing BERT layers:
#for name, param in model.bert_model.named_parameters():
#    if "encoder.layer.10" in name or "encoder.layer.11" in name:
#        param.requires_grad = True
#    else:
#        param.requires_grad = False

model.to(device)

input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

In [8]:
class_distribution = multi_columns[['sentiment_0', 'sentiment_1', 'sentiment_2']].sum()
total_samples = sum(class_distribution)
class_weights = [total_samples / count for count in class_distribution]
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss(weight=class_weights)(outputs, targets)

writer = SummaryWriter(log_dir='logs')

optimizer = AdamW(model.parameters(), lr=0.0001, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5, verbose=True)



In [9]:
def eval_model(model, val_data_loader):
    model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in val_data_loader:
            inputs = batch['input_ids'].to(device)
            attn_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['targets'].to(device)

            outputs = model(inputs, attn_mask, token_type_ids)
            loss = loss_fn(outputs, labels)
            val_loss += loss.item()

            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_data_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    return avg_val_loss, accuracy

@ray.remote
def train_model_with_validation(config):
    global train_loader, val_loader  # Use the pre-defined loaders
    learning_rate = config["lr"]
    batch_size = config["batch_size"]

    model = BERTSentimentClass().to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(EPOCHS):
        model.train()
        running_loss = 0.0
        for batch in train_loader:
            inputs = batch['input_ids'].to(device)
            attn_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['targets'].to(device)

            optimizer.zero_grad()
            outputs = model(inputs, attn_mask, token_type_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Evaluate on validation data
        val_loss, val_accuracy = eval_model(model, val_loader)

        # Log using Ray Tune
        tune.report(loss=running_loss/len(train_loader), val_loss=val_loss, val_accuracy=val_accuracy)

config = {
    "lr": tune.loguniform(1e-5, 1e-1),
    "batch_size": tune.choice([16, 32, 64]),
}

In [15]:
ray.init(ignore_reinit_error=True, log_to_driver=True, _temp_dir="D:/julixus/MEISD/meisd_project/ray_temp")

analysis = tune.run(
    tune.with_parameters(train_model_with_validation, train_loader=train_data_loader, val_loader=val_data_loader),
    config=config,
    num_samples=1,  # Liczba eksperymentów
    resources_per_trial={"cpu": 1},
    name="BERT_Sentiment_Experiment",
    local_dir= 'D:/julixus/MEISD/meisd_project/ray_tune'
)

2024-11-06 18:30:55,284	INFO worker.py:1816 -- Started a local Ray instance.


DeprecationWarning: The `local_dir` argument is deprecated. You should set the `storage_path` instead. See the docs: https://docs.ray.io/en/latest/train/user-guides/persistent-storage.html#setting-the-local-staging-directory

In [13]:
print("Best config: ", analysis.get_best_config(metric="loss", mode="min"))

NameError: name 'analysis' is not defined

In [ ]:
print(f"Best trial final validation loss: {analysis.best_trial.last_result['loss']}")