# Load data

In [None]:
import json
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, TimeDistributed, GlobalAveragePooling1D, Input, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, multilabel_classification_report


In [None]:
# Slot names, in the order used to lay out the BIO tag vocabulary below.
_SLOT_NAMES = [
    "address", "area", "arriveby", "bookday", "bookpeople", "bookstay",
    "booktime", "choice", "day", "department", "departure", "destination",
    "duration", "entrancefee", "food", "leaveat", "name", "openhours",
    "phone", "postcode", "price", "pricerange", "ref", "stars", "trainid",
    "type",
]

# BIO tag vocabulary: a tag's position in this list is its integer id.
# Layout: "O" first, then every "B-<slot>" tag, then every "I-<slot>" tag.
bio_labels = (
    ["O"]
    + [f"B-{slot}" for slot in _SLOT_NAMES]
    + [f"I-{slot}" for slot in _SLOT_NAMES]
)

# Dialogue-act vocabulary: an act type's position in this list is its id.
unique_act_types = [
    "None",
    "Attraction-Inform",
    "Attraction-NoOffer",
    "Attraction-Recommend",
    "Attraction-Select",
    "Booking-Book",
    "Booking-Inform",
    "Booking-NoBook",
    "Hospital-Inform",
    "Hotel-Inform",
    "Hotel-NoOffer",
    "Hotel-Recommend",
    "Hotel-Select",
    "Police-Inform",
    "Restaurant-Inform",
    "Restaurant-NoOffer",
    "Restaurant-Recommend",
    "Restaurant-Select",
    "Taxi-Inform",
    "Train-Inform",
    "Train-NoOffer",
    "Train-OfferBook",
    "Train-OfferBooked",
    "Train-Select",
]

def load_json(file_path):
    """Read a JSON file and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding,
    # which would garble non-ASCII utterances on some systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Tokenizer shared by every call: a run of word characters, or one single
# character that is neither a word character nor whitespace (punctuation).
_TOKEN_PATTERN = re.compile(r'\w+|[^\w\s]', re.UNICODE)


def create_bio_labels(utterance, spans, slot_names):
    """Tokenize an utterance and tag each token with a BIO slot label.

    Args:
        utterance: The text to tokenize.
        spans: Mapping with parallel sequences ``'span_start'`` and
            ``'span_end'`` holding character offsets into ``utterance``.
            The end offset is treated as exclusive.
        slot_names: Slot name for each span, parallel to the offsets.

    Returns:
        A ``(tokens, labels)`` pair of equal-length lists. Labels are
        ``'O'``, ``'B-<slot>'`` or ``'I-<slot>'``. When spans overlap, the
        later span overwrites the labels written by the earlier one.
    """
    token_matches = list(_TOKEN_PATTERN.finditer(utterance))
    tokens = [match.group() for match in token_matches]
    labels = ['O'] * len(tokens)

    for start, end, slot_name in zip(spans['span_start'], spans['span_end'], slot_names):
        inside_span = False
        for index, match in enumerate(token_matches):
            if match.end() <= start:
                # Token lies entirely before the span.
                continue
            if not inside_span:
                # First token reaching past the span start. Comparing against
                # the token END (not its start) keeps a span that begins in
                # the middle of a token on that token instead of shifting the
                # B- tag onto the following, unrelated token.
                labels[index] = f'B-{slot_name}'
                inside_span = True
            elif match.start() < end:
                labels[index] = f'I-{slot_name}'
            else:
                # Tokens are in text order, so nothing further can overlap.
                break

    return tokens, labels

def get_raw_data1(dataset):
    """Flatten the per-dialogue dataset into one record per turn.

    Args:
        dataset: Mapping with the keys ``"train"``, ``"validation"`` and
            ``"test"``. Each split holds a ``"dialogue_id"`` list and a
            parallel ``"turns"`` list whose entries carry the column lists
            ``"turn_id"``, ``"utterance"`` and ``"dialogue_acts"``.

    Returns:
        A ``(train, validation, test)`` tuple of lists. Every element is a
        4-item list ``[dialogue_id, turn_id, utterance, span_info]``.
    """
    records_by_split = {"train": [], "validation": [], "test": []}

    for split_name, records in records_by_split.items():
        split_data = dataset[split_name]
        for dialogue_pos, dialogue_id in enumerate(split_data["dialogue_id"]):
            turns = split_data["turns"][dialogue_pos]
            for turn_pos, turn_id in enumerate(turns["turn_id"]):
                records.append([
                    dialogue_id,
                    turn_id,
                    turns["utterance"][turn_pos],
                    turns["dialogue_acts"][turn_pos]["span_info"],
                ])

    return (
        records_by_split["train"],
        records_by_split["validation"],
        records_by_split["test"],
    )


def get_raw_data2(data):
    """Turn per-turn records into model inputs and targets.

    Args:
        data: One split as produced by ``get_raw_data1``: a list of records
            where item 2 is the utterance and item 3 is the span-info
            mapping with the keys ``"span_start"``, ``"span_end"``,
            ``"act_slot_name"`` and ``"act_type"``.

    Returns:
        Three parallel lists:
        the token list of each utterance, the BIO label ids of those tokens
        (indices into ``bio_labels``), and a multi-hot NumPy vector over
        ``unique_act_types`` (all zeros when the record lists no act type).

    Raises:
        KeyError: If a produced BIO label or an act type is missing from
            the corresponding vocabulary.
    """
    # Vocabulary lookups: label / act type -> integer id.
    tag_to_index = {tag: index for index, tag in enumerate(bio_labels)}
    act_to_index = {act: index for index, act in enumerate(unique_act_types)}

    token_rows = []
    tag_id_rows = []
    act_rows = []

    for record in data:
        sentence = record[2]
        span_info = record[3]
        starts = span_info["span_start"]
        ends = span_info["span_end"]
        slot_names = span_info["act_slot_name"]
        acts = span_info["act_type"]

        # Token-level BIO tags for this utterance.
        tokens, tags = create_bio_labels(
            sentence, {"span_start": starts, "span_end": ends}, slot_names
        )
        token_rows.append(tokens)
        tag_id_rows.append([tag_to_index[tag] for tag in tags])

        # Multi-hot encoding: a 1 at the position of every act type present.
        act_vector = np.zeros(len(unique_act_types))
        for act in acts:
            act_vector[act_to_index[act]] = 1
        act_rows.append(act_vector)

    return token_rows, tag_id_rows, act_rows

def get_raw_data():
    """Load ``dataset_multiwoz.json`` and build inputs/targets for every split.

    Returns:
        A 9-tuple ``(X_train, y_train_slot, y_train_act, X_validation,
        y_validation_slot, y_validation_act, X_test, y_test_slot,
        y_test_act)``, where each ``X_*`` / ``y_*_slot`` / ``y_*_act`` triple
        is the result of ``get_raw_data2`` for that split.
    """
    dataset = load_json('dataset_multiwoz.json')
    data_train, data_validation, data_test = get_raw_data1(dataset)
    # The slot targets must be bound to the same names that are returned;
    # the previous names (y_train, y_validation, y_test) left the returned
    # *_slot names undefined and raised NameError.
    X_train, y_train_slot, y_train_act = get_raw_data2(data_train)
    X_validation, y_validation_slot, y_validation_act = get_raw_data2(data_validation)
    X_test, y_test_slot, y_test_act = get_raw_data2(data_test)
    return (
        X_train, y_train_slot, y_train_act,
        X_validation, y_validation_slot, y_validation_act,
        X_test, y_test_slot, y_test_act,
    )

In [None]:
# Load the dataset and bind the tokens, slot targets and act targets of the
# train / validation / test splits as module-level variables.
X_train, y_train_slot, y_train_act, X_validation, y_validation_slot, y_validation_act, X_test, y_test_slot, y_test_act = get_raw_data()

In [None]:
def setup_hyperparameters(embedding_dim, max_sequence_length, num_classes_act, num_classes_slot, dropout_rate):
    """Bundle the model hyperparameters into a tuple.

    Returns:
        ``(embedding_dim, max_sequence_length, num_classes_act,
        num_classes_slot, dropout_rate)`` — the arguments, unchanged and in
        the same order.
    """
    return (
        embedding_dim,
        max_sequence_length,
        num_classes_act,
        num_classes_slot,
        dropout_rate,
    )

# Derive the output sizes from the label vocabularies instead of hard-coding
# them: the targets are encoded against unique_act_types and bio_labels, so
# any other class count makes the output layers disagree with the targets.
EMBEDDING_DIM, MAX_SEQUENCE_LENGTH, NUM_CLASSES_ACT, NUM_CLASSES_SLOT, DROPOUT_RATE = setup_hyperparameters(
    100, 50, len(unique_act_types), len(bio_labels), 0.2
)

# Use BERT embeddings (`bert-base-uncased`).
from transformers import BertTokenizer, TFBertModel
bert_model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = TFBertModel.from_pretrained(bert_model_name)

def create_bert_embedding(texts, batch_size=32):
    """Encode inputs with the BERT tokenizer and return BERT's first output.

    Every input is padded/truncated to ``MAX_SEQUENCE_LENGTH`` ids (special
    tokens included) and run through ``bert_model`` in mini-batches, so the
    forward pass never has to hold the whole split at once.

    Args:
        texts: Iterable of inputs accepted by ``bert_tokenizer.encode_plus``.
        batch_size: Number of inputs per BERT forward pass.

    Returns:
        A tensor made of the first output of ``bert_model`` for every input,
        concatenated along axis 0 in input order.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # NOTE(review): callers in this file pass lists of tokens, not raw
    # strings — confirm the tokenizer treats such input as intended
    # (casing, sub-word splitting).
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    texts = list(texts)  # allow any iterable, and make slicing possible
    embedding_batches = []

    for offset in range(0, len(texts), batch_size):
        input_ids = []
        attention_masks = []
        for text in texts[offset:offset + batch_size]:
            encoded_dict = bert_tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=MAX_SEQUENCE_LENGTH,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='tf'
            )
            input_ids.append(encoded_dict['input_ids'][0])
            attention_masks.append(encoded_dict['attention_mask'][0])

        batch_output = bert_model(
            tf.stack(input_ids), attention_mask=tf.stack(attention_masks)
        )[0]
        embedding_batches.append(batch_output)

    if not embedding_batches:
        # Nothing to encode: return an empty batch instead of failing in
        # tf.stack / tf.concat on an empty list.
        return tf.zeros((0, MAX_SEQUENCE_LENGTH, bert_model.config.hidden_size))

    return tf.concat(embedding_batches, axis=0)

# Pre-compute the BERT outputs for every split once, up front.
# NOTE(review): X_train / X_validation / X_test are built by get_raw_data2 as
# lists of token lists rather than raw strings — confirm that this is the
# input create_bert_embedding is meant to receive.
X_train_bert = create_bert_embedding(X_train)
X_val_bert = create_bert_embedding(X_validation)
X_test_bert = create_bert_embedding(X_test)

# Prepare the targets (padding and one-hot encoding).
# The slot targets hold indices into bio_labels, so that list fixes the
# one-hot width.
num_slot_labels = len(bio_labels)

# Padded positions receive id 0, which is the id of "O" in bio_labels.
# truncating='post' drops the END of over-long sequences, matching the
# tokenizer in create_bert_embedding (pad_sequences defaults to cutting the
# beginning, which would misalign labels and inputs).
# NOTE(review): create_bert_embedding encodes with add_special_tokens=True, so
# the BERT sequence presumably starts with a special-token position that has
# no counterpart in these label sequences — confirm token/label alignment.
y_train_padded = pad_sequences(
    y_train_slot, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post'
)
y_train_padded_one_hot = to_categorical(y_train_padded, num_classes=num_slot_labels)

y_val_padded = pad_sequences(
    y_validation_slot, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post'
)
y_val_padded_one_hot = to_categorical(y_val_padded, num_classes=num_slot_labels)

y_test_padded = pad_sequences(
    y_test_slot, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post'
)
y_test_padded_one_hot = to_categorical(y_test_padded, num_classes=num_slot_labels)

# Multi-hot act targets, one row per turn.
y_act_one_hot = np.array(y_train_act)
y_act_val_one_hot = np.array(y_validation_act)
y_act_test_one_hot = np.array(y_test_act)

# Class weights for slot classification.
from sklearn.utils import class_weight
slot_classes = np.unique(y_train_padded)
slot_class_weights = class_weight.compute_class_weight(
    'balanced', classes=slot_classes, y=y_train_padded.flatten()
)
# Key each weight by its real label id: np.unique only returns the ids that
# occur in the data, so enumerate() would mislabel the weights as soon as
# one id is absent from the training split.
slot_class_weights_dict = dict(zip(slot_classes.tolist(), slot_class_weights.tolist()))

# Class weights for act type classification (multi-label).
# The indicator matrix is flattened before weighting, so this yields one
# weight per binary label value present (0 and 1) across all act columns,
# not one weight per act type.
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y_act_mlb = mlb.fit_transform([np.where(r==1)[0] for r in y_act_one_hot])
act_label_values = np.unique(y_act_mlb.flatten())
act_class_weights = class_weight.compute_class_weight(
    'balanced', classes=act_label_values, y=y_act_mlb.flatten()
)
act_class_weights_dict = dict(zip(act_label_values.tolist(), act_class_weights.tolist()))

# Model (BiLSTM over BERT embeddings, with dropout).
# The arrays fed to fit()/predict() are the outputs of create_bert_embedding,
# i.e. one BERT vector per position, not token ids. The input layer therefore
# takes float vectors of BERT's hidden size, and BERT is not applied a second
# time inside the graph.
input_seq = Input(
    shape=(MAX_SEQUENCE_LENGTH, bert_model.config.hidden_size), dtype="float32"
)
embedding = input_seq
bilstm = Bidirectional(LSTM(128, return_sequences=True))(embedding)  # 128 units per direction
bilstm = Dropout(DROPOUT_RATE)(bilstm)

# Slot head: one softmax over the BIO tags per position.
slot_output = TimeDistributed(Dense(NUM_CLASSES_SLOT, activation="softmax"), name="slot_output")(bilstm)

# Act head: average the sequence into one vector, then one sigmoid per act
# type (multi-label).
sentence_representation = GlobalAveragePooling1D()(bilstm)
sentence_representation = Dropout(DROPOUT_RATE)(sentence_representation)
action_output = Dense(NUM_CLASSES_ACT, activation="sigmoid", name="action_output")(sentence_representation)

model = Model(inputs=input_seq, outputs=[slot_output, action_output])

# Compile the model: one loss and one metric per output head.
model.compile(
    optimizer="adam",
    loss={
        "slot_output": "categorical_crossentropy",
        "action_output": "binary_crossentropy",  # binary cross-entropy for multi-label
    },
    loss_weights={
        "slot_output": 1.0,
        "action_output": 1.0
    },
    metrics={"slot_output": ["accuracy"], "action_output": ["binary_accuracy"]},  # binary accuracy for multi-label
)

model.summary()

# Train the model.
# `class_weight` is intentionally not passed: Keras only supports it for
# models with a single output, and it expects a dict rather than a list, so
# the previous call failed before training started.
# NOTE(review): if class re-weighting is required, it presumably has to be
# supplied as per-sample / per-timestep `sample_weight` arrays or a weighted
# loss — confirm against the installed Keras version.
history = model.fit(
    X_train_bert,
    [y_train_padded_one_hot, y_act_one_hot],
    validation_data=(X_val_bert, [y_val_padded_one_hot, y_act_val_one_hot]),
    epochs=20,
    batch_size=32,  # lower this if memory is tight
)

# Evaluate on the test split and print one report per output head.
y_pred_slot, y_pred_action = model.predict(X_test_bert)

# Slot head: take the most likely tag per position and flatten all positions
# (padding included) into one long label vector.
y_pred_slot_labels = np.argmax(y_pred_slot, axis=-1)
y_true_slot_labels = np.argmax(y_test_padded_one_hot, axis=-1)
y_pred_slot_flatten = y_pred_slot_labels.reshape(-1)
y_true_slot_flatten = y_true_slot_labels.reshape(-1)

print("Slot Classification Metrics:")
print(classification_report(y_true_slot_flatten, y_pred_slot_flatten, zero_division=0))

# Act head: threshold the sigmoid outputs at 0.5 to get a multi-hot prediction.
y_pred_action_labels = (y_pred_action > 0.5).astype(int)
print("Action Classification Metrics:")
# scikit-learn has no `multilabel_classification_report`; classification_report
# accepts multi-label indicator matrices directly. The top-of-file import of
# that name must be dropped as well, otherwise the import itself fails.
print(classification_report(
    y_act_test_one_hot.astype(int),
    y_pred_action_labels,
    target_names=unique_act_types,
    zero_division=0,
))

# ... (The sample-sentence test section is unchanged; just replace X_train_padded with X_train_bert.)