# Check system information

In [2]:
import sys
import platform
import psutil
import subprocess
import importlib.util

def get_os_info():
    """Return the operating system name and release as a ``(name, release)`` tuple."""
    return platform.system(), platform.release()

def get_python_info():
    """Return the running interpreter's full version string (``sys.version``)."""
    return sys.version

def get_cpu_info():
    """Return ``(processor_name, physical_cores, logical_threads)``.

    The core and thread counts come from ``psutil.cpu_count``; the processor
    name is whatever ``platform.processor()`` reports.
    """
    physical_count = psutil.cpu_count(logical=False)
    logical_count = psutil.cpu_count(logical=True)
    return platform.processor(), physical_count, logical_count

def get_memory_info():
    """Return ``(total, available)`` system memory, both expressed in GiB."""
    bytes_per_gib = 1024 ** 3
    snapshot = psutil.virtual_memory()
    return snapshot.total / bytes_per_gib, snapshot.available / bytes_per_gib

def get_gpu_info():
    """Retrieves GPU information using the nvidia-smi command.

    Returns:
        ``(gpus, cuda_version)`` where ``gpus`` is a list of
        ``{'name': ..., 'driver_version': ...}`` dicts (one per line reported
        by nvidia-smi) and ``cuda_version`` is the bare version string, e.g.
        ``"12.4"``. When nvidia-smi is missing or exits with an error the
        function returns ``("N/A", "N/A")``.
    """
    try:
        # Probe first so a missing/broken binary is detected before querying.
        subprocess.run(["nvidia-smi", "-h"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        result = subprocess.run(["nvidia-smi", "--query-gpu=name,driver_version", "--format=csv,noheader"],
                                 capture_output=True, text=True, check=True)
        cuda_result = subprocess.run(["nvidia-smi"], capture_output=True, text=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        return "N/A", "N/A"

    # The version sits inside a table row ("... CUDA Version: 12.4     |"), so
    # keep only the first token after the label instead of the rest of the row.
    cuda_version = "N/A"
    for line in cuda_result.stdout.splitlines():
        _, label, remainder = line.partition("CUDA Version:")
        if label:
            tokens = remainder.split()
            if tokens:
                cuda_version = tokens[0]
            break

    gpus = []
    for line in result.stdout.splitlines():
        if not line.strip():
            # Blank output lines carry no GPU record.
            continue
        # Split on the LAST comma so a comma inside the GPU name cannot break
        # the two-field unpacking.
        name, separator, driver_version = line.rpartition(',')
        if not separator:
            name, driver_version = line, "N/A"
        gpus.append({'name': name.strip(), 'driver_version': driver_version.strip()})
    return gpus, cuda_version

def check_deepspeed():
    """Report the installed DeepSpeed version, or a status string if unavailable.

    Returns ``"Not Installed"`` when no ``deepspeed`` module spec can be found,
    and a fallback message when the package is present but fails to import.
    """
    if importlib.util.find_spec("deepspeed") is None:
        return "Not Installed"
    try:
        import deepspeed
        return deepspeed.__version__
    except ImportError:
        return "Installed, but version cannot be determined"

def check_tensorflow():
    """Report the installed TensorFlow version, or ``"Not Installed"``."""
    try:
        import tensorflow as tensorflow_module
        return tensorflow_module.__version__
    except ImportError:
        return "Not Installed"

def main():
    """Gather every system probe and print a sectioned report to stdout."""
    system_name, system_release = get_os_info()
    interpreter = get_python_info()
    processor, physical, logical = get_cpu_info()
    mem_total, mem_available = get_memory_info()
    gpu_list, cuda = get_gpu_info()
    deepspeed_info = check_deepspeed()
    tensorflow_info = check_tensorflow()

    report = [
        "--- System Information ---",
        f"Operating System: {system_name} {system_release}",
        f"Python Version: {interpreter}",
        "--- CPU Information ---",
        f"CPU: {processor}",
        f"Physical Cores: {physical}",
        f"Logical Threads: {logical}",
        "--- Memory Information ---",
        f"Total Memory: {mem_total:.2f} GB",
        f"Available Memory: {mem_available:.2f} GB",
        "--- GPU Information ---",
    ]

    # get_gpu_info signals "no usable nvidia-smi" with the string "N/A".
    if gpu_list == "N/A":
        report.append("GPU: N/A (No NVIDIA GPU or nvidia-smi not found)")
        report.append("CUDA Version: N/A")
    else:
        for position, gpu in enumerate(gpu_list, start=1):
            report.append(f"GPU {position}: {gpu['name']}, Driver Version: {gpu['driver_version']}")
        report.append(f"CUDA Version: {cuda}")

    report.append("--- DeepSpeed Information ---")
    report.append(f"DeepSpeed Version: {deepspeed_info}")
    report.append("--- TensorFlow Information ---")
    report.append(f"TensorFlow Version: {tensorflow_info}")

    for line in report:
        print(line)
# Print the system report as soon as this cell is executed.
main()
!nvidia-smi

[2025-05-26 12:53:29,164] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


2025-05-26 12:53:33.543579: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-26 12:53:34.166014: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-26 12:53:34.166150: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-26 12:53:34.256742: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-26 12:53:34.462734: I tensorflow/core/platform/cpu_feature_guar

--- System Information ---
Operating System: Linux 5.19.0-45-generic
Python Version: 3.11.7 (main, Dec  8 2023, 18:56:58) [GCC 11.4.0]
--- CPU Information ---
CPU: x86_64
Physical Cores: 8
Logical Threads: 8
--- Memory Information ---
Total Memory: 44.08 GB
Available Memory: 18.54 GB
--- GPU Information ---
GPU 1: NVIDIA RTX A4000, Driver Version: 550.144.03
CUDA Version: 12.4     |
--- DeepSpeed Information ---
DeepSpeed Version: 0.10.3
--- TensorFlow Information ---
TensorFlow Version: 2.15.0
Mon May 26 12:53:38 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.03             Driver Version: 550.144.03     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|       

In [42]:
! du -h --max-depth=1 /notebooks

322M	/notebooks/MultiWOZ-coref
9.9M	/notebooks/.ipynb_checkpoints
341M	/notebooks


In [41]:
rm -rf /notebooks/.Trash-0

# Data Acquisition and Verification

In [3]:
!git clone https://github.com/lexmen318/MultiWOZ-coref.git

Cloning into 'MultiWOZ-coref'...
remote: Enumerating objects: 60, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (57/57), done.[K
remote: Total 60 (delta 19), reused 4 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (60/60), 29.40 MiB | 14.42 MiB/s, done.
Resolving deltas: 100% (19/19), done.


In [2]:
# Archive shipped inside the cloned repository and the directory to unpack into.
zip_file_path = "MultiWOZ-coref/MultiWOZ2_3.zip"
extraction_path = "MultiWOZ-coref"

# The context manager guarantees the archive handle is closed even if
# extraction fails part-way through.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    print(f"Successfully opened the zip file: {zip_file_path}")

    os.makedirs(extraction_path, exist_ok=True)

    zip_ref.extractall(extraction_path)
print(f"\nExtracted all files to: {extraction_path}")

# List what ended up in the target directory as a quick visual check.
print("\nContents of the extracted directory:")
for item in os.listdir(extraction_path):
    print(f"- {item}")

Successfully opened the zip file: MultiWOZ-coref/MultiWOZ2_3.zip

Extracted all files to: MultiWOZ-coref

Contents of the extracted directory:
- appendix_new.pdf
- README.md
- MultiWOZ2_3.zip
- .git
- MultiWOZ2_3


# Install and import necessary libraries

In [1]:
!pip install pyspellchecker
!pip install nlpaug
!pip install dateparser
!pip install CurrencyConverter
!pip install dateparser word2number
!pip install contractions
!pip install gensim
!pip install GPUtil
!pip install rouge_score
!pip install nvidia-ml-py3
!pip install pynvml
import json
import os
import re
import random
import zipfile
import contractions
import dateparser
from word2number import w2n
from sklearn.model_selection import train_test_split
from datasets import Dataset, Features, Value, Sequence, ClassLabel
import pickle
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize import word_tokenize
from collections import defaultdict
import nltk
import re
from typing import List, Dict, Any

# Download the NLTK resources this notebook relies on; quiet=True suppresses
# the downloader's progress output.
# presumably 'punkt' backs the imported word_tokenize — TODO confirm usage
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

[0m

2025-07-04 05:54:34.189923: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-04 05:54:34.680307: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-04 05:54:34.680418: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-04 05:54:34.765220: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-04 05:54:34.932189: I tensorflow/core/platform/cpu_feature_guar

True

# Split data into train and test datasets

In [2]:
# Locations of the three JSON files inside the extracted MultiWOZ 2.3 folder.
dataset_dir = "MultiWOZ-coref/MultiWOZ2_3"
data_file = os.path.join(dataset_dir, "data.json")
ontology_file = os.path.join(dataset_dir, "ontology.json")
dialogue_acts_file = os.path.join(dataset_dir, "dialogue_acts.json")

# JSON text is UTF-8; pass the encoding explicitly so loading does not depend
# on the platform's default locale encoding.
with open(data_file, 'r', encoding="utf-8") as f:
    data = json.load(f)
with open(ontology_file, 'r', encoding="utf-8") as f:
    ontology = json.load(f)
with open(dialogue_acts_file, 'r', encoding="utf-8") as f:
    dialogue_acts = json.load(f)

def get_primary_domain(dialogue):
    """Extract the primary domain from dialogue structure.

    Lookup order (the first hit wins):
      1. the first key of ``dialogue["new_goal"]`` other than "user_action";
      2. the first key of ``dialogue["goal"]`` other than "user_action" whose
         value is a dict;
      3. the first such key found in any turn's ``metadata`` under "log".

    Args:
        dialogue: mapping describing one dialogue.

    Returns:
        The domain name, or ``"unknown"`` when none of the lookups match.
    """
    for domain in dialogue.get("new_goal") or ():
        if domain != "user_action":
            return domain

    goal = dialogue.get("goal") or {}
    for domain in goal:
        # NOTE(review): an empty dict also counts as a domain here — confirm
        # that unused domains are not stored as {} in "goal".
        if domain != "user_action" and isinstance(goal[domain], dict):
            return domain

    # Return the first domain encountered rather than popping from a set:
    # set.pop() yields an arbitrary element, so the label could change between
    # interpreter runs and make the stratified split non-reproducible.
    for turn in dialogue.get("log", []):
        metadata = turn.get("metadata", {})
        for domain in metadata:
            if domain != "user_action" and isinstance(metadata[domain], dict):
                return domain
    return "unknown"

# One record per dialogue: its id, the untouched dialogue payload and the
# domain label used for stratification.
raw_data = [
    dict(dialogue_id=key, dialogue=conversation, domain=get_primary_domain(conversation))
    for key, conversation in data.items()
]

# 80/20 split, stratified on the domain label, with a fixed seed.
train_data_raw, test_data_raw = train_test_split(
    raw_data,
    stratify=[record["domain"] for record in raw_data],
    test_size=0.2,
    random_state=42,
)

In [3]:
def get_samples(dataset, num_samples):
    """Draw a reproducible, domain-proportional sample from ``dataset``.

    Each domain contributes ``round(k * share)`` items, where ``k`` is the
    number of samples actually taken and ``share`` is the domain's fraction of
    the dataset. Rounding drift is then corrected by topping up from unsampled
    items or by dropping random picks, and the result is shuffled.

    Args:
        dataset: iterable of mappings; items with a 'domain' key take part in
            stratification, the others are only eligible for the top-up step.
        num_samples: desired sample size; capped at the dataset size.

    Returns:
        A list of sampled items (empty when the dataset is empty or cannot be
        converted to a list).

    Note:
        Re-seeds the global ``random`` module with 42 on every call, so
        repeated calls with the same input return the same sample.
    """
    if not dataset:
        print("Warning: Dataset is empty. Returning an empty list.")
        return []
    try:
        dataset_list = list(dataset)
    except TypeError:
        print("Error: Dataset could not be converted to a list for sampling.")
        return []

    total_items = len(dataset_list)
    if total_items == 0:
        print("Warning: Dataset has no items. Returning an empty list.")
        return []

    # Group items per domain in a single pass (dicts keep first-seen order, so
    # domains are visited in the same order on every run).
    items_by_domain = {}
    for item in dataset_list:
        if 'domain' in item:
            items_by_domain.setdefault(item['domain'], []).append(item)
        else:
            print(f"Warning: Item {item} does not have a 'domain' key. Skipping for stratification.")

    actual_samples_to_take = min(num_samples, total_items)
    if actual_samples_to_take < num_samples:
        print(f"Warning: Dataset size ({total_items}) is less than requested samples ({num_samples}). Taking {actual_samples_to_take} samples.")

    random.seed(42)
    sampled_data = []
    for domain_items in items_by_domain.values():
        domain_proportion = len(domain_items) / total_items
        num_domain_samples = round(actual_samples_to_take * domain_proportion)
        num_domain_samples = min(num_domain_samples, len(domain_items))
        sampled_data.extend(random.sample(domain_items, num_domain_samples))

    shortfall = actual_samples_to_take - len(sampled_data)
    if shortfall > 0:
        # Track picks by identity: comparing whole dialogue dicts with `in` is
        # quadratic and would treat equal-but-distinct items as already taken.
        sampled_ids = {id(item) for item in sampled_data}
        remaining_items = [item for item in dataset_list if id(item) not in sampled_ids]
        if remaining_items:
            sampled_data.extend(random.sample(remaining_items, min(shortfall, len(remaining_items))))
    elif shortfall < 0:
        # Rounding over-shot the target: drop random picks until it fits.
        for _ in range(-shortfall):
            sampled_data.pop(random.randrange(len(sampled_data)))
    random.shuffle(sampled_data)

    return sampled_data

print(f"\n--- Taking samples from each dataset ---")

# Shrink the training split to 1600 dialogues. The variable is overwritten, so
# re-running this cell samples from the already-sampled data.
train_data_raw = get_samples(train_data_raw, 1600)
print(f"Sampled train_dataset size: {len(train_data_raw)}")

# Shrink the test split to 400 dialogues (same overwrite caveat as above).
test_data_raw = get_samples(test_data_raw, 400)
print(f"Sampled test_dataset size: {len(test_data_raw)}")


--- Taking samples from each dataset ---
Sampled train_dataset size: 1600
Sampled test_dataset size: 400


# Text Cleaning

In [4]:
def normalize_time_in_text(text):
    """Rewrite clock times in ``text`` to 24-hour ``HH:MM`` form.

    Handles "h:mm am/pm" and "h am/pm" expressions (case-insensitive) via
    ``dateparser.parse``, and maps the words "noon"/"midday" to "12:00" and
    "midnight" to "00:00". Expressions that cannot be parsed are left as-is.

    Args:
        text: input string.

    Returns:
        The string with recognised time expressions normalized.
    """
    def to_24_hour(match):
        # Best effort: any parsing failure keeps the original text. Catch
        # Exception rather than using a bare except so KeyboardInterrupt and
        # SystemExit are not swallowed.
        try:
            parsed_time = dateparser.parse(match.group(0))
            return parsed_time.strftime('%H:%M') if parsed_time else match.group(0)
        except Exception:
            return match.group(0)

    # "h:mm am/pm" must run before "h am/pm" so the minutes of "7:45 pm" are
    # never picked up as a stand-alone hour.
    text = re.sub(r'\b(\d{1,2}:\d{2})\s*(am|pm)\b', to_24_hour, text, flags=re.IGNORECASE)
    text = re.sub(r'\b(\d{1,2})\s*(am|pm)\b', to_24_hour, text, flags=re.IGNORECASE)

    text = re.sub(r'\bnoon\b', '12:00', text, flags=re.IGNORECASE)
    text = re.sub(r'\bmidday\b', '12:00', text, flags=re.IGNORECASE)
    text = re.sub(r'\bmidnight\b', '00:00', text, flags=re.IGNORECASE)
    return text

# --- Currency Normalization ---
# Currency symbols and the three-letter code each one is rewritten to.
CURRENCY_SYMBOLS_MAP = {
    '£': 'GBP',
    '$': 'USD',
    '€': 'EUR',
    '¥': 'JPY',
    '₹': 'INR',
}
# Lower-case currency words (singular, plural, slang, codes) and their code.
CURRENCY_KEYWORDS_MAP = {
    'pound': 'GBP', 'pounds': 'GBP', 'quid': 'GBP', 'gbp': 'GBP', 'sterling': 'GBP',
    'dollar': 'USD', 'dollars': 'USD', 'buck': 'USD', 'bucks': 'USD', 'usd': 'USD',
    'euro': 'EUR', 'euros': 'EUR', 'eur': 'EUR',
    'yen': 'JPY', 'jpy': 'JPY',
    'rupee': 'INR', 'rupees': 'INR', 'inr': 'INR',
}
# Longest first, so regex alternations prefer "pounds" over "pound".
ALL_CURRENCY_KEYWORDS_SORTED = sorted(CURRENCY_KEYWORDS_MAP.keys(), key=len, reverse=True)
ALL_CURRENCY_SYMBOLS_SORTED = sorted(CURRENCY_SYMBOLS_MAP.keys(), key=len, reverse=True)
NUMBER_WORDS_FOR_REGEX = (
    r"zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|"
    r"thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|"
    r"thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion"
)
# One or more number words, optionally joined by "and" ("one hundred and five").
# NOTE: this pattern contains its own capture group.
COMPLEX_NUMBER_WORDS_PATTERN = rf"\b((?:{NUMBER_WORDS_FOR_REGEX})(?:\s+(?:and\s+)?(?:{NUMBER_WORDS_FOR_REGEX}))*)\b"

def normalize_currency_in_text(text):
    """
    Normalizes currency expressions in a string to "<amount> <CODE>".
    Examples: "twenty pounds" -> "20 GBP", "£20" -> "20 GBP"

    Steps, in order: spelled-out amounts followed by a currency word, symbol
    before/after a number, currency word after/before a number, upper-casing of
    stand-alone currency codes, and spacing between a number and a code.
    Whitespace runs are collapsed to single spaces and the result is stripped.

    Args:
        text: input string.

    Returns:
        The normalized string.
    """
    # Plain or decimal number. The fraction requires digits so that a
    # sentence-ending period ("£20.") is not absorbed into the amount.
    number = r'(\d+(?:\.\d+)?)'

    def word_num_keyword_replacer(match):
        # Named groups are required: COMPLEX_NUMBER_WORDS_PATTERN carries its
        # own capture group, which shifts positional group numbers.
        num_word_str = match.group("amount")
        currency_key_str = match.group("currency").lower()
        try:
            num_val = w2n.word_to_num(num_word_str)
            currency_code = CURRENCY_KEYWORDS_MAP.get(currency_key_str, CURRENCY_KEYWORDS_MAP.get(currency_key_str.rstrip('s'), currency_key_str.upper()))
            return f"{num_val} {currency_code.upper()}"
        except ValueError:
            # Not a parseable number phrase: leave the text untouched.
            return match.group(0)

    currency_keywords_regex_part = "|".join([re.escape(k) for k in ALL_CURRENCY_KEYWORDS_SORTED])
    pattern_word_num_then_keyword = (
        rf"(?P<amount>{COMPLEX_NUMBER_WORDS_PATTERN})\s+(?P<currency>{currency_keywords_regex_part})\b"
    )
    text = re.sub(pattern_word_num_then_keyword, word_num_keyword_replacer, text, flags=re.IGNORECASE)

    # Symbol directly before or after a number: "£20" / "20£" -> "20 GBP".
    for symbol in ALL_CURRENCY_SYMBOLS_SORTED:
        code = CURRENCY_SYMBOLS_MAP[symbol]
        escaped_symbol = re.escape(symbol)
        text = re.sub(rf'{escaped_symbol}\s*{number}', rf'\1 {code}', text)
        text = re.sub(rf'{number}\s*{escaped_symbol}', rf'\1 {code}', text)

    # Currency word after or before a number: "20 pounds" / "pounds 20".
    for keyword in ALL_CURRENCY_KEYWORDS_SORTED:
        code = CURRENCY_KEYWORDS_MAP[keyword]
        escaped_keyword = re.escape(keyword)
        text = re.sub(rf'{number}\s+{escaped_keyword}\b', rf'\1 {code}', text, flags=re.IGNORECASE)
        text = re.sub(rf'\b{escaped_keyword}\s+{number}', rf'\1 {code}', text, flags=re.IGNORECASE)

    # Upper-case stand-alone codes written in any case ("gbp" -> "GBP").
    all_target_codes = sorted(set(CURRENCY_SYMBOLS_MAP.values()) | set(CURRENCY_KEYWORDS_MAP.values()))
    for code_val in all_target_codes:
        text = re.sub(rf'\b{re.escape(code_val)}\b', code_val.upper(), text, flags=re.IGNORECASE)

    text = re.sub(r'\s+', ' ', text).strip()

    # Separate a number glued to a currency code ("20GBP" -> "20 GBP"). Only
    # known codes are matched: a generic [A-Z]{3} rule would also split
    # alphanumeric tokens such as booking reference numbers.
    codes_alternation = "|".join(re.escape(code_val.upper()) for code_val in all_target_codes)
    text = re.sub(rf'(\d)({codes_alternation})\b', r'\1 \2', text)
    text = re.sub(rf'\b({codes_alternation})(\d)', r'\1 \2', text)

    return text

def flatten_dialogue(dialogue_data):
    """Turn a dialogue's "log" into a flat list of per-turn records.

    Every turn's text is passed through time and then currency normalization.
    Turns at even 0-based positions are stored as user utterances, odd ones as
    system responses; the other field is an empty string. The turn's
    ``metadata["state"]`` (default ``{}``) is JSON-encoded when it is a dict.

    Args:
        dialogue_data: mapping with an optional "log" list of turn mappings.

    Returns:
        List of dicts with keys "turn_id", "user_utterance",
        "system_response" and "state".
    """
    records = []
    for position, entry in enumerate(dialogue_data.get("log", [])):
        cleaned = normalize_currency_in_text(normalize_time_in_text(entry.get("text", "")))

        # Even positions belong to the user, odd positions to the system.
        if position % 2 == 0:
            user_side, system_side = cleaned, ""
        else:
            user_side, system_side = "", cleaned

        state = entry.get("metadata", {}).get("state", {})
        if isinstance(state, dict):
            state = json.dumps(state)

        records.append({
            "turn_id": position,
            "user_utterance": user_side,
            "system_response": system_side,
            "state": state,
        })
    return records

def normalize_dialogue_acts(dialogue_acts, dialogue_data):
    """Convert dialogue-act annotations into flat lists of strings per act type.

    For every dialogue in ``dialogue_data`` and every turn index of its "log",
    the acts stored under the stringified index are normalized:
    dict values become ``"key:value"`` strings, lists are flattened one level
    and stringified, and anything else becomes a one-element list.

    Args:
        dialogue_acts: mapping ``dialogue_id -> {turn_key -> {act_type -> values}}``.
        dialogue_data: mapping ``dialogue_id -> dialogue`` (with a "log" list).

    Returns:
        ``{dialogue_id: {turn_key: {act_type: [str, ...]}}}``; turns without
        annotations map to an empty dict.
    """
    def as_string_list(raw_values):
        if isinstance(raw_values, dict):
            return [f"{slot}:{value}" for slot, value in raw_values.items()]
        if isinstance(raw_values, list):
            flattened = []
            for element in raw_values:
                if isinstance(element, list):
                    flattened.extend(str(inner) for inner in element)
                else:
                    flattened.append(str(element))
            return flattened
        return [str(raw_values)]

    result = {}
    for dialogue_id, payload in dialogue_data.items():
        annotations = dialogue_acts.get(dialogue_id, {})
        per_turn = {}
        for index in range(len(payload.get("log", []))):
            key = str(index)
            per_turn[key] = {
                act_type: as_string_list(act_values)
                for act_type, act_values in annotations.get(key, {}).items()
            }
        result[dialogue_id] = per_turn
    return result

def process_data(raw_data_subset, dialogue_acts, ontology):
    """Process a subset of the data (train or test) with normalization and flattening.

    Args:
        raw_data_subset: records with "dialogue_id", "dialogue" and "domain".
        dialogue_acts: raw dialogue-act annotations keyed by dialogue id.
        ontology: mapping of slot name to values; only slots whose name starts
            with "<domain>-" are attached to a record.

    Returns:
        List of dicts with keys "dialogue_id", "domain", "dialogue",
        "dialogue_acts" and "ontology".
    """
    dialogues_by_id = {entry["dialogue_id"]: entry["dialogue"] for entry in raw_data_subset}
    acts_by_dialogue = normalize_dialogue_acts(dialogue_acts, dialogues_by_id)

    records = []
    for entry in raw_data_subset:
        identifier = entry["dialogue_id"]
        conversation = entry["dialogue"]
        domain_name = entry["domain"]

        # Keep only the ontology slots that belong to this record's domain.
        slot_prefix = f"{domain_name}-"
        scoped_ontology = {}
        for slot, values in ontology.items():
            if slot.startswith(slot_prefix):
                scoped_ontology[slot] = values

        records.append({
            "dialogue_id": identifier,
            "domain": domain_name,
            "dialogue": flatten_dialogue(conversation),
            "dialogue_acts": acts_by_dialogue[identifier],
            "ontology": scoped_ontology,
        })

    return records

# Normalize and flatten both sampled splits.
train_data_processed = process_data(train_data_raw, dialogue_acts, ontology)
test_data_processed = process_data(test_data_raw, dialogue_acts, ontology)

# Wrap the processed records in `datasets.Dataset` objects.
train_dataset = Dataset.from_list(train_data_processed)
test_dataset = Dataset.from_list(test_data_processed)

# Sanity check: show the first processed training record.
print("First training example:")
print(train_dataset[0])

First training example:
{'dialogue_id': 'MUL0185.json', 'domain': 'restaurant', 'dialogue': [{'state': '{}', 'system_response': '', 'turn_id': 0, 'user_utterance': 'Yes I need a cheap restaurant in the Cambridge area on the north side of town . What do you suggest ?'}, {'state': '{}', 'system_response': 'thank you I will go to royal spice .', 'turn_id': 1, 'user_utterance': ''}, {'state': '{}', 'system_response': '', 'turn_id': 2, 'user_utterance': 'Okay can you book me a table for Saturday at 19:45 for 3 people ? I would also like the reference number for the booking .'}, {'state': '{}', 'system_response': 'I have successfully booked your reservation . Your reference number is 7XYDZ38V . Can I help you with anything else ?', 'turn_id': 3, 'user_utterance': ''}, {'state': '{}', 'system_response': '', 'turn_id': 4, 'user_utterance': 'Yes , I also need a place to stay . I would prefer at least 4 stars and free parking .'}, {'state': '{}', 'system_response': 'We have 19 entries that match

# Display dialogues after text cleaning

In [5]:
def randomly_check_first_dialogue_turns(split_name, split_data, num_samples=2, num_turns=2):
    """Print the opening turns of a few randomly chosen dialogues.

    Args:
        split_name: label used in the printed headings.
        split_data: sequence (or iterable) of dialogue records with
            "dialogue_id" and "dialogue" keys and an optional "domain".
        num_samples: how many dialogues to show (capped at the split size).
        num_turns: how many leading turns to show per dialogue.
    """
    print(f"\n--- Randomly Checking {num_samples} Dialogues (First {num_turns} Turns) from {split_name} ---")
    if not split_data:
        print(f"No data in {split_name} split.")
        return

    # random.sample needs a sequence, so materialize other iterables first.
    pool = split_data if isinstance(split_data, (list, tuple)) else list(split_data)

    for picked in random.sample(pool, min(num_samples, len(pool))):
        picked_id = picked["dialogue_id"]
        picked_turns = picked["dialogue"]
        shown = min(num_turns, len(picked_turns))
        print(f"\nDialogue ID: {picked_id}")
        print(f"Domain: {picked.get('domain', 'unknown')}")
        print(f"First {shown} turns:")
        for position in range(shown):
            current = picked_turns[position]
            print(f"  Turn {current['turn_id']}:")
            print(f"    User: {current['user_utterance']}")
            print(f"    System: {current['system_response']}")

# Spot-check a couple of dialogues from each split after text cleaning.
randomly_check_first_dialogue_turns("Train", train_dataset)
randomly_check_first_dialogue_turns("Test", test_dataset)


--- Randomly Checking 2 Dialogues (First 2 Turns) from Train ---

Dialogue ID: SNG0238.json
Domain: hospital
First 2 turns:
  Turn 0:
    User: Could you find me a hospital in town ?
    System: 
  Turn 1:
    User: 
    System: Yes , Addenbrookes Hospital is in your area , would you like me to book you an appointment ?

Dialogue ID: PMUL3693.json
Domain: attraction
First 2 turns:
  Turn 0:
    User: Hello , can you recommend any theatres in the Centre of town , please ?
    System: 
  Turn 1:
    User: 
    System: I really love The Cambridge Corn Exchange located on Wheeler Street . I do n't have admission information , but you can call them at 01223357851 .

--- Randomly Checking 2 Dialogues (First 2 Turns) from Test ---

Dialogue ID: PMUL4431.json
Domain: restaurant
First 2 turns:
  Turn 0:
    User: I am traveling to Cambridge and looking forward to try local restaurants .
    System: 
  Turn 1:
    User: 
    System: We have lots to explore ! I can help you find one , if you 'd 

# Text Normalization

In [6]:
def preprocess_text(text):
    """Lower-case a string and expand its contractions; pass non-strings through unchanged."""
    if not isinstance(text, str):
        return text
    return contractions.fix(text.lower())
def preprocess_ontology(ontology):
    """Apply ``preprocess_text`` to an ontology container.

    Dict inputs get their keys normalized (values are kept as they are), list
    inputs get every element normalized, and any other value is returned
    unchanged.
    """
    if isinstance(ontology, dict):
        return {preprocess_text(slot): values for slot, values in ontology.items()}
    if isinstance(ontology, list):
        return [preprocess_text(entry) for entry in ontology]
    return ontology
def preprocess_split_lowercase_contraction(dataset):
    """Lower-case and expand contractions in every example of ``dataset``.

    The domain, each turn's user utterance and system response, and the
    ontology are normalized; dialogue ids, turn ids, turn states and the
    top-level dialogue acts are carried over untouched. All original columns
    are replaced by the rebuilt ones.

    Args:
        dataset: object exposing ``map`` and ``column_names``.

    Returns:
        Whatever ``dataset.map`` returns for the rebuilt examples.
    """
    def normalize_turn(turn):
        return {
            "turn_id": turn["turn_id"],
            "user_utterance": preprocess_text(turn["user_utterance"]),
            "system_response": preprocess_text(turn["system_response"]),
            "state": turn["state"],
        }

    def normalize_example(example):
        return {
            "dialogue_id": example["dialogue_id"],
            "domain": preprocess_text(example["domain"]),
            "dialogue": [normalize_turn(turn) for turn in example["dialogue"]],
            # dialogue_acts is a dialogue-level field, so it is copied as-is.
            "dialogue_acts": example["dialogue_acts"],
            "ontology": preprocess_ontology(example["ontology"]),
        }

    return dataset.map(normalize_example, remove_columns=dataset.column_names)

# Replace both splits with their lower-cased, contraction-expanded versions.
train_dataset = preprocess_split_lowercase_contraction(train_dataset)
test_dataset = preprocess_split_lowercase_contraction(test_dataset)

def randomly_check_first_dialogue_turns(split_name, split_data, num_samples=2, num_turns=2):
    """Print the first turns of randomly selected dialogues from a split.

    This cell redefines the helper already declared earlier in the notebook
    with the same behavior.

    Args:
        split_name: label used in the printed headings.
        split_data: sequence (or iterable) of dialogue records.
        num_samples: number of dialogues to print (capped at the split size).
        num_turns: number of leading turns to print per dialogue.
    """
    print(f"\n--- Randomly Checking {num_samples} Dialogues (First {num_turns} Turns) from {split_name} ---")
    if split_data:
        candidates = split_data
        if not isinstance(candidates, (list, tuple)):
            # random.sample requires a sequence.
            candidates = list(candidates)
        sample_size = min(num_samples, len(candidates))
        for record in random.sample(candidates, sample_size):
            record_id = record["dialogue_id"]
            turn_list = record["dialogue"]
            limit = min(num_turns, len(turn_list))
            print(f"\nDialogue ID: {record_id}")
            print(f"Domain: {record.get('domain', 'unknown')}")
            print(f"First {limit} turns:")
            cursor = 0
            while cursor < limit:
                entry = turn_list[cursor]
                print(f"  Turn {entry['turn_id']}:")
                print(f"    User: {entry['user_utterance']}")
                print(f"    System: {entry['system_response']}")
                cursor += 1
    else:
        print(f"No data in {split_name} split.")

# Spot-check a couple of dialogues from each split after normalization.
randomly_check_first_dialogue_turns("Train", train_dataset)
randomly_check_first_dialogue_turns("Test", test_dataset)

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]


--- Randomly Checking 2 Dialogues (First 2 Turns) from Train ---

Dialogue ID: SNG0462.json
Domain: restaurant
First 2 turns:
  Turn 0:
    User: hi ! can you give me some information on the royal spice restaurant ?
    System: 
  Turn 1:
    User: 
    System: of course ! it 's a cheap indian restaurant in the north at victoria avenue chesteron cb41eh . the phone number is 01733553355 . may i help with anything else ?

Dialogue ID: MUL0244.json
Domain: train
First 2 turns:
  Turn 0:
    User: yes , i would like to book a train that is leaving monday , and is going to cambridge .
    System: 
  Turn 1:
    User: 
    System: there are 202 trains to cambridge on monday . can you tell me your departure station and the time you  would like to travel ?

--- Randomly Checking 2 Dialogues (First 2 Turns) from Test ---

Dialogue ID: WOZ20573.json
Domain: restaurant
First 2 turns:
  Turn 0:
    User: hello . can you help me find the address of an inexpensive restaurant in the south part of to

# Count the number of words in each utterance of each dialogue

In [8]:
def get_longest_sentence_word_length(datasets: Dict[str, List[Dict[str, Any]]]) -> Dict[str, int]:
    """Find the longest utterance, in whitespace-separated words, per dataset.

    Both the user utterance and the system response of every turn are
    considered. The result for each dataset is also printed.

    Args:
        datasets: mapping of dataset name to an iterable of dialogue records,
            each with an optional 'dialogue' list of turns.

    Returns:
        Mapping of dataset name to the maximum word count found (0 if none).
    """
    summary = {}

    for name, records in datasets.items():
        longest = 0

        for record in records:
            for turn in record.get('dialogue', []):
                user_text = turn.get('user_utterance', '').strip()
                system_text = turn.get('system_response', '').strip()

                # str.split() on whitespace is the tokenizer; empty strings
                # yield zero words and therefore never raise the maximum.
                for utterance in (user_text, system_text):
                    word_count = len(utterance.split())
                    if word_count > longest:
                        longest = word_count

        summary[name] = longest
        print(f"Longest sentence in {name}: {longest} words")

    return summary
# Measure both splits; the printed maxima are whitespace-token counts.
all_datasets = {
    'train_dataset': train_dataset,
    'test_dataset': test_dataset,
}
longest_lengths = get_longest_sentence_word_length(all_datasets)
print("\nSummary of longest sentence lengths:")
for dataset_name, length in longest_lengths.items():
    print(f"{dataset_name}: {length} words")

Longest sentence in train_dataset: 67 words
Longest sentence in test_dataset: 56 words

Summary of longest sentence lengths:
train_dataset: 67 words
test_dataset: 56 words


# Check the number of turns in dialogue

In [9]:
def find_longest_turn_id(all_datasets):
    """Return the largest ``turn_id`` found across all given datasets.

    Dialogues that raise while being scanned are reported and skipped; any
    turn ids seen before the failure still count.

    Args:
        all_datasets: mapping of dataset name to an iterable of dialogue
            records, each expected to carry a 'dialogue' list of turns.

    Returns:
        The maximum ``turn_id`` seen, or 0 if none was found.
    """
    highest = 0

    for dataset_name, dialogues_dataset in all_datasets.items():
        print(f"Checking dataset: {dataset_name}")
        for dialogue in dialogues_dataset:
            try:
                for turn in dialogue['dialogue']:
                    if 'turn_id' not in turn:
                        continue
                    candidate = turn['turn_id']
                    if candidate > highest:
                        highest = candidate
            except KeyError as e:
                print(f"Warning: Skipping dialogue in {dataset_name} due to missing key: {e}")
            except Exception as e:
                print(f"An unexpected error occurred while processing dialogue in {dataset_name}: {e}")
    return highest

# Scan both splits for the largest turn_id.
all_datasets_dummy = {
    'train_dataset': train_dataset,
    'test_dataset': test_dataset
}
longest_turn_id = find_longest_turn_id(all_datasets_dummy)
print(f"\nThe longest turn_id found across all datasets is: {longest_turn_id}")

Checking dataset: train_dataset
Checking dataset: test_dataset

The longest turn_id found across all datasets is: 43


# Count number of unique intents and entities

In [10]:
def count_intents_and_entities(dataset):
    """Collect the distinct ontology entries and domains of a dataset.

    Args:
        dataset: Iterable of examples exposing ``"ontology"`` (an iterable;
            for a dict its keys are collected) and ``"domain"`` (hashable).

    Returns:
        A ``(intents, entities)`` pair of sets: the union of every example's
        ontology entries, and the set of every example's domain.
    """
    seen_intents, seen_entities = set(), set()

    for record in dataset:
        seen_intents |= set(record["ontology"])
        seen_entities |= {record["domain"]}

    return seen_intents, seen_entities

# Merge the per-split results into corpus-wide sets, then print the totals.
all_intents, all_entities = set(), set()
for dataset in (train_dataset, test_dataset):
    intents, entities = count_intents_and_entities(dataset)
    all_intents |= intents
    all_entities |= entities

print(f"Number of Unique Intents: {len(all_intents)}")
print(f"Intents: {all_intents}")
print(f"Number of Unique Entities: {len(all_entities)}")
print(f"Entities: {all_entities}")

Number of Unique Intents: 31
Intents: {'attraction-name', 'restaurant-book day', 'taxi-leaveat', 'restaurant-name', 'restaurant-book people', 'taxi-destination', 'hotel-stars', 'hotel-area', 'restaurant-food', 'hotel-pricerange', 'hospital-department', 'hotel-name', 'taxi-arriveby', 'restaurant-area', 'train-destination', 'train-arriveby', 'hotel-parking', 'train-day', 'hotel-book stay', 'train-leaveat', 'restaurant-book time', 'train-book people', 'train-departure', 'restaurant-pricerange', 'taxi-departure', 'hotel-book day', 'attraction-type', 'hotel-book people', 'hotel-internet', 'hotel-type', 'attraction-area'}
Number of Unique Entities: 7
Entities: {'restaurant', 'hotel', 'hospital', 'police', 'attraction', 'train', 'taxi'}


# Build text representations and split the data into features and target labels

In [11]:
# Hyperparameters shared by the preprocessing and dataset-building code below.
# max_seq_length and turn_id_dim match the maxima printed by the earlier
# cells (67 words, turn_id 43). The None entries are placeholders that are
# assigned once the preprocessor has been fitted.
moe_params = dict(
    embedding_dim=128,
    max_seq_length=67,
    turn_id_dim=43,
    num_experts=4,
    expert_dim=64,
    hidden_dim=32,
    vocab_size=None,
    num_entities=None,
    num_intents=None,
    num_domains=None,
    batch_size=2,
    shuffle_buffer_size=5000,
    w2v_window=10,
    w2v_min_count=3,
    w2v_sg=1,
    w2v_epochs=20,
    w2v_negative=5,
)

class DialoguePreprocessor:
    """Fit a vocabulary and label space on dialogues and emit model inputs.

    ``fit`` collects the word vocabulary, the domain labels and the ontology
    slots (called "intents" throughout this notebook) and trains Word2Vec
    vectors. ``preprocess`` turns dialogues into one feature/target/raw record
    per processed user turn. ``save`` / ``load`` round-trip the fitted state
    through a JSON file, a pickled label encoder and a gensim ``.kv`` file.
    """

    # Divisor applied to the raw turn id before it is stored in the turn-id vector.
    TURN_ID_SCALE = 40.0

    def __init__(self, moe_parameters):
        """Create an unfitted preprocessor.

        Args:
            moe_parameters: Dict providing ``max_seq_length``,
                ``embedding_dim`` and the ``w2v_*`` settings. ``turn_id_dim``
                is read when present so the instance no longer depends on a
                notebook-level global.
        """
        self.max_seq_length = moe_parameters["max_seq_length"]
        self.embedding_dim = moe_parameters["embedding_dim"]
        # Optional for backward compatibility with callers that never passed it.
        self.turn_id_dim = moe_parameters.get("turn_id_dim")
        self.w2v_window = moe_parameters["w2v_window"]
        self.w2v_min_count = moe_parameters["w2v_min_count"]
        self.w2v_sg = moe_parameters["w2v_sg"]
        self.w2v_epochs = moe_parameters["w2v_epochs"]
        self.w2v_negative = moe_parameters["w2v_negative"]

        self.word_vectors = None
        self.word_to_id = {}
        self.id_to_word = {}
        self.domain_encoder = LabelEncoder()
        self.vocab_size = 0
        self.num_domains = 0
        self.num_intents = 0
        self.slot_to_id = {}
        self.id_to_slot = {}
        self.slot_values = {}
        self.unique_domains = []
        self.unique_slots = []
        self.PAD_TOKEN = '<pad>'
        self.SOS_TOKEN = '<sos>'
        self.EOS_TOKEN = '<eos>'
        self.UNK_TOKEN = '<unk>'
        self.is_fitted = False

    def fit(self, dialogues_dataset):
        """Learn vocabulary, domains, slots and word vectors from dialogues.

        Args:
            dialogues_dataset: Iterable of dialogue dicts carrying ``domain``,
                ``ontology`` (slot -> values) and ``dialogue`` (list of turns
                with optional ``user_utterance`` / ``system_response``).

        A dialogue missing one of those keys is reported and skipped from that
        point on; whatever was collected from it before the missing key is kept.
        """
        all_words = set()
        all_domains = []
        all_slots = set()
        tokenized_sentences_for_w2v = []

        # Refitting must not inherit slot values from a previous fit.
        self.slot_values = {}

        # Initialize basic vocabulary
        self.word_to_id = {
            self.PAD_TOKEN: 0,
            self.SOS_TOKEN: 1,
            self.EOS_TOKEN: 2,
            self.UNK_TOKEN: 3
        }
        current_id = 4

        if not dialogues_dataset:
            print("Warning: dialogues_dataset is empty. Initializing with minimal vocabulary.")
        else:
            print("Collecting vocabulary, domains, and slots from dataset...")
            for dialogue in dialogues_dataset:
                try:
                    if 'domain' not in dialogue:
                        print(f"Skipping dialogue {dialogue.get('dialogue_id', 'N/A')}: Missing 'domain' key.")
                        continue
                    all_domains.append(dialogue['domain'])

                    if 'ontology' not in dialogue:
                        print(f"Skipping dialogue {dialogue.get('dialogue_id', 'N/A')}: Missing 'ontology' key.")
                        continue
                    ontology = dialogue['ontology']
                    for slot, values in ontology.items():
                        if values:
                            all_slots.add(slot)
                            # Accumulate across dialogues instead of letting
                            # the last dialogue overwrite earlier values.
                            seen_values = self.slot_values.setdefault(slot, set())
                            if isinstance(values, list):
                                seen_values.update(values)

                    if 'dialogue' not in dialogue:
                        print(f"Skipping dialogue {dialogue.get('dialogue_id', 'N/A')}: Missing 'dialogue' key.")
                        continue
                    for turn in dialogue['dialogue']:
                        for field in ('user_utterance', 'system_response'):
                            if field in turn and turn[field]:
                                tokens = word_tokenize(turn[field].lower())
                                all_words.update(tokens)
                                tokenized_sentences_for_w2v.append(tokens)
                except KeyError as e:
                    print(f"Skipping dialogue due to missing key during fit: {e}")
                    continue

            # Sorted so that word ids are reproducible from run to run.
            for word in sorted(all_words):
                if word not in self.word_to_id:
                    self.word_to_id[word] = current_id
                    current_id += 1

        self.id_to_word = {v: k for k, v in self.word_to_id.items()}
        self.vocab_size = len(self.word_to_id)

        if tokenized_sentences_for_w2v:
            print("Fitting Word2Vec model...")
            self.word_vectors = Word2Vec(
                sentences=tokenized_sentences_for_w2v,
                vector_size=self.embedding_dim,
                window=self.w2v_window,
                min_count=self.w2v_min_count,
                workers=4,
                sg=self.w2v_sg,
                epochs=self.w2v_epochs,
                negative=self.w2v_negative
            )
        else:
            print("Warning: No tokenized sentences for Word2Vec. Word vectors will not be initialized.")

        self.unique_domains = sorted(set(all_domains))
        if self.unique_domains:
            self.domain_encoder.fit(self.unique_domains)
            self.num_domains = len(self.domain_encoder.classes_)
            self.unique_domains = self.domain_encoder.classes_.tolist()
        else:
            print("Warning: No domains found in the dataset.")
            self.num_domains = 0
            # Fit on a placeholder so the encoder is in a fitted state.
            self.domain_encoder.fit(['dummy'])
        self.unique_slots = sorted(all_slots)
        if self.unique_slots:
            self.slot_to_id = {slot: idx for idx, slot in enumerate(self.unique_slots)}
            self.id_to_slot = {idx: slot for slot, idx in self.slot_to_id.items()}
            self.num_intents = len(self.slot_to_id)
        else:
            print("Warning: No slots found in the dataset.")
            self.num_intents = 0

        self.is_fitted = True
        print(f"Preprocessor fitted. Vocab size: {self.vocab_size}, Num domains: {self.num_domains}, Num intents (slots): {self.num_intents}")

    def _pad_or_truncate(self, ids):
        """Return ``ids`` as an int32 array of exactly ``max_seq_length`` items.

        Longer sequences are cut at the end; shorter ones are right-padded
        with the PAD token id.
        """
        ids = list(ids)[:self.max_seq_length]
        pad_id = self.word_to_id[self.PAD_TOKEN]
        ids.extend([pad_id] * (self.max_seq_length - len(ids)))
        return np.array(ids, dtype=np.int32)

    def _resolve_turn_id_dim(self):
        """Return the turn-id vector width, preferring the instance setting."""
        turn_id_dim = getattr(self, "turn_id_dim", None)
        if turn_id_dim is not None:
            return turn_id_dim
        # Legacy fallback: instances built without ``turn_id_dim`` (or loaded
        # from files saved before it was persisted) keep using the
        # notebook-level ``moe_params`` dict, as the original code always did.
        return moe_params["turn_id_dim"]

    def _text_to_token_ids(self, text, add_sos_eos=False):
        """Tokenize ``text`` and map it to a fixed-length id array.

        Args:
            text: Raw string; it is lower-cased before tokenization.
            add_sos_eos: When true, wrap the ids with the SOS and EOS ids
                before padding/truncation (so truncation may drop EOS).

        Returns:
            ``(ids, tokens)``: an int32 array of length ``max_seq_length`` and
            the list of tokens (without special tokens). Unknown tokens map to
            the UNK id.
        """
        tokens = word_tokenize(text.lower())
        unk_id = self.word_to_id[self.UNK_TOKEN]
        ids = [self.word_to_id.get(token, unk_id) for token in tokens]
        if add_sos_eos:
            ids = [self.word_to_id[self.SOS_TOKEN]] + ids + [self.word_to_id[self.EOS_TOKEN]]
        return self._pad_or_truncate(ids), tokens

    def _get_turn_id_embedding(self, turn_id, turn_id_dim):
        """Return a float32 vector of width ``turn_id_dim`` encoding the turn id.

        Only element 0 is populated, with ``turn_id / TURN_ID_SCALE``; every
        other element is zero. A zero-width vector is returned unchanged.
        """
        embedding = np.zeros(turn_id_dim, dtype=np.float32)
        if turn_id_dim > 0:
            embedding[0] = float(turn_id) / self.TURN_ID_SCALE
        return embedding

    def _extract_entities_and_intents(self, utterance, tokens, ontology):
        """Find ontology values that occur verbatim in the utterance tokens.

        Args:
            utterance: Raw utterance text (unused; kept for interface
                compatibility).
            tokens: Lower-cased tokens of the utterance.
            ontology: Mapping of slot name to an iterable of candidate values.

        Returns:
            ``(active_slots, entities)``: a float32 multi-hot vector of length
            ``num_intents`` flagging each slot with a matching value, and a
            list of ``(start, end, slot, value)`` spans with inclusive token
            indices (first occurrence per value only).
        """
        active_slots = np.zeros(self.num_intents, dtype=np.float32)
        entities = []
        num_tokens = len(tokens)

        for slot, values in ontology.items():
            if not values:
                continue
            slot_id = self.slot_to_id.get(slot, -1)
            if slot_id == -1:
                # Slot was not seen during fit, so it has no output position.
                continue

            for value in values:
                value_tokens = word_tokenize(value.lower())
                value_len = len(value_tokens)
                if value_len == 0:
                    # An empty pattern would "match" at index 0 and produce a
                    # bogus (0, -1) span, so skip it.
                    continue
                for i in range(num_tokens - value_len + 1):
                    if tokens[i:i + value_len] == value_tokens:
                        active_slots[slot_id] = 1.0
                        entities.append((i, i + value_len - 1, slot, value))
                        break

        return active_slots, entities

    def preprocess(self, dialogues_dataset):
        """Convert dialogues into per-turn feature/target/raw-data records.

        Args:
            dialogues_dataset: Iterable of dialogue dicts in the format
                accepted by ``fit``, additionally carrying ``dialogue_id`` and
                per-turn ``turn_id``.

        Returns:
            A list of dicts with ``features``, ``targets`` and ``raw_data``.

        Raises:
            ValueError: If the preprocessor has not been fitted.

        A dialogue that raises ``KeyError`` or ``ValueError`` part-way through
        is reported and abandoned; records already produced from it are kept.
        """
        if not self.is_fitted:
            raise ValueError("Preprocessor has not been fitted. Call .fit() first.")

        turn_id_dim = self._resolve_turn_id_dim()
        sos_id = self.word_to_id[self.SOS_TOKEN]

        processed_turns = []
        for dialogue in dialogues_dataset:
            try:
                dialogue_domain_label = self.domain_encoder.transform([dialogue['domain']])[0]
                dialogue_domain_onehot_input = tf.keras.utils.to_categorical(
                    dialogue_domain_label, num_classes=self.num_domains
                ).astype(np.float32)

                # No system response precedes the first turn: use a lone SOS
                # id plus padding. Built directly because running "<sos>"
                # through word_tokenize splits it into '<', 'sos', '>'.
                prev_system_response_tokens = self._pad_or_truncate([sos_id])

                # Walks the turn list two entries at a time, reading index i as
                # the user turn and index i + 1 as the system reply.
                for i in range(0, len(dialogue['dialogue']), 2):
                    user_turn = dialogue['dialogue'][i]
                    if 'user_utterance' not in user_turn or not user_turn['user_utterance']:
                        print(f"Skipping user turn {user_turn.get('turn_id', 'N/A')} in dialogue {dialogue.get('dialogue_id', 'N/A')} due to missing or empty 'user_utterance'.")
                        continue

                    user_utterance_tokens, tokens = self._text_to_token_ids(user_turn['user_utterance'])
                    turn_id_embedding = self._get_turn_id_embedding(user_turn['turn_id'], turn_id_dim)

                    active_slots, entities = self._extract_entities_and_intents(user_turn['user_utterance'], tokens, dialogue['ontology'])

                    next_system_response_text = ""
                    if (i + 1) < len(dialogue['dialogue']):
                        next_system_turn = dialogue['dialogue'][i + 1]
                        if 'system_response' in next_system_turn and next_system_turn['system_response']:
                            next_system_response_text = next_system_turn['system_response']

                    full_next_system_response_tokens, _ = self._text_to_token_ids(next_system_response_text, add_sos_eos=True)
                    # Decoder input is the target shifted right by one position
                    # with SOS in slot 0. Because the target itself begins with
                    # SOS, the decoder input starts with two SOS ids.
                    decoder_input_tokens_for_generation = np.roll(full_next_system_response_tokens, shift=1)
                    decoder_input_tokens_for_generation[0] = sos_id
                    response_generation_target_tokens = np.copy(full_next_system_response_tokens)

                    # NOTE(review): the same ``active_slots`` array is used as
                    # an input feature and as the intent target below.
                    features = {
                        'user_utterance_tokens': user_utterance_tokens,
                        'prev_system_response_tokens': prev_system_response_tokens,
                        'decoder_input_tokens': decoder_input_tokens_for_generation,
                        'domain_onehot_input': dialogue_domain_onehot_input,
                        'turn_id_embedding': turn_id_embedding,
                        'ontology_multihot_input': active_slots,
                    }

                    targets = {
                        'domain_classification_output': np.array([dialogue_domain_label], dtype=np.int32),
                        'intent_classification_output': active_slots,
                        'response_generation_output': response_generation_target_tokens,
                        'entities_output': entities
                    }

                    raw_data = {
                        'raw_next_system_response': next_system_response_text,
                        'dialogue_id': dialogue['dialogue_id'],
                        'turn_id': user_turn['turn_id'],
                        'raw_user_utterance': user_turn['user_utterance'],
                        'raw_domain': dialogue['domain'],
                        'raw_entities': entities
                    }

                    processed_turns.append({
                        'features': features,
                        'targets': targets,
                        'raw_data': raw_data
                    })

                    prev_system_response_tokens, _ = self._text_to_token_ids(next_system_response_text, add_sos_eos=False)

            except KeyError as e:
                print(f"Warning: Skipping dialogue {dialogue.get('dialogue_id', 'N/A')} due to missing key: {e}")
                continue
            except ValueError as e:
                print(f"Warning: Data error in dialogue {dialogue.get('dialogue_id', 'N/A')}: {e}")
                continue

        return processed_turns

    def save(self, path):
        """Persist the fitted state into directory ``path`` (created if needed).

        Writes ``domain_encoder.pkl``, ``preprocessor_params.json`` and, when
        word vectors exist, ``word_vectors.kv``.
        """
        os.makedirs(path, exist_ok=True)
        with open(os.path.join(path, "domain_encoder.pkl"), "wb") as f:
            pickle.dump(self.domain_encoder, f)
        if self.word_vectors is not None:
            # After fit() this is a Word2Vec model (vectors live under .wv);
            # after load() it is already the KeyedVectors object itself.
            keyed_vectors = getattr(self.word_vectors, "wv", self.word_vectors)
            keyed_vectors.save(os.path.join(path, "word_vectors.kv"))
        params_to_save = {
            'max_seq_length': self.max_seq_length,
            'embedding_dim': self.embedding_dim,
            'turn_id_dim': getattr(self, "turn_id_dim", None),
            'vocab_size': self.vocab_size,
            'num_domains': self.num_domains,
            'num_intents': self.num_intents,
            'unique_domains': self.unique_domains,
            'unique_slots': self.unique_slots,
            'word_to_id': {str(k): v for k, v in self.word_to_id.items()},
            'id_to_word': {str(k): v for k, v in self.id_to_word.items()},
            'slot_to_id': {str(k): v for k, v in self.slot_to_id.items()},
            'id_to_slot': {str(k): v for k, v in self.id_to_slot.items()},
            'slot_values': {k: list(v) for k, v in self.slot_values.items()},
            'PAD_TOKEN': self.PAD_TOKEN,
            'SOS_TOKEN': self.SOS_TOKEN,
            'EOS_TOKEN': self.EOS_TOKEN,
            'UNK_TOKEN': self.UNK_TOKEN,
            'w2v_window': self.w2v_window,
            'w2v_min_count': self.w2v_min_count,
            'w2v_sg': self.w2v_sg,
            'w2v_epochs': self.w2v_epochs,
            'w2v_negative': self.w2v_negative,
            'is_fitted': self.is_fitted
        }
        with open(os.path.join(path, "preprocessor_params.json"), "w") as f:
            json.dump(params_to_save, f)
        print(f"Preprocessor saved to {path}")

    @classmethod
    def load(cls, path):
        """Rebuild a preprocessor from a directory written by ``save``.

        Args:
            path: Directory holding ``preprocessor_params.json``,
                ``domain_encoder.pkl`` and optionally ``word_vectors.kv``.

        Returns:
            A ``DialoguePreprocessor`` whose ``word_vectors`` is a gensim
            ``KeyedVectors`` (or ``None`` when the ``.kv`` file is absent).
        """
        with open(os.path.join(path, "preprocessor_params.json"), "r") as f:
            params = json.load(f)
        dummy_moe_params = {
            "max_seq_length": params['max_seq_length'],
            "embedding_dim": params['embedding_dim'],
            # Absent in files written before turn_id_dim was persisted; the
            # instance then resolves it lazily at preprocess time.
            "turn_id_dim": params.get('turn_id_dim'),
            "w2v_window": params['w2v_window'],
            "w2v_min_count": params['w2v_min_count'],
            "w2v_sg": params['w2v_sg'],
            "w2v_epochs": params['w2v_epochs'],
            "w2v_negative": params['w2v_negative']
        }
        preprocessor = cls(dummy_moe_params)
        # SECURITY: pickle.load executes code embedded in the file; only load
        # encoder files that this notebook wrote itself.
        with open(os.path.join(path, "domain_encoder.pkl"), "rb") as f:
            preprocessor.domain_encoder = pickle.load(f)
        word_vectors_path = os.path.join(path, "word_vectors.kv")
        if os.path.exists(word_vectors_path):
            preprocessor.word_vectors = KeyedVectors.load(word_vectors_path)
        else:
            print(f"Warning: word_vectors.kv not found at {os.path.join(path, 'word_vectors.kv')}")
            preprocessor.word_vectors = None
        preprocessor.vocab_size = params['vocab_size']
        preprocessor.num_domains = params['num_domains']
        preprocessor.num_intents = params['num_intents']
        preprocessor.unique_domains = params['unique_domains']
        preprocessor.unique_slots = params['unique_slots']
        # JSON stringifies every key, so integer-keyed maps are converted back.
        preprocessor.word_to_id = {str(k): v for k, v in params['word_to_id'].items()}
        preprocessor.id_to_word = {int(k): v for k, v in params['id_to_word'].items()}
        preprocessor.slot_to_id = {str(k): v for k, v in params['slot_to_id'].items()}
        preprocessor.id_to_slot = {int(k): v for k, v in params['id_to_slot'].items()}
        preprocessor.slot_values = {k: set(v) for k, v in params['slot_values'].items()}
        preprocessor.PAD_TOKEN = params['PAD_TOKEN']
        preprocessor.SOS_TOKEN = params['SOS_TOKEN']
        preprocessor.EOS_TOKEN = params['EOS_TOKEN']
        preprocessor.UNK_TOKEN = params['UNK_TOKEN']
        preprocessor.is_fitted = params.get('is_fitted', False)
        return preprocessor

def create_and_save_tf_dataset(processed_data, batch_size, shuffle_buffer_size, dataset_name, save_path, moe_params_for_spec):
    """Build a batched ``tf.data.Dataset`` from processed turns and save it.

    Args:
        processed_data: List of records holding ``features``, ``targets`` and
            ``raw_data`` dicts.
        batch_size: Batch size applied before saving.
        shuffle_buffer_size: Shuffle buffer size; a falsy value disables
            shuffling.
        dataset_name: Name of the dataset sub-directory and prefix of the
            raw-data pickle.
        save_path: Directory under which both artifacts are written.
        moe_params_for_spec: Parameter dict; only consulted to build the
            element spec of the empty dataset.

    Returns:
        ``(dataset, raw_data_list)``. With empty ``processed_data`` nothing is
        written and an empty, unbatched dataset plus ``[]`` is returned.
    """
    if not processed_data:
        print(f"Warning: No processed data for {dataset_name}. Returning an empty dataset.")
        seq_len = moe_params_for_spec["max_seq_length"]
        int_sequence = tf.TensorSpec(shape=(seq_len,), dtype=tf.int32)
        empty_feature_spec = {
            'user_utterance_tokens': int_sequence,
            'prev_system_response_tokens': int_sequence,
            'decoder_input_tokens': int_sequence,
            'domain_onehot_input': tf.TensorSpec(shape=(moe_params_for_spec["num_domains"],), dtype=tf.float32),
            'turn_id_embedding': tf.TensorSpec(shape=(moe_params_for_spec["turn_id_dim"],), dtype=tf.float32),
            'ontology_multihot_input': tf.TensorSpec(shape=(moe_params_for_spec["num_intents"],), dtype=tf.float32),
        }
        empty_target_spec = {
            'domain_classification_output': tf.TensorSpec(shape=(1,), dtype=tf.int32),
            'intent_classification_output': tf.TensorSpec(shape=(moe_params_for_spec["num_intents"],), dtype=tf.float32),
            'response_generation_output': int_sequence,
        }
        empty_dataset = tf.data.Dataset.from_generator(
            lambda: [], output_signature=(empty_feature_spec, empty_target_spec)
        )
        return empty_dataset, []

    # Regroup the per-turn records into one column (list) per key.
    feature_columns = {}
    target_columns = {}
    raw_data_list = []
    for record in processed_data:
        for name, array in record['features'].items():
            feature_columns.setdefault(name, []).append(array)
        for name, array in record['targets'].items():
            # Entity spans stay out of the tensor dataset; they are still
            # available through the raw data.
            if name == 'entities_output':
                continue
            target_columns.setdefault(name, []).append(array)
        raw_data_list.append(record['raw_data'])

    features_np = {name: np.array(column) for name, column in feature_columns.items()}
    targets_np = {name: np.array(column) for name, column in target_columns.items()}

    dataset = tf.data.Dataset.from_tensor_slices((features_np, targets_np))
    if shuffle_buffer_size:
        dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    dataset_dir = os.path.join(save_path, dataset_name)
    dataset.save(dataset_dir)
    print(f"TensorFlow Dataset for {dataset_name} saved to {dataset_dir}")

    raw_data_file = os.path.join(save_path, f'{dataset_name}_raw_data.pkl')
    with open(raw_data_file, 'wb') as handle:
        pickle.dump(raw_data_list, handle)
    print(f"Raw data for {dataset_name} saved to {raw_data_file}")

    return dataset, raw_data_list

def load_tf_datasets_from_disk(load_path):
    """Load the saved datasets, MoE parameters and raw data from ``load_path``.

    Four splits are attempted: ``train``, ``test``, ``calibration_train`` and
    ``calibration_val``. A split that fails to load is replaced by an empty
    dataset, and a missing raw-data pickle by an empty list; both cases only
    print a warning.

    Args:
        load_path: Directory containing ``moe_params.json``, one dataset
            directory per split and the ``<split>_raw_data.pkl`` files.

    Returns:
        Dict with ``<split>_dataset`` entries, ``moe_params`` and ``raw_data``
        (split name -> list).
    """
    print(f"\nLoading TensorFlow Datasets and MoE parameters from {load_path}...")
    with open(os.path.join(load_path, "moe_params.json"), "r") as params_file:
        loaded_moe_params = json.load(params_file)

    seq_len = loaded_moe_params["max_seq_length"]

    def batched_spec(width, dtype):
        # Saved elements are batches, hence the leading None dimension.
        return tf.TensorSpec(shape=(None, width), dtype=dtype)

    feature_spec = {
        'user_utterance_tokens': batched_spec(seq_len, tf.int32),
        'prev_system_response_tokens': batched_spec(seq_len, tf.int32),
        'decoder_input_tokens': batched_spec(seq_len, tf.int32),
        'domain_onehot_input': batched_spec(loaded_moe_params["num_domains"], tf.float32),
        'turn_id_embedding': batched_spec(loaded_moe_params["turn_id_dim"], tf.float32),
        'ontology_multihot_input': batched_spec(loaded_moe_params["num_intents"], tf.float32),
    }
    target_spec = {
        'domain_classification_output': batched_spec(1, tf.int32),
        'intent_classification_output': batched_spec(loaded_moe_params["num_intents"], tf.float32),
        'response_generation_output': batched_spec(seq_len, tf.int32),
    }
    element_spec = (feature_spec, target_spec)

    split_names = ("train", "test", "calibration_train", "calibration_val")

    datasets = {}
    for split in split_names:
        split_dir = os.path.join(load_path, split)
        try:
            datasets[split] = tf.data.Dataset.load(split_dir, element_spec=element_spec)
            print(f"Loaded {split} dataset from {split_dir}")
        except Exception as e:
            print(f"Warning: Failed to load {split} dataset: {e}")
            datasets[split] = tf.data.Dataset.from_generator(lambda: [], output_signature=element_spec)

    raw_data = {}
    for split in split_names:
        pickle_path = os.path.join(load_path, f'{split}_raw_data.pkl')
        if not os.path.exists(pickle_path):
            print(f"Warning: Raw data file {pickle_path} not found.")
            raw_data[split] = []
            continue
        with open(pickle_path, 'rb') as handle:
            raw_data[split] = pickle.load(handle)
        print(f"Loaded raw data for {split} from {pickle_path}")

    print("TensorFlow Datasets and raw data loaded successfully.")
    result = {f"{split}_dataset": datasets[split] for split in split_names}
    result["moe_params"] = loaded_moe_params
    result["raw_data"] = raw_data
    return result

# --- Fit the preprocessor, build and save the datasets, then reload everything ---
print("\n--- Saving Preprocessed Datasets and Preprocessor Models ---")

# The vocabulary, domains and slots are learned from the training split only.
preprocessor = DialoguePreprocessor(moe_params)
preprocessor.fit(train_dataset)

# Fill in the placeholders of moe_params with the fitted sizes.
moe_params["vocab_size"] = preprocessor.vocab_size
moe_params["num_domains"] = preprocessor.num_domains
moe_params["num_intents"] = preprocessor.num_intents
# NOTE(review): num_entities is set to the number of domains — the same
# convention as the "entities" counted in the earlier cell; confirm intended.
moe_params["num_entities"] = preprocessor.num_domains

processed_train_data = preprocessor.preprocess(train_dataset)
processed_test_data = preprocessor.preprocess(test_dataset)

tf_dataset_save_path = "tf_datasets"
os.makedirs(tf_dataset_save_path, exist_ok=True)

# Only the training split is shuffled; the test split passes None.
train_tf_dataset, train_raw_data = create_and_save_tf_dataset(
    processed_train_data, moe_params["batch_size"], moe_params["shuffle_buffer_size"],
    "train", tf_dataset_save_path, moe_params
)
test_tf_dataset, test_raw_data = create_and_save_tf_dataset(
    processed_test_data, moe_params["batch_size"], None,
    "test", tf_dataset_save_path, moe_params
)

# moe_params must be written after the sizes above were filled in, because the
# loader reads them back to build its element spec.
with open(os.path.join(tf_dataset_save_path, "moe_params.json"), "w") as f:
    json.dump(moe_params, f, indent=4)
print(f"MoE parameters saved to {tf_dataset_save_path}/moe_params.json")

preprocessor_save_path = "preprocessor_models"
preprocessor.save(preprocessor_save_path)

# Round-trip check: reload everything that was just written.
print("\n--- Loading Preprocessed Datasets and Preprocessor Models ---")

loaded_data = load_tf_datasets_from_disk(tf_dataset_save_path)

loaded_preprocessor = DialoguePreprocessor.load(preprocessor_save_path)
print(f"Preprocessor loaded from {preprocessor_save_path}")

print("\n--- Verifying Loaded Data ---")
print(f"Loaded MoE Parameters: {json.dumps(loaded_data['moe_params'], indent=4)}")

# Inspect a single batch of the reloaded training dataset.
for features, targets in loaded_data["train_dataset"].take(1):
    print("\nSample from Loaded Train Dataset:")
    print("Features keys:", features.keys())
    print("Targets keys:", targets.keys())
    print("User Utterance Tokens shape:", features['user_utterance_tokens'].shape)
    print("Response Generation Output shape:", targets['response_generation_output'].shape)

print("\nLoaded Preprocessor Stats:")
print(f"Vocab size: {loaded_preprocessor.vocab_size}")
print(f"Num domains: {loaded_preprocessor.num_domains}")
print(f"Num intents (slots): {loaded_preprocessor.num_intents}")
print(f"Sample word_to_id (first 5): {dict(list(loaded_preprocessor.word_to_id.items())[:5])}")

print("\nRaw Data Sample (Train, first item):")
if loaded_data["raw_data"]["train"]:
    print(loaded_data["raw_data"]["train"][0])
else:
    print("No raw data available for train.")


--- Saving Preprocessed Datasets and Preprocessor Models ---
Collecting vocabulary, domains, and slots from dataset...
Fitting Word2Vec model...
Preprocessor fitted. Vocab size: 6132, Num domains: 7, Num intents (slots): 31


2025-07-04 06:01:26.856091: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-07-04 06:01:27.163710: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-07-04 06:01:27.163911: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

TensorFlow Dataset for train saved to tf_datasets/train
Raw data for train saved to tf_datasets/train_raw_data.pkl
TensorFlow Dataset for test saved to tf_datasets/test
Raw data for test saved to tf_datasets/test_raw_data.pkl
MoE parameters saved to tf_datasets/moe_params.json
Preprocessor saved to preprocessor_models

--- Loading Preprocessed Datasets and Preprocessor Models ---

Loading TensorFlow Datasets and MoE parameters from tf_datasets...
Loaded train dataset from tf_datasets/train
Loaded test dataset from tf_datasets/test
Loaded calibration_train dataset from tf_datasets/calibration_train
Loaded calibration_val dataset from tf_datasets/calibration_val
Loaded raw data for train from tf_datasets/train_raw_data.pkl
Loaded raw data for test from tf_datasets/test_raw_data.pkl
Loaded raw data for calibration_train from tf_datasets/calibration_train_raw_data.pkl
Loaded raw data for calibration_val from tf_datasets/calibration_val_raw_data.pkl
TensorFlow Datasets and raw data loaded s

# Load text representation parameters and preprocessed datasets.

In [12]:
def load_tf_datasets_from_disk(load_path):
    """Load the train/test datasets, MoE parameters and raw data from disk.

    Unlike a best-effort loader, this version fails loudly: a missing
    parameter file or an unloadable dataset raises instead of being replaced
    by an empty placeholder. A missing raw-data pickle only prints a warning
    and yields an empty list.

    Args:
        load_path: Directory containing ``moe_params.json``, the ``train`` and
            ``test`` dataset directories and the ``<split>_raw_data.pkl`` files.

    Returns:
        Dict with ``train_dataset``, ``test_dataset``, ``moe_params`` and
        ``raw_data`` (split name -> list).

    Raises:
        FileNotFoundError: If ``moe_params.json`` is missing.
        ValueError: If either dataset cannot be loaded.
    """
    print(f"\nLoading TensorFlow Datasets and MoE parameters from {load_path}...")
    try:
        with open(os.path.join(load_path, "moe_params.json"), "r") as f:
            loaded_moe_params = json.load(f)
    except FileNotFoundError as err:
        # Chain the original error so the failing path stays in the traceback.
        raise FileNotFoundError(f"moe_params.json not found in {load_path}") from err

    # Every saved element is a batch, hence the leading None dimension.
    # NOTE(review): the target keys below differ from the names written by
    # create_and_save_tf_dataset ('domain_classification_output', ...). They
    # are left as-is because later cells may depend on these names — confirm
    # that the rename through element_spec is intentional.
    element_spec = (
        {
            'user_utterance_tokens': tf.TensorSpec(shape=(None, loaded_moe_params["max_seq_length"]), dtype=tf.int32),
            'prev_system_response_tokens': tf.TensorSpec(shape=(None, loaded_moe_params["max_seq_length"]), dtype=tf.int32),
            'decoder_input_tokens': tf.TensorSpec(shape=(None, loaded_moe_params["max_seq_length"]), dtype=tf.int32),
            'domain_onehot_input': tf.TensorSpec(shape=(None, loaded_moe_params["num_domains"]), dtype=tf.float32),
            'turn_id_embedding': tf.TensorSpec(shape=(None, loaded_moe_params["turn_id_dim"]), dtype=tf.float32),
            'ontology_multihot_input': tf.TensorSpec(shape=(None, loaded_moe_params["num_intents"]), dtype=tf.float32),
        },
        {
            'domain_output': tf.TensorSpec(shape=(None, 1), dtype=tf.int32),
            'intent_output': tf.TensorSpec(shape=(None, loaded_moe_params["num_intents"]), dtype=tf.float32),
            'response_output': tf.TensorSpec(shape=(None, loaded_moe_params["max_seq_length"]), dtype=tf.int32)
        }
    )

    try:
        train_tf_dataset = tf.data.Dataset.load(os.path.join(load_path, "train"), element_spec=element_spec)
        test_tf_dataset = tf.data.Dataset.load(os.path.join(load_path, "test"), element_spec=element_spec)
    except Exception as e:
        raise ValueError(f"Failed to load datasets from {load_path}: {str(e)}") from e

    raw_data = {}
    for dataset_name in ['train', 'test']:
        path = os.path.join(load_path, f'{dataset_name}_raw_data.pkl')
        if os.path.exists(path):
            # SECURITY: pickle.load executes code embedded in the file; only
            # load raw-data files that this notebook wrote itself.
            with open(path, 'rb') as f:
                raw_data[dataset_name] = pickle.load(f)
            print(f"Loaded raw data for {dataset_name} from {path}")
        else:
            print(f"Warning: Raw data file {path} not found.")
            raw_data[dataset_name] = []

    return {
        "train_dataset": train_tf_dataset,
        "test_dataset": test_tf_dataset,
        "moe_params": loaded_moe_params,
        "raw_data": raw_data
    }

# Reload the saved artifacts and expose each piece under its own name.
tf_dataset_save_path = "tf_datasets"
loaded_data = load_tf_datasets_from_disk(tf_dataset_save_path)
train_tf_dataset, test_tf_dataset, moe_params, raw_data = (
    loaded_data[key]
    for key in ("train_dataset", "test_dataset", "moe_params", "raw_data")
)


Loading TensorFlow Datasets and MoE parameters from tf_datasets...
Loaded raw data for train from tf_datasets/train_raw_data.pkl
Loaded raw data for test from tf_datasets/test_raw_data.pkl
