In [None]:
# Import TensorFlow and display the installed version as the cell output.
import tensorflow as tf
tf.__version__

'2.4.1'

In [1]:
from urllib.request import urlretrieve
from pathlib import Path


# Download the amazon-massive-dataset 1.0 archive with gdown
# (the recorded output shows it being saved under /content).
!gdown https://amazon-massive-nlu-dataset.s3.amazonaws.com/amazon-massive-dataset-1.0.tar.gz
# Unpack the archive; the path is absolute, so this assumes the download
# landed in /content as in the recorded run.
!tar -xvf /content/amazon-massive-dataset-1.0.tar.gz


Downloading...
From: https://amazon-massive-nlu-dataset.s3.amazonaws.com/amazon-massive-dataset-1.0.tar.gz
To: /content/amazon-massive-dataset-1.0.tar.gz
100% 39.5M/39.5M [00:04<00:00, 8.34MB/s]
1.0/
1.0/CITATION.md
1.0/NOTICE.md
1.0/data/
1.0/data/mn-MN.jsonl
1.0/data/af-ZA.jsonl
1.0/data/el-GR.jsonl
1.0/data/ta-IN.jsonl
1.0/data/ar-SA.jsonl
1.0/data/ur-PK.jsonl
1.0/data/pl-PL.jsonl
1.0/data/ko-KR.jsonl
1.0/data/az-AZ.jsonl
1.0/data/da-DK.jsonl
1.0/data/kn-IN.jsonl
1.0/data/tl-PH.jsonl
1.0/data/is-IS.jsonl
1.0/data/lv-LV.jsonl
1.0/data/it-IT.jsonl
1.0/data/es-ES.jsonl
1.0/data/fr-FR.jsonl
1.0/data/ml-IN.jsonl
1.0/data/km-KH.jsonl
1.0/data/fa-IR.jsonl
1.0/data/sw-KE.jsonl
1.0/data/en-US.jsonl
1.0/data/tr-TR.jsonl
1.0/data/bn-BD.jsonl
1.0/data/he-IL.jsonl
1.0/data/te-IN.jsonl
1.0/data/pt-PT.jsonl
1.0/data/ka-GE.jsonl
1.0/data/ja-JP.jsonl
1.0/data/id-ID.jsonl
1.0/data/ru-RU.jsonl
1.0/data/hy-AM.jsonl
1.0/data/nb-NO.jsonl
1.0/data/ms-MY.jsonl
1.0/data/sq-AL.jsonl
1.0/data/sv-SE.jsonl
1.0/

In [7]:
import json
import os


def parse_line(item, intent_map):
    """
    Turn one record into a "token:slot token:slot ..." string plus its intent name.

    Args:
        item: mapping with the keys 'input_text' (utterance string),
            'slots' (one label per whitespace-separated token) and 'intent_id'.
        intent_map: mapping from intent id to intent name.

    Returns:
        (labeled_text, intent_name) on success, or (None, None) when the number
        of tokens and the number of slot labels differ (the record is reported
        on stdout and should be skipped by the caller).

    Raises:
        KeyError: if a required key is missing from ``item`` or the intent id
            is not present in ``intent_map``.
    """
    text, labels = item['input_text'], item['slots']
    intent_name = intent_map[item['intent_id']]  # resolved before the length check

    words = text.split()
    if len(words) == len(labels):
        # Pair every token with its slot label, in order.
        pairs = (f"{word}:{label}" for word, label in zip(words, labels))
        return ' '.join(pairs), intent_name

    # Token/label counts disagree: report the record and signal "skip".
    print(f"Skipping record due to mismatch: {words} vs {labels}")
    return None, None


def read_json_files(data_folder):
    """
    Read every ``.json`` file in ``data_folder`` and return all records as one list.

    Files are visited in sorted (alphabetical) name order so that the result is
    deterministic; ``os.listdir`` on its own returns names in arbitrary order.

    Each file is first parsed as a single JSON document: a top-level list
    contributes its elements, any other value is appended as a single record.
    If that fails with ``json.JSONDecodeError`` the file is re-read as JSON
    Lines (one JSON value per line); blank or whitespace-only lines are skipped.

    Args:
        data_folder: path of the directory to scan (not recursive).

    Returns:
        list of the decoded records, in file order then in-file order.

    Raises:
        OSError: if the folder or a file cannot be read.
        json.JSONDecodeError: if a non-blank line of a JSON Lines file is invalid.
    """
    all_data = []
    for file_name in sorted(os.listdir(data_folder)):
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(data_folder, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                # Try loading the whole file as one JSON document
                data = json.load(f)
            except json.JSONDecodeError:
                # Fall back to JSON Lines; rewind because json.load consumed the file
                f.seek(0)
                data = [json.loads(line) for line in f if line.strip()]
        if isinstance(data, list):  # list of records (or parsed JSON Lines)
            all_data.extend(data)
        else:  # a single top-level value
            all_data.append(data)
    return all_data


def create_txt_file(data_folder, intent_map, output_file):
    """
    Write one "<labeled tokens> <=> <intent>" line per valid record of a folder.

    All records are loaded with ``read_json_files`` first; each one is then
    converted with ``parse_line``. Records for which ``parse_line`` returns an
    empty/None text or intent are left out of ``output_file``.
    """
    records = read_json_files(data_folder)  # load everything before opening the output

    with open(output_file, 'w', encoding='utf-8') as out:
        for record in records:
            labeled, intent_name = parse_line(record, intent_map)
            if not (labeled and intent_name):
                continue  # invalid record, already reported by parse_line
            out.write(f"{labeled} <=> {intent_name}\n")


def create_vocab_files(train_folder, validation_folder, intent_map, intent_file, slot_file):
    """
    Build the intent and slot vocabularies from the train and validation folders.

    Every record whose token count matches its slot count contributes its
    intent name and its slot labels; mismatching records are reported on stdout
    and ignored. The label 'O' is always added to the slot vocabulary. Both
    vocabularies are written sorted, one entry per line.
    """
    intent_names, slot_labels = set(), set()

    # Collect labels from both splits
    for folder in (train_folder, validation_folder):
        for record in read_json_files(folder):
            text = record['input_text']
            token_count = len(text.split())
            labels = record['slots']
            if token_count == len(labels):
                intent_names.add(intent_map[record['intent_id']])
                slot_labels.update(labels)
            else:
                # Invalid record: report it and move on
                print(f"Skipping record in vocab creation due to mismatch: {text} vs {labels}")

    # NOTE(review): 'O' is added in upper case; the recorded run shows lower-case
    # 'o' labels in the data - confirm that both entries are intended.
    slot_labels.add('O')

    # intent.txt
    with open(intent_file, 'w', encoding='utf-8') as out:
        out.writelines(f"{name}\n" for name in sorted(intent_names))

    # slot.txt
    with open(slot_file, 'w', encoding='utf-8') as out:
        out.writelines(f"{label}\n" for label in sorted(slot_labels))


if __name__ == "__main__":
    # Dataset root: taken from the NLU_DATA_DIR environment variable when it is
    # set, otherwise the original local path is used, so existing runs behave
    # exactly as before while other machines can point at their own copy.
    data_root = os.environ.get("NLU_DATA_DIR", "/home/mh/Desktop/NLU-prj/Data-part1&2-v3")
    train_folder = os.path.join(data_root, "train")
    validation_folder = os.path.join(data_root, "validation")

    # Output files (written to the current working directory)
    train_output = "train.txt"
    validation_output = "validation.txt"
    intent_output = "intent.txt"
    slot_output = "slot.txt"

    # Intent id -> intent name. The names are emitted verbatim into the output
    # files, so they must stay byte-for-byte stable.
    intent_map = {
        10: "open_account_free",
        11: "open_account_current",
        12: "open_account_deposit",
        20: "loan_free",
        21: "loan_interest",
        30: "card2card",
        31: "paya",
        32: "convert_cheque",
        40: "receipt_payment",
        41: "installment_payment",
        50: "turnover_bill",
        51: "balance_bill",
        60: "submit_cheque",
        61: "recieve_cheque",
        70: "change_password",
        71: "duplicate_card",
        72: "close_card",
        80: "delegate_account",
        81: "currency_request",
        90: "software_problem",
        91: "signin_problem",
    }

    # Process the train folder
    print("Processing train folder...")
    create_txt_file(train_folder, intent_map, train_output)

    # Process the validation folder
    print("Processing validation folder...")
    create_txt_file(validation_folder, intent_map, validation_output)

    # Create intent.txt and slot.txt from both splits
    print("Creating intent.txt and slot.txt...")
    create_vocab_files(train_folder, validation_folder, intent_map, intent_output, slot_output)

    print("Files created: train.txt, validation.txt, intent.txt, slot.txt")

Processing train folder...
Skipping record due to mismatch: ['همه', 'کارهای', 'مهاجرتم', 'رو', 'انجام', 'دادم', 'مونده', 'فقط', 'گرفتن', 'ارز.', 'به', '3200', 'دلار', 'واسه', 'کشور', 'استرالیا', 'نیاز', 'دارم.', 'برام', 'ردیفش', 'کن', 'لطفاً'] vs ['o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'b-amount', 'b-currency', 'o', 'b-country', 'o', 'o', 'o', 'o', 'o', 'o']
Processing validation folder...
Creating intent.txt and slot.txt...
Skipping record in vocab creation due to mismatch: همه کارهای مهاجرتم رو انجام دادم مونده فقط گرفتن ارز. به 3200 دلار واسه کشور استرالیا نیاز دارم. برام ردیفش کن لطفاً vs ['o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'b-amount', 'b-currency', 'o', 'b-country', 'o', 'o', 'o', 'o', 'o', 'o']
Files created: train.txt, validation.txt, intent.txt, slot.txt


## Edit based on order of files

In [37]:
import json
import os


def parse_line(item, intent_map):
    """
    Convert a single record into its "token:slot ..." text and intent name.

    ``item`` must provide 'input_text', 'slots' and 'intent_id';
    ``intent_map`` maps intent ids to intent names. The utterance is split on
    whitespace and each token is paired with the slot label at the same
    position.

    Returns (labeled_text, intent_name), or (None, None) after printing a
    message when the token count and slot count differ. A missing key or an
    unknown intent id raises KeyError.
    """
    utterance, slot_labels = item['input_text'], item['slots']
    intent_name = intent_map[item['intent_id']]  # looked up before validation

    pieces = utterance.split()
    if len(pieces) == len(slot_labels):
        # One "token:slot" entry per position, joined by single spaces
        labeled = ' '.join(
            f"{piece}:{label}" for piece, label in zip(pieces, slot_labels)
        )
        return labeled, intent_name

    # Lengths disagree: log the record and tell the caller to skip it
    print(f"Skipping record due to mismatch: {pieces} vs {slot_labels}")
    return None, None


def read_json_files(data_folder):
    """
    Read all ``.json`` files in the folder (sorted alphabetically) and return a list of records.

    Each file is first parsed as a single JSON document: a top-level list
    contributes its elements, any other value is appended as one record. If
    that fails with ``json.JSONDecodeError`` the file is re-read as JSON Lines
    (one JSON value per line); blank or whitespace-only lines are skipped
    instead of aborting the whole load.

    Args:
        data_folder: path of the directory to scan (not recursive).

    Returns:
        list of the decoded records, in file-name order then in-file order.

    Raises:
        OSError: if the folder or a file cannot be read.
        json.JSONDecodeError: if a non-blank line of a JSON Lines file is invalid.
    """
    all_data = []
    # Sort files alphabetically so the record order is reproducible
    sorted_files = sorted(os.listdir(data_folder))
    for file_name in sorted_files:
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(data_folder, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                # Try loading the whole file as one JSON document
                data = json.load(f)
            except json.JSONDecodeError:
                # Fall back to JSON Lines; rewind because json.load consumed the file
                f.seek(0)
                data = [json.loads(line) for line in f if line.strip()]
        if isinstance(data, list):  # list of records (or parsed JSON Lines)
            all_data.extend(data)
        else:  # a single top-level value
            all_data.append(data)
    return all_data


def create_txt_file(data_folder, intent_map, output_file):
    """
    Produce the text dataset file (e.g. train.txt or validation.txt) for one folder.

    Records are loaded via ``read_json_files`` and converted via ``parse_line``;
    every record that yields a non-empty text and intent becomes one
    "<labeled tokens> <=> <intent>" line in ``output_file``.
    """
    records = read_json_files(data_folder)  # fully loaded before the output is opened

    with open(output_file, 'w', encoding='utf-8') as sink:
        for record in records:
            text, intent_name = parse_line(record, intent_map)
            if text and intent_name:
                sink.write(f"{text} <=> {intent_name}\n")
            # otherwise: invalid record, skipped


def create_vocab_files(train_folder, validation_folder, intent_map, intent_file, slot_file):
    """
    Write the intent vocabulary and the slot vocabulary gathered from both splits.

    Only records whose whitespace token count equals their slot count are
    used; the others are reported on stdout and ignored. 'O' is always part of
    the slot vocabulary. Each vocabulary is written sorted, one entry per line.
    """
    seen_intents = set()
    seen_slots = set()

    # Walk the train split first, then the validation split
    for split_folder in (train_folder, validation_folder):
        for entry in read_json_files(split_folder):
            utterance = entry['input_text']
            n_tokens = len(utterance.split())
            entry_slots = entry['slots']
            if n_tokens != len(entry_slots):
                # Invalid record: report and skip
                print(f"Skipping record in vocab creation due to mismatch: {utterance} vs {entry_slots}")
                continue

            seen_intents.add(intent_map[entry['intent_id']])
            seen_slots.update(entry_slots)

    # NOTE(review): 'O' is added in upper case; the recorded run shows lower-case
    # 'o' labels in the data - confirm that both entries are intended.
    seen_slots.add('O')

    # Intent vocabulary
    with open(intent_file, 'w', encoding='utf-8') as sink:
        sink.writelines(f"{intent_name}\n" for intent_name in sorted(seen_intents))

    # Slot vocabulary
    with open(slot_file, 'w', encoding='utf-8') as sink:
        sink.writelines(f"{slot_label}\n" for slot_label in sorted(seen_slots))


if __name__ == "__main__":
    # Dataset root: taken from the NLU_DATA_DIR environment variable when it is
    # set, otherwise the original local path is used, so existing runs behave
    # exactly as before while other machines can point at their own copy.
    data_root = os.environ.get("NLU_DATA_DIR", "/home/mh/Desktop/NLU-prj/Data-part1&2-v3")
    train_folder = os.path.join(data_root, "train")
    validation_folder = os.path.join(data_root, "validation")

    # Output files (written to the current working directory)
    train_output = "train.txt"
    validation_output = "validation.txt"
    intent_output = "intent.txt"
    slot_output = "slot.txt"

    # Intent id -> intent name. The names are emitted verbatim into the output
    # files, so they must stay byte-for-byte stable.
    intent_map = {
        10: "open_account_free",
        11: "open_account_current",
        12: "open_account_deposit",
        20: "loan_free",
        21: "loan_interest",
        30: "card2card",
        31: "paya",
        32: "convert_cheque",
        40: "receipt_payment",
        41: "installment_payment",
        50: "turnover_bill",
        51: "balance_bill",
        60: "submit_cheque",
        61: "recieve_cheque",
        70: "change_password",
        71: "duplicate_card",
        72: "close_card",
        80: "delegate_account",
        81: "currency_request",
        90: "software_problem",
        91: "signin_problem",
    }

    # Process the train folder
    print("Processing train folder...")
    create_txt_file(train_folder, intent_map, train_output)

    # Process the validation folder
    print("Processing validation folder...")
    create_txt_file(validation_folder, intent_map, validation_output)

    # Create intent.txt and slot.txt from both splits
    print("Creating intent.txt and slot.txt...")
    create_vocab_files(train_folder, validation_folder, intent_map, intent_output, slot_output)

    print("Files created: train.txt, validation.txt, intent.txt, slot.txt")

Processing train folder...
Skipping record due to mismatch: ['همه', 'کارهای', 'مهاجرتم', 'رو', 'انجام', 'دادم', 'مونده', 'فقط', 'گرفتن', 'ارز.', 'به', '3200', 'دلار', 'واسه', 'کشور', 'استرالیا', 'نیاز', 'دارم.', 'برام', 'ردیفش', 'کن', 'لطفاً'] vs ['o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'b-amount', 'b-currency', 'o', 'b-country', 'o', 'o', 'o', 'o', 'o', 'o']
Processing validation folder...
Creating intent.txt and slot.txt...
Skipping record in vocab creation due to mismatch: همه کارهای مهاجرتم رو انجام دادم مونده فقط گرفتن ارز. به 3200 دلار واسه کشور استرالیا نیاز دارم. برام ردیفش کن لطفاً vs ['o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'b-amount', 'b-currency', 'o', 'b-country', 'o', 'o', 'o', 'o', 'o', 'o']
Files created: train.txt, validation.txt, intent.txt, slot.txt


## Normalizing dataset

In [38]:
def normalize_digits(text, to_persian=False):
    """
    Replace digits in ``text`` with their counterpart in the other script.

    By default Persian digits are converted to English (ASCII) digits; pass
    ``to_persian=True`` for the opposite direction. All other characters are
    left unchanged. Returns the converted string.
    """
    english_digits = "0123456789"
    persian_digits = "۰۱۲۳۴۵۶۷۸۹"

    # Choose the translation direction, then apply it in a single pass.
    if to_persian:
        source, target = english_digits, persian_digits
    else:
        source, target = persian_digits, english_digits
    return text.translate(str.maketrans(source, target))


def normalize_txt_file_inplace(file_path, to_persian=False):
    """
    Rewrite a UTF-8 text file with its digits normalized.

    The file is read completely, every line is passed through
    ``normalize_digits`` (Persian -> English by default, the reverse when
    ``to_persian=True``), and the result is written back to the same path.
    A confirmation message is printed afterwards.
    """
    # Load the current content
    with open(file_path, 'r', encoding='utf-8') as source:
        original_lines = source.readlines()

    # Convert everything before the file is reopened for writing
    converted_lines = []
    for line in original_lines:
        converted_lines.append(normalize_digits(line, to_persian))

    # Overwrite the same file with the converted content
    with open(file_path, 'w', encoding='utf-8') as target:
        target.write(''.join(converted_lines))

    print(f"Normalized digits in {file_path} (in-place).")


if __name__ == "__main__":
    # The .txt files whose digits are normalized
    train_file = "train.txt"
    validation_file = "validation.txt"

    # Convert Persian digits to English digits in each file, train first
    print("Normalizing digits in .txt files...")
    for txt_path in (train_file, validation_file):
        normalize_txt_file_inplace(txt_path, to_persian=False)

Normalizing digits in .txt files...
Normalized digits in train.txt (in-place).
Normalized digits in validation.txt (in-place).
