In [40]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fuladata/fulbe_adlam.txt
/kaggle/input/fuladata/Sustainability.txt
/kaggle/input/fuladata/binndi_adlam.txt
/kaggle/input/fuladata/peace_adlam.txt
/kaggle/input/fuladata/Sustainability_adlam.txt
/kaggle/input/fuladata/peace.txt
/kaggle/input/fuladata/pulaar_adlam.txt
/kaggle/input/fuladata/tech_adlam.txt
/kaggle/input/fuladata/teddungal_latin.txt
/kaggle/input/fuladata/teddungal_adlam.txt
/kaggle/input/fuladata/fulbe.txt
/kaggle/input/fuladata/tech.txt
/kaggle/input/fuladata/binndi_latin.txt
/kaggle/input/fuladata/pulaar_latin.txt


In [26]:
rm -rf /kaggle/working/adlam_texts_fixed


In [36]:
import os
import re # Keep re imported, might be useful for future cleaning steps

# --- Configuration ---
# Path to your folder with original .txt files
input_folder = "/kaggle/input/fuladata"
# Path where processed individual files will be saved
output_folder = "/kaggle/working/adlam"
# Name for the final combined file
combined_filename = "combined_adlam_output.txt"
# Path for the final combined file
combined_output_path = os.path.join(output_folder, combined_filename)

# --- RTL Marker ---
# Unicode marker for Right-to-Left scripts (important for proper display)
rtl_marker = "\u200F"

# --- Helper Functions ---

def is_adlam_file(filename):
    """Check if a file is an Adlam text file based on its suffix."""
    return filename.endswith("_adlam.txt")

def clean_and_format_rtl_text(lines):
    """
    Cleans lines, combines them into paragraphs, and formats for RTL display.

    Args:
        lines (list): A list of strings, where each string is a line from the input file.

    Returns:
        list: A list of strings, where each string is a processed paragraph
              prepended with the RTL marker and ending with a newline.
    """
    cleaned_paragraphs = []
    current_paragraph = ""
    for line in lines:
        stripped_line = line.strip()
        if stripped_line:
            # If the line has content, add it to the current paragraph
            # Add a space separator if the paragraph already has content
            if current_paragraph:
                current_paragraph += " " + stripped_line
            else:
                current_paragraph = stripped_line
        else:
            # If the line is empty and we have content in current_paragraph,
            # it marks the end of a paragraph.
            if current_paragraph:
                # Add RTL marker at the beginning and a newline at the end
                cleaned_paragraphs.append(rtl_marker + current_paragraph + "\n")
                current_paragraph = "" # Reset for the next paragraph

    # Add the last paragraph if the file doesn't end with a blank line
    if current_paragraph:
        cleaned_paragraphs.append(rtl_marker + current_paragraph + "\n")

    return cleaned_paragraphs

def process_adlam_file(input_path, output_path):
    """
    Reads an Adlam file, cleans/formats it for RTL, and saves the result.

    Args:
        input_path (str): Path to the input Adlam text file.
        output_path (str): Path where the processed file should be saved.
    """
    try:
        with open(input_path, "r", encoding="utf-8") as f_in:
            lines = f_in.readlines()

        processed_paragraphs = clean_and_format_rtl_text(lines)

        # Ensure the output directory exists (though created earlier, good practice)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f_out:
            f_out.writelines(processed_paragraphs)

        print(f"✅ Processed and saved RTL formatted: {os.path.basename(output_path)}")
        return True # Indicate success

    except FileNotFoundError:
        print(f"❌ Error: Input file not found: {input_path}")
        return False
    except Exception as e:
        print(f"❌ Error processing file {input_path}: {e}")
        return False

# --- Main Processing Logic ---

# 1. Create the output directory if it doesn't exist
os.makedirs(output_folder, exist_ok=True)
print(f"Output directory set to: {output_folder}")

processed_files_list = []

# 2. Process each Adlam file individually
print("\n--- Processing Individual Files ---")
if not os.path.exists(input_folder):
     print(f"❌ Error: Input folder not found: {input_folder}")
else:
    for filename in os.listdir(input_folder):
        input_path = os.path.join(input_folder, filename)
        if os.path.isfile(input_path) and is_adlam_file(filename):
            output_path = os.path.join(output_folder, filename)
            if process_adlam_file(input_path, output_path):
                 processed_files_list.append(output_path) # Keep track of successfully processed files
        elif os.path.isfile(input_path):
            print(f"⏩ Skipped (not an Adlam file): {filename}")
        # Optional: handle directories if needed, otherwise ignore them

# 3. Combine processed files into a single file
print("\n--- Combining Processed Files ---")
if not processed_files_list:
    print("❓ No Adlam files were processed. Cannot create combined file.")
else:
    try:
        with open(combined_output_path, "w", encoding="utf-8") as f_combined:
            print(f"📝 Creating combined file: {combined_output_path}")
            for file_path in processed_files_list:
                try:
                    with open(file_path, "r", encoding="utf-8") as f_processed:
                        content = f_processed.read()
                        f_combined.write(content)
                        # Add an extra newline between content of different files
                        # This helps separate the text from different original files
                        f_combined.write("\n")
                    print(f"  + Added content from: {os.path.basename(file_path)}")
                except Exception as e:
                     print(f"  ❌ Error reading processed file {os.path.basename(file_path)}: {e}. Skipping this file.")

        print(f"🎉 Successfully combined {len(processed_files_list)} processed files into: {combined_filename}")

    except Exception as e:
        print(f"❌ Error creating combined file {combined_output_path}: {e}")

print("\n--- Script Finished ---")

Output directory set to: /kaggle/working/adlam

--- Processing Individual Files ---
✅ Processed and saved RTL formatted: fulbe_adlam.txt
⏩ Skipped (not an Adlam file): Sustainability.txt
✅ Processed and saved RTL formatted: binndi_adlam.txt
✅ Processed and saved RTL formatted: peace_adlam.txt
✅ Processed and saved RTL formatted: Sustainability_adlam.txt
⏩ Skipped (not an Adlam file): peace.txt
✅ Processed and saved RTL formatted: pulaar_adlam.txt
✅ Processed and saved RTL formatted: tech_adlam.txt
⏩ Skipped (not an Adlam file): teddungal_latin.txt
✅ Processed and saved RTL formatted: teddungal_adlam.txt
⏩ Skipped (not an Adlam file): fulbe.txt
⏩ Skipped (not an Adlam file): tech.txt
⏩ Skipped (not an Adlam file): binndi_latin.txt
⏩ Skipped (not an Adlam file): pulaar_latin.txt

--- Combining Processed Files ---
📝 Creating combined file: /kaggle/working/adlam/combined_adlam_output.txt
  + Added content from: fulbe_adlam.txt
  + Added content from: binndi_adlam.txt
  + Added content from

In [43]:

with open("/kaggle/working/adlam/combined_adlam_output.txt", "r", encoding="utf-8") as f:
            adlam = f.read()


In [61]:
! pip install fire

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting fire
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.7.0-py3-none-any.whl size=114249 sha256=9146311f96ca21a2534a06b5688ddf4fdc558098604a58f7681b3fe7fd2b7d24
  Stored in directory: /root/.cache/pip/wheels/46/54/24/1624fd5b8674eb1188623f7e8e17cdf7c0f6c24b609dfb8a89
Successfully built fire
Installing collected packages: fire
Successfully installed fire-0.7.0


In [68]:
# adlam_tokenizer_training.py

import fire
import os
import sentencepiece as spm

def train_tokenizer(
    data_file: str,
    save_path: str,
    vocab_size: int = 16000,
    num_threads: int = 8
):
    """
    Train a SentencePiece BPE tokenizer on Adlam Fula data.

    Args:
        data_file (str): Path to the combined text file.
        save_path (str): Directory where the tokenizer model will be saved.
        vocab_size (int, optional): Size of the tokenizer vocabulary. Defaults to 16000.
        num_threads (int, optional): Number of CPU threads to use. Defaults to 8.
    """
    os.makedirs(save_path, exist_ok=True)
    tokenizer_name = os.path.join(save_path, "tokenizer")

    spm.SentencePieceTrainer.train(
        input=data_file,
        model_prefix=tokenizer_name,
        vocab_size=vocab_size,
        num_threads=num_threads,
        model_type="bpe",
        max_sentence_length=1073741824,
        shuffle_input_sentence=True,
        character_coverage=1.0,
        hard_vocab_limit=False,
    )

 


In [69]:
data_file = "/kaggle/working/adlam/combined_adlam_output.txt"
output = "/kaggle/working/adlam_tokenizer"
train_tokenizer(data_file,output)

In [86]:
from transformers import PreTrainedTokenizerFast

def load_adlam_tokenizer():
    """
    Load the trained Adlam tokenizer from the specified path.

    Args:
        tokenizer_path (str): Path to the SentencePiece `.model` file (without extension if using model prefix).
    Returns:
        PreTrainedTokenizerFast: Hugging Face-compatible tokenizer
    """
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file="/kaggle/working/adlam_fula_tokenizer/tokenizer.json",  # SentencePiece model
        unk_token="<unk>",
        pad_token="<pad>",
        bos_token="<s>",
        eos_token="</s>",
    )

    print("✅ Tokenizer loaded successfully!")
    return tokenizer


In [92]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file="/kaggle/working/adlam_fula_tokenizer/tokenizer.json")


In [95]:
# 🔍 Test a sample Adlam sentence
#sample = "𞤲𞥋𞤶𞤫𞤤 𞤢𞤧𞤼𞤫 𞤩𞤮 𞤲𞤢𞥄𞤪𞤢𞤤"
print("Tokens:", tokenizer.tokenize(sample))

Tokens: ['𞤉', '𞤺𞤭𞤤𞤢', '𞤲', "'", '𞤣𞤫𞤲', '[UNK]', '𞤣𞤢𞤪𞤼𞤮𞤤', '𞤧𞤵𞤳𞤢', '𞤣𞤫𞤦𞤮', '𞤮', '𞤫', '𞤲𞤢𞤺𞤫', '𞤥𞤵𞤥', '𞤤𞤫𞤱𞤪𞤵', '𞤭𞤲𞤢', '𞤴𞤢𞤸𞤪𞤢', '𞤫', '𞤣𞤵𞤩𞤭', '𞥑𞥘', '[UNK]', '𞤭𞤲𞤢', '𞤲', "'", '𞤶𞤵𞤩𞤭𞤲𞤫', '𞤫', '𞤶𞤭𞤥𞤯𞤭', '𞤬𞤵𞤤𞤩𞤫', '𞤶𞤭𞤥𞤯𞤭', '𞤤𞤫𞤧', '𞤢𞤧𞤢𞤥𞤢𞤲', '𞤳𞤮𞤣𞤫', '.']


In [94]:
# Example usage
adlam_tokenizer = load_adlam_tokenizer()

# 🔍 Test a sample Adlam sentence
sample = "𞤉 𞤺𞤭𞤤𞤢 𞤲'𞤣𞤫𞥅𞤲, 𞤣𞤢𞥄𞤪𞤼𞤮𞤤 𞤧𞤵𞤳𞤢 𞤣𞤫𞤦𞥆𞤮 𞤮𞥅 𞤫 𞤲𞤢𞤺𞥆𞤫 𞤥𞤵𞤥 𞤤𞤫𞤱𞤪𞤵 𞤭𞤲𞤢 𞤴𞤢𞤸𞤪𞤢 𞤫 𞤣𞤵𞥅𞤩𞤭 𞥑𞥘, 𞤭𞤲𞤢 𞤲'𞤶𞤵𞤩𞥆𞤭𞤲𞤫𞥅 𞤫 𞤶𞤭𞤥𞤯𞤭 𞤬𞤵𞤤𞤩𞤫 𞤶𞤭𞤥𞤯𞤭 𞤤𞤫𞤧 𞤢𞤧𞤢𞤥𞤢𞥄𞤲 𞤳𞤮𞥅𞤣𞤫."
print("Tokens:", adlam_tokenizer.tokenize(sample))

✅ Tokenizer loaded successfully!
Tokens: ['𞤉', '𞤺𞤭𞤤𞤢', '𞤲', "'", '𞤣𞤫𞤲', '[UNK]', '𞤣𞤢𞤪𞤼𞤮𞤤', '𞤧𞤵𞤳𞤢', '𞤣𞤫𞤦𞤮', '𞤮', '𞤫', '𞤲𞤢𞤺𞤫', '𞤥𞤵𞤥', '𞤤𞤫𞤱𞤪𞤵', '𞤭𞤲𞤢', '𞤴𞤢𞤸𞤪𞤢', '𞤫', '𞤣𞤵𞤩𞤭', '𞥑𞥘', '[UNK]', '𞤭𞤲𞤢', '𞤲', "'", '𞤶𞤵𞤩𞤭𞤲𞤫', '𞤫', '𞤶𞤭𞤥𞤯𞤭', '𞤬𞤵𞤤𞤩𞤫', '𞤶𞤭𞤥𞤯𞤭', '𞤤𞤫𞤧', '𞤢𞤧𞤢𞤥𞤢𞤲', '𞤳𞤮𞤣𞤫', '.']


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/adlam_tokenizer")


In [97]:
#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
login(token=" ")

In [118]:
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# 1. Load LLaMA tokenizer
llama_tokeniser = AutoTokenizer.from_pretrained("cawoylel/firtanam") 
# Replace with LLaMA model name

# 2. Load Adlam tokenizer (make sure it’s in the Hugging Face-compatible format, such as tokenizer.json)
adlam_tokenizer = PreTrainedTokenizerFast.from_pretrained("/kaggle/working/adlam_fula_tokenizer")

# 3. Check current vocab size
print(f"Current LLaMA tokenizer vocab size: {len(llama_tokenizer)}")
print(f"Current Adlam tokenizer vocab size: {len(adlam_tokenizer)}")


Current LLaMA tokenizer vocab size: 160155
Current Adlam tokenizer vocab size: 32000


In [119]:
# Test with some sample text in Adlam and LLaMA
sample_adlam_text = "𞤢 𞤲'𞤶𞤵𞤩𞥆𞤭𞤲𞤫𞥅 𞤫 𞤶𞤭𞤥𞤯𞤭 𞤬𞤵𞤤𞤩𞤫 𞤶𞤭𞤥𞤯𞤭 𞤤𞤫𞤧 𞤢𞤧𞤢𞤥𞤢𞥄𞤲 𞤳𞤮𞥅𞤣𞤫.𞤑𞤮𞤲𞤮 𞤴𞤮𞤲𞤼𞤢𞥄𞤶𞤭 𞤴𞤮𞥅𞤪𞤮 𞤺𞤮𞥅𞤼𞤮, 𞤥𞤢𞥄𞤴𞤮𞥅𞤶𞤭 𞤯𞤭𞥅 𞤩𞤵𞥅𞤩𞤼𞤭𞥅, 𞤸𞤵𞤯𞤮 𞤩𞤵𞥅𞤩𞤼𞤭𞥅."
tokenized_adlam = llama_tokeniser(sample_adlam_text)
print("Tokenized Adlam Text:", tokenized_adlam)

Tokenized Adlam Text: {'input_ids': [128000, 172, 252, 97, 95, 109697, 252, 97, 110, 6, 172, 252, 97, 114, 172, 252, 97, 113, 172, 252, 97, 102, 172, 252, 98, 228, 172, 252, 97, 255, 172, 252, 97, 110, 172, 252, 97, 104, 172, 252, 98, 227, 109697, 252, 97, 104, 109697, 252, 97, 114, 172, 252, 97, 255, 172, 252, 97, 98, 172, 252, 97, 107, 172, 252, 97, 255, 109697, 252, 97, 105, 172, 252, 97, 113, 172, 252, 97, 97, 172, 252, 97, 102, 172, 252, 97, 104, 109697, 252, 97, 114, 172, 252, 97, 255, 172, 252, 97, 98, 172, 252, 97, 107, 172, 252, 97, 255, 109697, 252, 97, 97, 172, 252, 97, 104, 172, 252, 97, 100, 109697, 252, 97, 95, 172, 252, 97, 100, 172, 252, 97, 95, 172, 252, 97, 98, 172, 252, 97, 95, 172, 252, 98, 226, 172, 252, 97, 110, 109697, 252, 97, 111, 172, 252, 97, 106, 172, 252, 98, 227, 172, 252, 97, 96, 172, 252, 97, 104, 13, 172, 252, 100701, 172, 252, 97, 106, 172, 252, 97, 110, 172, 252, 97, 106, 109697, 252, 97, 112, 172, 252, 97, 106, 172, 252, 97, 110, 172, 252, 97, 120, 1

In [104]:
# Get vocabularies from both tokenizers
llama_vocab = llama_tokenizer.get_vocab()
adlam_vocab = adlam_tokenizer.get_vocab()

In [107]:
# Merge vocabularies by adding all Adlam tokens to LLaMA's vocabulary
combined_vocab = llama_vocab.copy()
combined_vocab.update(adlam_vocab)  # Add all Adlam tokens

In [108]:
# Update LLaMA tokenizer with the combined vocab
llama_tokenizer.add_tokens(list(adlam_vocab.keys()))


32000

In [109]:
# Add special tokens if necessary
special_tokens = {
    'unk_token': '<unk>',
    'pad_token': '<pad>',
    'bos_token': '<bos>',
    'eos_token': '<eos>',
}

llama_tokenizer.add_special_tokens(special_tokens)

4

In [110]:
# Save the extended tokenizer
output_dir = "/kaggle/working/extended_llama_adlam_tokenizer"
llama_tokenizer.save_pretrained(output_dir)

print("Extended LLaMA tokenizer with all Adlam tokens saved at:", output_dir)

Extended LLaMA tokenizer with all Adlam tokens saved at: /kaggle/working/extended_llama_adlam_tokenizer


In [113]:
# Test with some sample text in Adlam and LLaMA
sample_adlam_text = "𞤢 𞤲'𞤶𞤵𞤩𞥆𞤭𞤲𞤫𞥅 𞤫 𞤶𞤭𞤥𞤯𞤭 𞤬𞤵𞤤𞤩𞤫 𞤶𞤭𞤥𞤯𞤭 𞤤𞤫𞤧 𞤢𞤧𞤢𞤥𞤢𞥄𞤲 𞤳𞤮𞥅𞤣𞤫.𞤑𞤮𞤲𞤮 𞤴𞤮𞤲𞤼𞤢𞥄𞤶𞤭 𞤴𞤮𞥅𞤪𞤮 𞤺𞤮𞥅𞤼𞤮, 𞤥𞤢𞥄𞤴𞤮𞥅𞤶𞤭 𞤯𞤭𞥅 𞤩𞤵𞥅𞤩𞤼𞤭𞥅, 𞤸𞤵𞤯𞤮 𞤩𞤵𞥅𞤩𞤼𞤭𞥅."
tokenized_adlam = llama_tokenizer(sample_adlam_text)
print("Tokenized Adlam Text:", tokenized_adlam)

sample_llama_text = "E gila ndeen, daartol suka debbo oo e nagge mum lewru ina yahra e duuɓi 18, ina njuɓɓinee e jimɗi fulɓe jimɗi les asamaan koode."
tokenized_llama = llama_tokenizer(sample_llama_text)
print("Tokenized LLaMA Text:", tokenized_llama)


Tokenized Adlam Text: {'input_ids': [128000, 135065, 220, 141096, 6, 134527, 172, 252, 98, 228, 151709, 172, 252, 98, 227, 220, 136933, 220, 140572, 220, 139655, 220, 140572, 220, 131606, 220, 150108, 133775, 172, 252, 98, 226, 141096, 220, 139015, 172, 252, 98, 227, 138603, 13, 129421, 220, 139548, 172, 252, 98, 226, 149158, 220, 156422, 172, 252, 98, 227, 144009, 220, 154500, 172, 252, 98, 227, 144303, 11, 220, 133775, 172, 252, 98, 226, 156422, 172, 252, 98, 227, 149158, 220, 148538, 172, 252, 98, 227, 220, 135708, 172, 252, 98, 227, 143486, 172, 252, 98, 227, 11, 220, 144591, 220, 135708, 172, 252, 98, 227, 143486, 172, 252, 98, 227, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Token

In [114]:
# Decode the tokens back into text
decoded_adlam = llama_tokenizer.decode(tokenized_adlam['input_ids'])
print("Decoded Adlam Text:", decoded_adlam)

decoded_llama = llama_tokenizer.decode(tokenized_llama['input_ids'])
print("Decoded LLaMA Text:", decoded_llama)


Decoded Adlam Text: <|begin_of_text|>𞤢 𞤲'𞤶𞤵𞤩𞥆𞤭𞤲𞤫𞥅 𞤫 𞤶𞤭𞤥𞤯𞤭 𞤬𞤵𞤤𞤩𞤫 𞤶𞤭𞤥𞤯𞤭 𞤤𞤫𞤧 𞤢𞤧𞤢𞤥𞤢𞥄𞤲 𞤳𞤮𞥅𞤣𞤫.𞤑𞤮𞤲𞤮 𞤴𞤮𞤲𞤼𞤢𞥄𞤶𞤭 𞤴𞤮𞥅𞤪𞤮 𞤺𞤮𞥅𞤼𞤮, 𞤥𞤢𞥄𞤴𞤮𞥅𞤶𞤭 𞤯𞤭𞥅 𞤩𞤵𞥅𞤩𞤼𞤭𞥅, 𞤸𞤵𞤯𞤮 𞤩𞤵𞥅𞤩𞤼𞤭𞥅.
Decoded LLaMA Text: <|begin_of_text|>E gila ndeen, daartol suka debbo oo e nagge mum lewru ina yahra e duuɓi 18, ina njuɓɓinee e jimɗi fulɓe jimɗi les asamaan koode.


In [115]:
# Encoding the Adlam text
encoded_adlam = llama_tokenizer.encode(sample_adlam_text)
print("Encoded Adlam Text:", encoded_adlam)

# Encoding the LLaMA text
encoded_llama = llama_tokenizer.encode(sample_llama_text)
print("Encoded LLaMA Text:", encoded_llama)


Encoded Adlam Text: [128000, 135065, 220, 141096, 6, 134527, 172, 252, 98, 228, 151709, 172, 252, 98, 227, 220, 136933, 220, 140572, 220, 139655, 220, 140572, 220, 131606, 220, 150108, 133775, 172, 252, 98, 226, 141096, 220, 139015, 172, 252, 98, 227, 138603, 13, 129421, 220, 139548, 172, 252, 98, 226, 149158, 220, 156422, 172, 252, 98, 227, 144009, 220, 154500, 172, 252, 98, 227, 144303, 11, 220, 133775, 172, 252, 98, 226, 156422, 172, 252, 98, 227, 149158, 220, 148538, 172, 252, 98, 227, 220, 135708, 172, 252, 98, 227, 143486, 172, 252, 98, 227, 11, 220, 144591, 220, 135708, 172, 252, 98, 227, 143486, 172, 252, 98, 227, 13]
Encoded LLaMA Text: [128000, 36, 342, 72, 75, 64, 220, 77, 67, 2176, 77, 11, 294, 64, 64, 3423, 78, 75, 220, 82, 84, 74, 64, 294, 68, 6194, 78, 220, 2689, 220, 68, 220, 77, 64, 14736, 68, 296, 84, 76, 326, 68, 19239, 84, 220, 72, 77, 64, 220, 88, 64, 4171, 64, 220, 68, 294, 84, 84, 133, 241, 72, 220, 972, 11, 220, 72, 77, 64, 220, 77, 73, 84, 133, 241, 133, 241, 7

In [116]:
# Decoding the encoded Adlam text
decoded_adlam = llama_tokenizer.decode(encoded_adlam)
print("Decoded Adlam Text:", decoded_adlam)

# Decoding the encoded LLaMA text
decoded_llama = llama_tokenizer.decode(encoded_llama)
print("Decoded LLaMA Text:", decoded_llama)


Decoded Adlam Text: <|begin_of_text|>𞤢 𞤲'𞤶𞤵𞤩𞥆𞤭𞤲𞤫𞥅 𞤫 𞤶𞤭𞤥𞤯𞤭 𞤬𞤵𞤤𞤩𞤫 𞤶𞤭𞤥𞤯𞤭 𞤤𞤫𞤧 𞤢𞤧𞤢𞤥𞤢𞥄𞤲 𞤳𞤮𞥅𞤣𞤫.𞤑𞤮𞤲𞤮 𞤴𞤮𞤲𞤼𞤢𞥄𞤶𞤭 𞤴𞤮𞥅𞤪𞤮 𞤺𞤮𞥅𞤼𞤮, 𞤥𞤢𞥄𞤴𞤮𞥅𞤶𞤭 𞤯𞤭𞥅 𞤩𞤵𞥅𞤩𞤼𞤭𞥅, 𞤸𞤵𞤯𞤮 𞤩𞤵𞥅𞤩𞤼𞤭𞥅.
Decoded LLaMA Text: <|begin_of_text|>E gila ndeen, daartol suka debbo oo e nagge mum lewru ina yahra e duuɓi 18, ina njuɓɓinee e jimɗi fulɓe jimɗi les asamaan koode.


In [121]:
from transformers import PreTrainedTokenizerFast

# Load your fast tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained("/kaggle/working/extended_llama_adlam_tokenizer")

# Push it to the Hugging Face Hub
tokenizer.push_to_hub("Pullo-Africa-Protagonist/ADLaM-Tokenizer")


tokenizer.json:   0%|          | 0.00/23.6M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Pullo-Africa-Protagonist/ADLaM-Tokenizer/commit/a215376f920ac11c5c5858288303f0556824afa3', commit_message='Upload tokenizer', commit_description='', oid='a215376f920ac11c5c5858288303f0556824afa3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Pullo-Africa-Protagonist/ADLaM-Tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='Pullo-Africa-Protagonist/ADLaM-Tokenizer'), pr_revision=None, pr_num=None)