In [None]:
# Step 1: Verify Tokenizer Compatibility
import os

import torch
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# Sources of the two tokenizers being compared: the local Hindi tokenizer
# directory and the identifier of the original model.
hindi_tokenizer_path = "./hindi_tokenizer"
original_model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load the Hindi tokenizer first, then the original model's tokenizer.
hindi_tokenizer = AutoTokenizer.from_pretrained(hindi_tokenizer_path)
original_tokenizer = AutoTokenizer.from_pretrained(original_model_id)

# Report the vocabulary sizes one per line.
print(
    f"Hindi tokenizer vocab size: {len(hindi_tokenizer)}",
    f"Original tokenizer vocab size: {len(original_tokenizer)}",
    sep="\n",
)

# Report each tokenizer's special-token map under its own heading.
print("Hindi tokenizer special tokens:", hindi_tokenizer.special_tokens_map, sep="\n")
print("Original tokenizer special tokens:", original_tokenizer.special_tokens_map, sep="\n")

# Warn about every special token whose "<name>_id" attribute is None on the
# Hindi tokenizer.
required_special_tokens = ['pad_token', 'bos_token', 'eos_token', 'unk_token', 'mask_token']
for token in required_special_tokens:
    if getattr(hindi_tokenizer, f"{token}_id") is not None:
        continue
    print(f"Warning: Hindi tokenizer is missing {token}")

In [None]:

# Step 2: Check Tokenizer Conversion
# This step is to ensure your conversion from ByteLevelBPETokenizer to PreTrainedTokenizerFast is correct
# Add this to your train_tokenizer.py script after converting to PreTrainedTokenizerFast

def verify_tokenizer_conversion(fast_tokenizer, original_tokenizer):
    """Sanity-check a converted tokenizer against the tokenizer it came from.

    Two checks are run. Problems are reported as printed warnings; nothing is
    raised or returned.

    1. Every entry of ``original_tokenizer.special_tokens_map`` must be present
       in ``fast_tokenizer.special_tokens_map`` under the same name and with
       the same value.
    2. A fixed Hindi sentence must survive an encode/decode round trip.

    Args:
        fast_tokenizer: The converted tokenizer. Must provide
            ``special_tokens_map``, ``encode(text)`` and
            ``decode(ids, skip_special_tokens=...)``.
        original_tokenizer: The tokenizer the conversion started from. Only its
            ``special_tokens_map`` is read.
    """
    converted_specials = fast_tokenizer.special_tokens_map
    for token_name, token in original_tokenizer.special_tokens_map.items():
        if token_name not in converted_specials:
            print(f"Warning: {token_name} is missing in the converted tokenizer")
        elif converted_specials[token_name] != token:
            # Same name but a different token string is also a broken conversion.
            print(
                f"Warning: {token_name} differs in the converted tokenizer "
                f"({converted_specials[token_name]!r} != {token!r})"
            )

    # Test encoding and decoding
    test_text = "नमस्ते, यह एक परीक्षण वाक्य है।"
    encoded = fast_tokenizer.encode(test_text)
    decoded = fast_tokenizer.decode(encoded)
    print(f"Original: {test_text}")
    print(f"Encoded: {encoded}")
    print(f"Decoded: {decoded}")

    # encode() may insert special tokens (e.g. BOS). Compare against a decode
    # that strips them, otherwise a lossless tokenizer is reported as broken.
    round_trip = fast_tokenizer.decode(encoded, skip_special_tokens=True)
    if test_text != round_trip:
        print("Warning: Encoding and decoding are not reversible")

# Use this function after converting your tokenizer: the first argument is the
# converted tokenizer, the second is the tokenizer it was converted from.
# NOTE(review): `fast_tokenizer` and `tokenizer` are not defined in the visible
# cells — presumably they come from train_tokenizer.py; confirm before running.
verify_tokenizer_conversion(fast_tokenizer, tokenizer)

In [None]:

# Step 3: Adjust Model Configuration
# In your run.py, after remapping the model

def adjust_model_config(model, new_tokenizer):
    """Align a model's vocabulary size and special-token ids with a tokenizer.

    The model is modified in place and also returned.

    Args:
        model: Object exposing ``config``, ``resize_token_embeddings(n)`` and,
            optionally, ``generation_config``.
        new_tokenizer: Tokenizer supporting ``len()`` and the attributes
            ``pad_token_id``, ``bos_token_id``, ``eos_token_id``,
            ``sep_token_id`` and ``cls_token_id``.

    Returns:
        The same ``model`` object that was passed in.
    """
    vocab_size = len(new_tokenizer)
    generation_ids = {
        "pad_token_id": new_tokenizer.pad_token_id,
        "bos_token_id": new_tokenizer.bos_token_id,
        "eos_token_id": new_tokenizer.eos_token_id,
    }

    model.config.vocab_size = vocab_size
    for attr_name, token_id in generation_ids.items():
        setattr(model.config, attr_name, token_id)
    model.config.sep_token_id = new_tokenizer.sep_token_id
    model.config.cls_token_id = new_tokenizer.cls_token_id

    # The generation config keeps its own pad/bos/eos ids. Without this sync it
    # would still carry the ids of the tokenizer the model was loaded with.
    generation_config = getattr(model, "generation_config", None)
    if generation_config is not None:
        for attr_name, token_id in generation_ids.items():
            setattr(generation_config, attr_name, token_id)

    model.resize_token_embeddings(vocab_size)
    return model

# Rebind `model` to the adjusted model (adjust_model_config returns the object it was given).
# NOTE(review): `model` and `new_tokenizer` are not defined in the visible cells —
# presumably they come from run.py after remapping; confirm before running.
model = adjust_model_config(model, new_tokenizer)

In [None]:

# Step 4: Debug Token Mapping
# Add this to your run.py after the map_tokens function

def debug_token_mapping(tokenized_possible_translations, source_tokenizer, target_tokenizer, num_samples=10):
    """Print the first entries of a source-token to target-token mapping.

    Args:
        tokenized_possible_translations: Mapping whose keys are source token
            ids and whose values are mappings from target token id to a
            number printed with four decimals.
        source_tokenizer: Object whose ``decode([token_id])`` renders a source
            token id as text.
        target_tokenizer: Object whose ``decode([token_id])`` renders a target
            token id as text.
        num_samples: Maximum number of source tokens to print.
    """
    print("Debugging token mappings:")
    shown = 0
    for source_token, target_tokens in tokenized_possible_translations.items():
        # Stop as soon as the requested number of entries has been printed.
        if shown >= num_samples:
            break
        shown += 1

        source_text = source_tokenizer.decode([source_token])
        print(f"Source token: {source_text}")
        print("Possible translations:")
        for target_token, probability in target_tokens.items():
            target_text = target_tokenizer.decode([target_token])
            print(f"  {target_text}: {probability:.4f}")
        print()

# Print a sample of the mappings; source ids are decoded with a tokenizer loaded from `source_model`.
# NOTE(review): `tokenized_possible_translations`, `source_model` and `new_tokenizer` are not
# defined in the cells above this one — presumably they come from run.py; confirm before running.
debug_token_mapping(tokenized_possible_translations, AutoTokenizer.from_pretrained(source_model), new_tokenizer)

In [None]:

# Step 5: Test Tokenizer Separately
# Create a new file named test_tokenizer.py

from transformers import AutoTokenizer

def test_tokenizer(tokenizer_path):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    
    test_sentences = [
        "नमस्ते, आप कैसे हैं?",
        "मैं हिंदी सीख रहा हूं।",
        "भारत एक विविधतापूर्ण देश है।"
    ]
    
    for sentence in test_sentences:
        encoded = tokenizer.encode(sentence)
        decoded = tokenizer.decode(encoded)
        print(f"Original: {sentence}")
        print(f"Encoded: {encoded}")
        print(f"Decoded: {decoded}")
        print(f"Match: {sentence == decoded}")
        print()

# Entry point for running this snippet as a script (test_tokenizer.py).
# The path is handed to AutoTokenizer.from_pretrained inside test_tokenizer.
if __name__ == "__main__":
    test_tokenizer("./hindi_tokenizer")
    

In [None]:

# Step 6: Gradual Testing
# In run.py, replace the source_model with a smaller model

# Smaller stand-in for the source model, used to test the pipeline gradually.
source_model = "gpt2"  # or "EleutherAI/gpt-neo-125M" for a smaller GPT-Neo model


In [None]:
# Step 7: Test Generation Parameters
# In llama3_test.py

# Modify your llama3_test.py script to test different generation parameters:

def generate_text(model, tokenizer, prompt, params):
    """Generate a continuation of ``prompt`` and return it as decoded text.

    Args:
        model: Object exposing ``device`` and ``generate(input_ids, **kwargs)``.
        tokenizer: Callable tokenizer; ``tokenizer(prompt, return_tensors="pt")``
            must return a mapping with ``"input_ids"`` (and normally
            ``"attention_mask"``). It must also provide
            ``decode(ids, skip_special_tokens=True)``.
        prompt: Text to continue.
        params: Mapping of keyword arguments forwarded to ``model.generate``.
            It is not modified. An ``"attention_mask"`` entry here takes
            precedence over the one produced by the tokenizer.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encoding = tokenizer(prompt, return_tensors="pt")
    input_ids = encoding["input_ids"].to(model.device)

    # Copy so the caller's dict is never mutated.
    generate_kwargs = dict(params)

    # Pass the attention mask explicitly so generate() does not have to infer
    # it from the pad token id.
    attention_mask = encoding.get("attention_mask")
    if attention_mask is not None:
        generate_kwargs.setdefault("attention_mask", attention_mask.to(model.device))

    output = model.generate(input_ids, **generate_kwargs)

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Prompt shared by every run, so only the generation settings differ.
prompt = "नमस्ते, मेरा नाम"

# Generation settings to compare, in the order they are run.
test_params = [
    dict(max_length=50, do_sample=False),  # Greedy decoding
    dict(max_length=50, do_sample=True, temperature=0.7, top_p=0.9),
    dict(max_length=50, do_sample=True, temperature=0.9, top_k=50),
]

# Run each configuration and print its number, settings and generated text,
# followed by a blank line.
for i, params in enumerate(test_params):
    print(f"Test {i+1}:", f"Parameters: {params}", sep="\n")
    generated_text = generate_text(model, tokenizer, prompt, params)
    print(f"Generated text: {generated_text}", end="\n\n")

In [None]:
# Step 8: Check Model Output
# Add this to your llama3_test.py to inspect the raw model output:
def inspect_model_output(model, tokenizer, prompt, top_k=10):
    """Print the highest-scoring next-token candidates for ``prompt``.

    The prompt is encoded, run through the model once with gradients disabled,
    and the logits at the last position of the first sequence are ranked.

    Args:
        model: Callable model exposing ``device``; ``model(input_ids)`` must
            return an object with a ``logits`` tensor indexable as
            ``logits[0, -1, :]``.
        tokenizer: Object providing ``encode(prompt, return_tensors="pt")`` and
            ``decode([token_id])``.
        prompt: Text whose next-token scores are inspected.
        top_k: Number of candidates to print. Values larger than the number of
            logits are clamped to it so small vocabularies do not raise.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model(input_ids)

    logits = output.logits[0, -1, :]

    # torch.topk raises when k exceeds the dimension size, so clamp first.
    k = max(0, min(top_k, logits.shape[-1]))
    top = torch.topk(logits, k=k)

    print(f"Top {k} predicted tokens:")
    for token_id, logit in zip(top.indices.tolist(), top.values.tolist()):
        token = tokenizer.decode([token_id])
        print(f"Token ID: {token_id}, Token: {token}, Logit: {logit:.4f}")

# Use this function in your script to print the top next-token candidates for a Hindi prompt.
# NOTE(review): `model` and `tokenizer` are not defined in the visible cells —
# presumably they come from llama3_test.py; confirm before running.
inspect_model_output(model, tokenizer, "नमस्ते, मेरा नाम")