# WE TEST DIFFERENT TRANSLATION MODELS FROM HERE:
https://huggingface.co/models?pipeline_tag=translation

# INSTALL DEPEDENCIES AND IMPORT LIBRARIES

In [1]:
!pip install -q huggingface_hub datasets transformers sentencepiece accelerate -U

In [2]:
from datasets import load_dataset, DatasetDict, Dataset #,Translation,  Features
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login
import numpy as np
import re
import json
import pandas as pd
from collections import Counter
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# READ DATA

In [3]:
# Load the secret
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")  # Or "HFtoken" if that's the correct key

# Log in to Hugging Face
login(token=hf_token)
print("Successfully logged in to Hugging Face!")

Successfully logged in to Hugging Face!


In [4]:
repo_name = "data354/Koumankan_mt_dyu_fr"
dataset_orig = load_dataset(repo_name)
dataset_orig

DatasetDict({
    train: Dataset({
        features: ['ID', 'translation'],
        num_rows: 8065
    })
    validation: Dataset({
        features: ['ID', 'translation'],
        num_rows: 1471
    })
    test: Dataset({
        features: ['ID', 'translation'],
        num_rows: 1393
    })
})

# FORMAT DATA


In [5]:
## Data preprocessing
src_lang = "dyu"
trg_lang = "fr"
# Define the characters to remove or replace
chars_to_remove_regex = '[!"&\(\),-./:;=?+.\n\[\]«»]'

def remove_special_characters(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters based on regex
    text = re.sub(chars_to_remove_regex, ' ', text)

    # Replace multiple consecutive dots with a single dot
    text = re.sub(r'\.{2,}', '', text)

    # Replace typographic apostrophe with straight apostrophe
    text = text.replace("’", "'")

    # Remove em dashes or other dashes if needed
    text = text.replace('—', '')

    # Replace ellipses with a single dot or handle as needed
    text = text.replace('…', '')

    # Remove extra white spaces (convert multiple spaces to a single space)
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

def clean_text(batch):
    # process source text
    batch['translation'][src_lang] = remove_special_characters(batch['translation'][src_lang])
    # process target text
    batch['translation'][trg_lang] = remove_special_characters(batch['translation'][trg_lang])

    return batch

# Assuming `data` is a DatasetDict or similar structure
dataset_orig = dataset_orig.map(clean_text)
dataset_orig

DatasetDict({
    train: Dataset({
        features: ['ID', 'translation'],
        num_rows: 8065
    })
    validation: Dataset({
        features: ['ID', 'translation'],
        num_rows: 1471
    })
    test: Dataset({
        features: ['ID', 'translation'],
        num_rows: 1393
    })
})

In [6]:
# Load your dataset
dataset = dataset_orig # Replace with your dataset name or path

# Define the path where you want to save the CSV files
train_csv_path = 'train.csv'
validation_csv_path = 'validation.csv'
test_csv_path = 'test.csv'

# Convert each split to a pandas DataFrame and then export to CSV
dataset['train'].to_pandas().to_csv(train_csv_path, index=False)
dataset['validation'].to_pandas().to_csv(validation_csv_path, index=False)
dataset['test'].to_pandas().to_csv(test_csv_path, index=False)

print(f"Data exported to {train_csv_path}, {validation_csv_path}, and {test_csv_path}")

Data exported to train.csv, validation.csv, and test.csv


In [7]:
# Extract the data
train_data = dataset_orig['train'].to_dict()
validation_data = dataset_orig['validation'].to_dict()

# Number of validation samples to move
num_samples_to_move = 1461

# Select the first 1461 samples from validation to move to train
samples_to_add_to_train = {
    'ID': validation_data['ID'][:num_samples_to_move],
    'translation': validation_data['translation'][:num_samples_to_move]
}

# Update the validation data to keep only 10 samples
updated_validation_data = {
    'ID': validation_data['ID'][num_samples_to_move:],
    'translation': validation_data['translation'][num_samples_to_move:]
}

# Add the selected validation samples to the train data
updated_train_data = {
    'ID': train_data['ID'] + samples_to_add_to_train['ID'],
    'translation': train_data['translation'] + samples_to_add_to_train['translation']
}

# Update the validation data to keep only the first 10 samples from the remaining
updated_validation_data = {
    'ID': updated_validation_data['ID'][:10],
    'translation': updated_validation_data['translation'][:10]
}

# Create new Dataset objects
updated_train_dataset = Dataset.from_dict(updated_train_data)
updated_validation_dataset = Dataset.from_dict(updated_validation_data)

# Create the new DatasetDict with the updated splits
updated_dataset_dict = DatasetDict({
    'train': updated_train_dataset,
    'validation': updated_validation_dataset,
    'test': dataset_orig['test']
})

# Print the updated DatasetDict
updated_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['ID', 'translation'],
        num_rows: 9526
    })
    validation: Dataset({
        features: ['ID', 'translation'],
        num_rows: 10
    })
    test: Dataset({
        features: ['ID', 'translation'],
        num_rows: 1393
    })
})

In [8]:
 # Convert each split to a pandas DataFrame
small=updated_dataset_dict
train_df = pd.DataFrame(small['train'])
validation_df = pd.DataFrame(small['validation'])
test_df = pd.DataFrame(small['test'])

# Function to split 'translation' column
def split_translation(df):
    translations = pd.json_normalize(df['translation'])
    df = df.drop(columns=['translation'])
    df = pd.concat([df, translations], axis=1)
    #df = df.drop_duplicates(subset=['dyu', 'fr'])
    return df

# Apply the function to each DataFrame
train_df = split_translation(train_df)
train_df = train_df.drop_duplicates(subset=['dyu', 'fr'])
train_df = train_df.drop_duplicates(subset=['dyu'])

validation_df = split_translation(validation_df)
validation_df = validation_df.drop_duplicates(subset=['dyu', 'fr'])
validation_df = validation_df.drop_duplicates(subset=['dyu'])

test_df = split_translation(test_df)
print(len(test_df))
test_df = test_df.drop_duplicates(subset=['dyu', 'fr'])
test_df = test_df.drop_duplicates(subset=['dyu'])
print(len(test_df))

# Print the DataFrames to verify
print(f"Train DataFrame whose length is {len(train_df)}:")
display(train_df.head(2))

print(f"\nValidation DataFrame whose length is {len(validation_df)}:")
display(validation_df.head(2))

print(f"\nTest DataFrame whose length is {len(test_df)}:")
display(test_df.head(2))
# Combine the 'dyu' columns from train_df, test_df, and validation into a single DataFrame
dyu_text = pd.concat([train_df['dyu'], test_df['dyu'], validation_df['dyu']], ignore_index=True)

# Combine the 'fr' columns from train_df,  and validation into a single DataFrame
fr_text = pd.concat([train_df['fr'], validation_df['fr']], ignore_index=True)

# Combine the 'fr' & dyu columns from train_df, and validation into a single DataFrame
smpl=pd.concat([validation_df,train_df])
smpl

1393
1392
Train DataFrame whose length is 9433:


Unnamed: 0,ID,dyu,fr
0,ID_18897661270129,a bi ji min na,il boit de l'eau
1,ID_18479132727846,a le dalakolontɛ lon bɛ,il se plaint toujours



Validation DataFrame whose length is 10:


Unnamed: 0,ID,dyu,fr
0,ID_19561563644823,nou nou ti na na tougou,la mer de behring
1,ID_17379610645803,madam kalvɛz ma,à madame calvez



Test DataFrame whose length is 1392:


Unnamed: 0,ID,dyu,fr
0,ID_17345911362699,an kelen duron le tun be yi,0
1,ID_173626847.3381,o ka papiye farana,0


Unnamed: 0,ID,dyu,fr
0,ID_19561563644823,nou nou ti na na tougou,la mer de behring
1,ID_17379610645803,madam kalvɛz ma,à madame calvez
2,ID_17730914646773,duguma misaliya do la,tout bas par exemple
3,ID_18407182647706,a tun be se ka lɔgɔkun kelen kɛ a te na,il s'absentait parfois une semaine entière
4,ID_17445356651816,a bé waga joli le sɔrɔ la,combien d'argent gagnez vous
...,...,...,...
9521,ID_17352313636693,ne sera kalan nan,je suis en train de lire
9522,ID_18824297638014,belébele wa fiman,le grand ou le petit
9523,ID_18342329640632,bara kaɲi ka kɛra,il faut y travailler
9524,ID_18064988641972,an wa denbaya ye denbayadeni ye,notre famille est une petite famille


# UNDERSTAND MODEL

In [9]:
tasks=['text-classification','translation']
def generate_hg_model_links(base_link, start_page, end_page, sort_key="&sort=trending"):
    links = []
    for page in range(start_page, end_page + 1):
        if page > 1:
            url = f"{base_link}&p={page}{sort_key}"
        else:
            url = f"{base_link}{sort_key}"
        links.append(url)
    return links

# Example usage:
base_hg_model_link = "https://huggingface.co/models?pipeline_tag=translation"
hg_model_start = 1
hg_model_end =2

# Call the function to get the list of URLs
links = generate_hg_model_links(base_hg_model_link, hg_model_start, hg_model_end)

# Print or use the links as needed
for link in links:
    print(link)

https://huggingface.co/models?pipeline_tag=translation&sort=trending
https://huggingface.co/models?pipeline_tag=translation&p=2&sort=trending


In [10]:
def fetch_model_names(links):
    model_names = []
    for link in links:
        response = requests.get(link)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Loop through the 30 article elements
        for article_num in range(1, 31):
            # Use a CSS selector to find the h4 tag within each article
            selector = f"section:nth-of-type(2) div:nth-of-type(3) article:nth-of-type({article_num}) a div header"
            model_tag = soup.select_one(selector)

            if model_tag:
                model_name = model_tag.text.strip()
                model_names.append(model_name)


    return model_names

# Example usage
models = fetch_model_names(links)
print("Total models:",len(models),"\nTop 5 Models:",models[:5])

Total models: 60 
Top 5 Models: ['ByteDance-Seed/Seed-X-PPO-7B', 'ByteDance-Seed/Seed-X-Instruct-7B', 'facebook/nllb-200-distilled-600M', 'sarvamai/sarvam-translate', 'pfnet/plamo-2-translate']


In [11]:
def msize(m):
    return sum(p.numel() for p in m.parameters())

def word_tokenize(text):
    return re.findall(r'(\w+|[^\w\s])', text)

In [12]:
def get_model_metrics(model):
    """
    Extract various metrics from the model, handling different architectures gracefully.
    """
    model_metrics = {}

    try:
        model_metrics['total_parameters'] = msize(model)

        if hasattr(model, 'model') and hasattr(model.model, 'shared'):
            model_metrics['shared_embedding_parameters'] = msize(model.model.shared)
        elif hasattr(model, 'shared'):
            model_metrics['shared_embedding_parameters'] = msize(model.shared)
        else:
            model_metrics['shared_embedding_parameters'] = None

        if hasattr(model, 'model') and hasattr(model.model, 'encoder'):
            model_metrics['encoder_parameters'] = msize(model.model.encoder)
        elif hasattr(model, 'encoder'):
            model_metrics['encoder_parameters'] = msize(model.encoder)
        else:
            model_metrics['encoder_parameters'] = None

        if hasattr(model, 'model') and hasattr(model.model.encoder, 'layers'):
            model_metrics['encoder_layers_parameters'] = msize(model.model.encoder.layers)
        elif hasattr(model, 'encoder') and hasattr(model.encoder, 'layers'):
            model_metrics['encoder_layers_parameters'] = msize(model.encoder.layers)
        else:
            model_metrics['encoder_layers_parameters'] = None

        if hasattr(model, 'model') and hasattr(model.model, 'decoder'):
            model_metrics['decoder_parameters'] = msize(model.model.decoder)
        elif hasattr(model, 'decoder'):
            model_metrics['decoder_parameters'] = msize(model.decoder)
        else:
            model_metrics['decoder_parameters'] = None

        if hasattr(model, 'model') and hasattr(model.model.decoder, 'layers'):
            model_metrics['decoder_layers_parameters'] = msize(model.model.decoder.layers)
        elif hasattr(model, 'decoder') and hasattr(model.decoder, 'layers'):
            model_metrics['decoder_layers_parameters'] = msize(model.decoder.layers)
        else:
            model_metrics['decoder_layers_parameters'] = None

        if hasattr(model, 'lm_head'):
            model_metrics['lm_head_parameters'] = msize(model.lm_head)
        else:
            model_metrics['lm_head_parameters'] = None

    except Exception as e:
        model_metrics['error'] = str(e)

    return model_metrics

In [13]:
import concurrent.futures
import time
import logging
import multiprocessing
import time

def load_model(model_name, return_dict):
    """Function to load the model within a separate process."""
    try:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name,trust_remote_code=True)
        return_dict['model'] = model
    except Exception as e:
        return_dict['error'] = str(e)

def load_model_with_timeout(model_name, timeout=30):
    """Load a model with a timeout using multiprocessing."""
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    p = multiprocessing.Process(target=load_model, args=(model_name, return_dict))
    p.start()
    p.join(timeout)

    if p.is_alive():
        p.terminate()
        p.join()
        return None, f"Loading the model exceeded {timeout} seconds."

    if 'error' in return_dict:
        return None, return_dict['error']

    return return_dict.get('model'), None

def analyze_models(models, dyu_text, fr_text, smpl):
    metrics = {}
    for model_name in models:
        try:
            # Load tokenizer
            tokenizer = AutoTokenizer.from_pretrained(model_name)

            # Load model with a timeout
            model, error = load_model_with_timeout(model_name, timeout=30)

            if error:
                metrics[model_name] = {'error': error}
                continue  # Skip further processing for this model

            # Calculate model metrics
            model_metrics = get_model_metrics(model)

            # Calculate proportions
            if model_metrics.get('total_parameters') > 0:
                model_metrics['shared_embedding_proportion'] = (
                    model_metrics.get('shared_embedding_parameters', 0) / model_metrics['total_parameters']
                ) if model_metrics.get('shared_embedding_parameters') else None
                model_metrics['lm_head_proportion'] = (
                    model_metrics.get('lm_head_parameters', 0) / model_metrics['total_parameters']
                ) if model_metrics.get('lm_head_parameters') else None

            # Tokenizer metrics
            cnt_dyu = Counter()
            cnt_fr = Counter()

            for text in tqdm(dyu_text):
                cnt_dyu.update(tokenizer.encode(text))

            for text in tqdm(fr_text):
                cnt_fr.update(tokenizer.encode(text))

            tokenizer_metrics = {
                'vocab_size': tokenizer.vocab_size,
                'dyula_tokens_count': len(cnt_dyu),
                'dyula_tokens_proportion': len(cnt_dyu) / tokenizer.vocab_size,
                'french_tokens_count': len(cnt_fr),
                'french_tokens_proportion': len(cnt_fr) / tokenizer.vocab_size,
                'common_tokens_count': len(set(cnt_dyu.keys()).intersection(set(cnt_fr.keys()))),
                'dyula_common_tokens_proportion': len(set(cnt_dyu.keys()).intersection(set(cnt_fr.keys()))) / len(cnt_dyu)
            }

            # Process the dataframe for token stats
            smpl['dyula_toks'] = smpl.dyu.apply(tokenizer.tokenize)
            smpl['french_toks'] = smpl.fr.apply(tokenizer.tokenize)
            smpl['dyula_words'] = smpl.dyu.apply(word_tokenize)
            smpl['french_words'] = smpl.fr.apply(word_tokenize)
            stats = smpl[['dyula_toks', 'french_toks', 'dyula_words', 'french_words']].applymap(len).describe()

            token_stats = {
                'french_toks_to_french_words_ratio': stats.french_toks['mean'] / stats.french_words['mean'],
                'dyula_toks_to_dyula_words_ratio': stats.dyula_toks['mean'] / stats.dyula_words['mean']
            }

            # Combined text processing for characters
            combined_set = set(dyu_text) | set(smpl.dyu) | set(smpl.fr)
            all_texts = list(combined_set)
            chars_cnt = Counter(c for t in all_texts for c in t)
            required_chars = ''.join([k for k, v in chars_cnt.most_common() if v >= 3 and k not in ' '])

            texts_with_unk_dyu = [text for text in tqdm(smpl.dyu) if tokenizer.unk_token_id in tokenizer(text).input_ids]
            texts_with_unk_fr = [text for text in tqdm(smpl.fr) if tokenizer.unk_token_id in tokenizer(text).input_ids]

            # Aggregate all metrics
            metrics[model_name] = {
                'model_metrics': model_metrics,
                'tokenizer_metrics': tokenizer_metrics,
                'token_stats': token_stats,
                'texts_with_unk_dyu': len(texts_with_unk_dyu),
                'texts_with_unk_fr': len(texts_with_unk_fr),
                'required_chars': required_chars
            }

        except Exception as e:
            metrics[model_name] = {'error': str(e)}

    # Write metrics to a JSON file
    with open('model_metrics.json', 'w') as f:
        json.dump(metrics, f, indent=4)

    return metrics

# Example usage
analyze_models(models, dyu_text, fr_text, smpl)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.
2025-07-24 12:23:37.343103: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-24 12:23:37.343179: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-24 12:23:37

The repository pfnet/plamo-2-translate contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/pfnet/plamo-2-translate .
 You can inspect the repository content at https://hf.co/pfnet/plamo-2-translate.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  Y


tokenization_plamo.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/pfnet/plamo-2-translate:
- tokenization_plamo.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.jsonl:   0%|          | 0.00/10.6M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/pfnet/plamo-2-translate:
- modeling_plamo.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
  module_spec.loader.exec_module(module)
  module_spec.loader.exec_module(module)
2025-07-24 12:24:23.130623: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-24 12:24:23.130687: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-24 12:24:23.132628: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

2025-07-24 12:24:28.737408: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-24 12:24:28.737472: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-24 12:24:28.739351: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-24 12:24:34.451851: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-24 12:24:34.451913: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor

  0%|          | 0/10835 [00:00<?, ?it/s]

  0%|          | 0/9443 [00:00<?, ?it/s]

  stats = smpl[['dyula_toks', 'french_toks', 'dyula_words', 'french_words']].applymap(len).describe()


  0%|          | 0/9443 [00:00<?, ?it/s]

  0%|          | 0/9443 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

  self.pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/830 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.43M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.6M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/816k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/848k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/796k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/602k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/450k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/842k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/299 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/355 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/820k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/337 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/833k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/10835 [00:00<?, ?it/s]

  0%|          | 0/9443 [00:00<?, ?it/s]

  stats = smpl[['dyula_toks', 'french_toks', 'dyula_words', 'french_words']].applymap(len).describe()


  0%|          | 0/9443 [00:00<?, ?it/s]

  0%|          | 0/9443 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

  self.pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/782k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/516 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json: 0.00B [00:00, ?B/s]

The repository ai4bharat/indictrans2-en-indic-dist-200M contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/ai4bharat/indictrans2-en-indic-dist-200M .
 You can inspect the repository content at https://hf.co/ai4bharat/indictrans2-en-indic-dist-200M.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  N


tokenizer_config.json:   0%|          | 0.00/830 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.43M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.6M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
A new version of the following files was downloaded from https://huggingface.co/ltg/nort5-large-en-no-translation:
- configuration_nort5.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_nort5.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ltg/nort5-large-en-no-translation:
- modeling_nort5.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/3.51G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/796 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS

  0%|          | 0/10835 [00:00<?, ?it/s]

  0%|          | 0/9443 [00:00<?, ?it/s]

  stats = smpl[['dyula_toks', 'french_toks', 'dyula_words', 'french_words']].applymap(len).describe()


  0%|          | 0/9443 [00:00<?, ?it/s]

  0%|          | 0/9443 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/831k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/841k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

  self.pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/842k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/842k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/832k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/808k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/784k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/819k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/997k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/851k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/903k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/829k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/898k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/312k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/304k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/923k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/831k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/918k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/824k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/899k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/896k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/899k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/824k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/897k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/925k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/897k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/847k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/810k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/682k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/856k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/684k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/682k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/687k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/835k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/686k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/824k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/678k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/781k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/470k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/451k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/10835 [00:00<?, ?it/s]

  0%|          | 0/9443 [00:00<?, ?it/s]

  stats = smpl[['dyula_toks', 'french_toks', 'dyula_words', 'french_words']].applymap(len).describe()


  0%|          | 0/9443 [00:00<?, ?it/s]

  0%|          | 0/9443 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/308k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/307k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

  self.pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/309k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/306k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/835k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/830k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/868k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/830k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/821k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/830k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/850k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/830k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/887k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'ByteDance-Seed/Seed-X-PPO-7B': {'error': "Unrecognized configuration class <class 'transformers.models.mistral.configuration_mistral.MistralConfig'> for this kind of AutoModel: AutoModelForSeq2SeqLM.\nModel type should be one of BartConfig, BigBirdPegasusConfig, BlenderbotConfig, BlenderbotSmallConfig, EncoderDecoderConfig, FSMTConfig, GPTSanJapaneseConfig, GraniteSpeechConfig, LEDConfig, LongT5Config, M2M100Config, MarianConfig, MBartConfig, MT5Config, MvpConfig, NllbMoeConfig, PegasusConfig, PegasusXConfig, PLBartConfig, ProphetNetConfig, Qwen2AudioConfig, SeamlessM4TConfig, SeamlessM4Tv2Config, SwitchTransformersConfig, T5Config, T5GemmaConfig, UMT5Config, XLMProphetNetConfig."},
 'ByteDance-Seed/Seed-X-Instruct-7B': {'error': "Unrecognized configuration class <class 'transformers.models.mistral.configuration_mistral.MistralConfig'> for this kind of AutoModel: AutoModelForSeq2SeqLM.\nModel type should be one of BartConfig, BigBirdPegasusConfig, BlenderbotConfig, BlenderbotSmallCon

In [14]:
import concurrent.futures
import time

def load_model_with_timeout(model_name, timeout=30):
    """Load a model with a timeout."""
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future = executor.submit(AutoModelForSeq2SeqLM.from_pretrained, model_name)
        try:
            model = future.result(timeout=timeout)
            return model, None  # Return the model and no error
        except concurrent.futures.TimeoutError:
            logging.error(f"Loading the model exceeded {timeout} seconds.")
            future.cancel()  # Attempt to cancel the future
            return None, f"Loading the model exceeded {timeout} seconds."
        except Exception as e:
            logging.error(f"An error occurred: {e}")
            return None, str(e)  # Return no model and the error


def analyze_models(models, dyu_text, fr_text, smpl):
    metrics = {}
    for model_name in models:
        #try:
            # Load tokenizer
            tokenizer = AutoTokenizer.from_pretrained(model_name)

            # Load model with a timeout
            model, error = load_model_with_timeout(model_name, timeout=30)

            if error:
                metrics[model_name] = {'error': error}
                continue  # Skip further processing for this model

            # Calculate model metrics
            model_metrics = get_model_metrics(model)

            pass

In [15]:
with open('model_metrics.json', 'r') as f:
    data = json.load(f)
def flatten_metrics(metrics):
    flat_metrics = {}

    for model, values in metrics.items():
        if 'error' in values:
            flat_metrics[model] = {
                'total_parameters': None,
                'vocab_size': None,
                'dyula_tokens_count': None,
                'dyula_tokens_proportion': None,
                'french_tokens_count': None,
                'french_tokens_proportion': None,
                'common_tokens_count': None,
                'dyula_common_tokens_proportion': None,
                'french_toks_to_french_words_ratio': None,
                'dyula_toks_to_dyula_words_ratio': None,
                'texts_with_unk_dyu': None,
                'texts_with_unk_fr': None,
                'shared_embedding_parameters': None,
                'encoder_parameters': None,
                'encoder_layers_parameters': None,
                'decoder_parameters': None,
                'decoder_layers_parameters': None,
                'lm_head_parameters': None,
                'shared_embedding_proportion': None,
                'lm_head_proportion': None,
                'required_chars': None,
                'error': values['error']
            }
        else:
            model_data = values
            model_metrics = model_data.get('model_metrics', {})
            tokenizer_metrics = model_data.get('tokenizer_metrics', {})
            token_stats = model_data.get('token_stats', {})

            flat_metrics[model] = {
                'total_parameters': model_metrics.get('total_parameters'),
                'vocab_size': tokenizer_metrics.get('vocab_size'),
                'dyula_tokens_count': tokenizer_metrics.get('dyula_tokens_count'),
                'dyula_tokens_proportion': tokenizer_metrics.get('dyula_tokens_proportion'),
                'french_tokens_count': tokenizer_metrics.get('french_tokens_count'),
                'french_tokens_proportion': tokenizer_metrics.get('french_tokens_proportion'),
                'common_tokens_count': tokenizer_metrics.get('common_tokens_count'),
                'dyula_common_tokens_proportion': tokenizer_metrics.get('dyula_common_tokens_proportion'),
                'french_toks_to_french_words_ratio': token_stats.get('french_toks_to_french_words_ratio'),
                'dyula_toks_to_dyula_words_ratio': token_stats.get('dyula_toks_to_dyula_words_ratio'),
                'texts_with_unk_dyu': model_data.get('texts_with_unk_dyu'),
                'texts_with_unk_fr': model_data.get('texts_with_unk_fr'),
                 'shared_embedding_parameters': model_metrics.get('shared_embedding_parameters'),
                'encoder_parameters': model_metrics.get('encoder_parameters'),
                'encoder_layers_parameters': model_metrics.get('encoder_layers_parameters'),
                'decoder_parameters': model_metrics.get('decoder_parameters'),
                'decoder_layers_parameters': model_metrics.get('decoder_layers_parameters'),
                'lm_head_parameters': model_metrics.get('lm_head_parameters'),
                'shared_embedding_proportion': model_metrics.get('shared_embedding_proportion'),
                'lm_head_proportion': model_metrics.get('lm_head_proportion'),
                'required_chars': model_data.get('required_chars'),
                'error': None
            }

    return pd.DataFrame.from_dict(flat_metrics, orient='index')

# Convert JSON data to DataFrame
df = flatten_metrics(data)
df.sort_values(by=['texts_with_unk_dyu', 'texts_with_unk_fr','total_parameters', "dyula_tokens_proportion","french_tokens_proportion",
                   'dyula_toks_to_dyula_words_ratio', 'french_toks_to_french_words_ratio', 'vocab_size',],
               ascending=[True, True, True, False, False, True, True, True], inplace=True)

# Save DataFrame to a CSV file (optional)
df.to_csv('model_metrics.csv', index_label='model_name')

# Display DataFrame
df.head()

Unnamed: 0,total_parameters,vocab_size,dyula_tokens_count,dyula_tokens_proportion,french_tokens_count,french_tokens_proportion,common_tokens_count,dyula_common_tokens_proportion,french_toks_to_french_words_ratio,dyula_toks_to_dyula_words_ratio,...,shared_embedding_parameters,encoder_parameters,encoder_layers_parameters,decoder_parameters,decoder_layers_parameters,lm_head_parameters,shared_embedding_proportion,lm_head_proportion,required_chars,error
iryneko571/mt5-small-translation-ja_zh,300176768.0,250100.0,5225.0,0.020892,7394.0,0.029564,2074.0,0.396938,1.552361,1.833909,...,128057344.0,146940608.0,,153236160.0,,128057344.0,0.426606,0.426606,aenisrlotumdkbcɛɔgypfév'jwhèqzɲàxôêâçû́œîù̀̂ëï...,
Helsinki-NLP/opus-mt-az-en,56586240.0,23288.0,600.0,0.025764,539.0,0.023145,467.0,0.778333,2.890835,2.391959,...,11923456.0,31099904.0,18914304.0,37409792.0,25224192.0,11923456.0,0.210713,0.210713,aenisrlotumdkbcɛɔgypfév'jwhèqzɲàxôêâçû́œîù̀̂ëï...,
google-t5/t5-small,60506624.0,32100.0,2012.0,0.062679,5057.0,0.157539,1544.0,0.767396,1.663242,2.785001,...,16449536.0,35330816.0,,41625344.0,,16449536.0,0.271863,0.271863,aenisrlotumdkbcɛɔgypfév'jwhèqzɲàxôêâçû́œîù̀̂ëï...,
Helsinki-NLP/opus-mt-tc-big-tr-en,236883968.0,57060.0,1882.0,0.032983,1936.0,0.033929,1381.0,0.733794,2.191444,2.03335,...,58429440.0,135055360.0,75577344.0,160258048.0,100780032.0,58429440.0,0.246658,0.246658,aenisrlotumdkbcɛɔgypfév'jwhèqzɲàxôêâçû́œîù̀̂ëï...,
ByteDance-Seed/Seed-X-PPO-7B,,,,,,,,,,,...,,,,,,,,,,Unrecognized configuration class <class 'trans...


In [16]:
from huggingface_hub import HfApi

api = HfApi()
model_info = api.model_info("google-t5/t5-3b")  # Replace "model_name" with the actual model name
print(model_info)
model_size = model_info.get('google-t5/t5-3b', None)

print(f"Model size: {model_size}")

ModelInfo(id='google-t5/t5-3b', author='google-t5', sha='bed96aab9ee46012a5046386105ee5fd0ac572f0', created_at=datetime.datetime(2022, 3, 2, 23, 29, 4, tzinfo=datetime.timezone.utc), last_modified=datetime.datetime(2024, 1, 29, 15, 44, 49, tzinfo=datetime.timezone.utc), private=False, disabled=False, downloads=1049888, downloads_all_time=None, gated=False, gguf=None, inference=None, inference_provider_mapping=None, likes=46, library_name='transformers', tags=['transformers', 'pytorch', 'tf', 'safetensors', 't5', 'text-generation', 'summarization', 'translation', 'en', 'fr', 'ro', 'de', 'multilingual', 'dataset:c4', 'arxiv:1805.12471', 'arxiv:1708.00055', 'arxiv:1704.05426', 'arxiv:1606.05250', 'arxiv:1808.09121', 'arxiv:1810.12885', 'arxiv:1905.10044', 'arxiv:1910.09700', 'license:apache-2.0', 'autotrain_compatible', 'text-generation-inference', 'endpoints_compatible', 'region:us'], pipeline_tag='translation', mask_token=None, card_data={'base_model': None, 'datasets': ['c4'], 'eval_re

AttributeError: 'ModelInfo' object has no attribute 'get'

In [None]:
import requests

def get_model_size(model_name):
    api_url = f"https://huggingface.co/api/models/{model_name}"
    response = requests.get(api_url)
    if response.status_code == 200:
        model_info = response.json()
        model_size = model_info.get('model_size', None)
        if model_size:
            return model_size / 1e6  # Convert to millions
        else:
            print(f"Model size information not available for {model_name}")
            return None
    else:
        print(f"Failed to fetch model info for {model_name}")
        return None

# Example usage
model_name = "google-t5/t5-3b"
model_size = get_model_size(model_name)
if model_size and model_size > 100:
    print(f"Skipping {model_name} due to size: {model_size:.2f}M parameters")
else:
    print(f"Model {model_name} size: {model_size:.2f}M parameters")

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def analyze_models(models, dyu_text, fr_text, smpl):
    metrics = {}
    for model_name in models[:5]:  # [:2]
        try:
            # Check model size before loading
            model_info = AutoModelForSeq2SeqLM.from_pretrained(model_name, return_dict=True)
            model_size = model_info.num_parameters() / 1e6  # Convert to millions
            if model_size > 100:  # Set your size threshold here
                print(f"Skipping {model_name} due to size: {model_size:.2f}M parameters")
                continue

            # Load tokenizer and model
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

            # Calculate model metrics
            model_metrics = get_model_metrics(model)
            metrics[model_name] = model_metrics
        except Exception as e:
            print(f"Error loading {model_name}: {e}")
    return metrics


In [None]:
models

In [None]:
from huggingface_hub import HfApi

def get_model_size_from_hf(model_name):
    api = HfApi()
    model_info = api.model_info(model_name)
    #print(model_info)

    # Check if safetensors is not None
    if model_info.safetensors is not None:
        safetensors_info = model_info.safetensors.parameters
        total_size_bytes = safetensors_info.get('F32', 0)
    else:
        total_size_bytes = 0

    # Convert bytes to megabytes (MB)
    total_size_mb = total_size_bytes / (1024 ** 2)
    # Convert bytes to gigabytes (GB)
    total_size_gb = total_size_bytes / (1024 ** 3)

    return {
        "size_in_mb": f"{total_size_mb:.2f} MB",
        "size_in_gb": f"{total_size_gb:.2f} GB"
    }

# Example usage
final_models = []
for model in models:
    model_size = get_model_size_from_hf(model)
    print(f"{model}: Model size: {model_size['size_in_mb']}")
    #print(f"{model}: Model size: {model_size['size_in_gb']}")

    # Convert the size in GB to a float for comparison
    size_in_gb = float(model_size['size_in_gb'].split()[0])

    if size_in_gb < 3:
        print(f"Skipping {model} due to size: {model_size['size_in_gb']}")
        final_models.append(model)
    else:
        print(f"Model {model} size: {model_size['size_in_gb']}")
print(len(final_models))
print(final_models)

In [None]:
model_info = AutoModelForSeq2SeqLM.from_pretrained('lachkarsalim/LatinDarija_English-v1', return_dict=True)
model_size = model_info.num_parameters() / 1e6