NLLB

In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load model and tokenizer
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def translate(source_text, source_lang, target_lang):

    # Kinyarwanda Tokenizer
    tokenizer.src_lang = source_lang

    # Tokenize the input text
    inputs = tokenizer(source_text, return_tensors="pt")

    # Generate the translation according to target language specified
    translated_tokens = model.generate(
        **inputs, forced_bos_token_id=tokenizer.lang_code_to_id[target_lang], max_length=30
    )

    # Decode the translated tokens for translated text
    translated_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    return translated_text

In [3]:
translate("Didn't get here being careful", "en", "zho_Hans")

the `lang_code_to_id` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder` this attribute will be removed in `transformers` v4.38


'不过,我没有到这里来,要小心.'

MQM Score

In [2]:
from huggingface_hub import snapshot_download
# pip install unbabel-comet is the one for the below line, otherwise it's pip install comet or pip install comet-ml I don't know
from comet import download_model, load_from_checkpoint
# Download the model
token = "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"
mqm_model_path = snapshot_download(repo_id="Unbabel/wmt23-cometkiwi-da-xl", use_auth_token=token)



# Load the model from the checkpoint
mqm_model = load_from_checkpoint(mqm_model_path + '\checkpoints\model.ckpt')


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]



In [10]:
data = [{
    "src": "Didn't get here being careful",
    "mt":  "不过,我没有到这里来,要小心.",
    }]

mqm_model.predict(data, batch_size=1, gpus=1)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 1/1 [00:08<00:00,  8.22s/it]


Prediction([('scores', [0.15622669458389282]),
            ('system_score', 0.15622669458389282)])

Fine-Grained Error Detection

In [2]:
from comet import download_model, load_from_checkpoint


ImportError: cannot import name 'download_model' from 'comet' (c:\Users\mekae\Desktop\CS\ML-DL\Projects\Lee Lab\Error-in-Translations\error-in-translations\Lib\site-packages\comet\__init__.py)

In [12]:
from comet import download_model, load_from_checkpoint

FGED_model_path = download_model("Unbabel/XCOMET-XL")
FGED_model = load_from_checkpoint(FGED_model_path)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Encoder model frozen.
C:\Users\mekae\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\pytorch_lightning\core\saving.py:177: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [19]:
data = [{
    # "src": "Didn't get here being careful",
    # "mt":  "不过,我没有到这里来,要小心.",
    "src": "Students from Stanford University Medical School announced Monday the invention of a new diagnostic tool that can sort cells by type of small printed chip",
    "mt":  "史丹佛大學醫學院的學生週一宣布發明了一種新的診斷工具，可以根據小型印刷晶片的類型對細胞進行分類",
    }]

FGED_model_output = FGED_model.predict(data, batch_size=1, gpus=1)
# Segment-level scores
print (FGED_model_output.scores)

# System-level score
print (FGED_model_output.system_score)

# Score explanation (error spans)
print (FGED_model_output.metadata.error_spans)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 1/1 [00:19<00:00, 19.62s/it]


[0.9770883321762085]
0.9770883321762085
[[{'text': '小型印刷晶片', 'confidence': 0.37087565660476685, 'severity': 'minor', 'start': 31, 'end': 37}]]


In [18]:
import json

with open('sample_error_span.json', encoding='utf-8') as f:
        data = json.load(f)

data

{'errors': [{'original_text': 'Учените',
   'translated_text': 'Students',
   'correct_text': 'Scientists',
   'start_index_orig': 0,
   'end_index_orig': 7,
   'start_index_translation': 0,
   'end_index_translation': 7,
   'error_type': 'Incorrect Subject'},
  {'original_text': 'който може да сортира клетките по тип: малък печатен чип',
   'translated_text': '',
   'correct_text': 'that can sort cells by type: small printed chip',
   'start_index_orig': 75,
   'end_index_orig': 131,
   'start_index_translation': 86,
   'end_index_translation': 108,
   'error_type': 'Omission'},
  {'original_text': 'изобретяването на нов диагностичен инструмент',
   'translated_text': 'the invention of a new diagnostic tool that can sort cells by a type of small printed',
   'correct_text': 'the invention of a new diagnostic tool',
   'start_index_orig': 43,
   'end_index_orig': 75,
   'start_index_translation': 51,
   'end_index_translation': 108,
   'error_type': 'Incomplete Sentence'}]}

Error Span Scoring

Highlighting

In [39]:
translation = "Students from Stanford University Medical School announced Monday the invention of a new diagnostic tool that can sort cells by type of small printed chip"

spans = [
{
    "errors": [
    {
        "original_text": "Учените",
        "translated_text": "Students",
        "correct_text": "Scientists",
        "start_index_orig": 0,
        "end_index_orig": 7,
        "start_index_translation": 0,
        "end_index_translation": 7,
        "error_type": "Incorrect Subject"
    },
    {
        "original_text": "изобретяването на нов диагностичен инструмент",
        "translated_text": "the invention of a new diagnostic tool that can sort cells by a type of small printed",
        "correct_text": "the invention of a new diagnostic tool",
        "start_index_orig": 43,
        "end_index_orig": 75,
        "start_index_translation": 51,
        "end_index_translation": 108,
        "error_type": "Incomplete Sentence"
    },
    # {
    #     "original_text": "който може да сортира клетките по тип: малък печатен чип",
    #     "translated_text": "",
    #     "correct_text": "that can sort cells by type: small printed chip",
    #     "start_index_orig": 75,
    #     "end_index_orig": 131,
    #     "start_index_translation": 52,
    #     "end_index_translation": 106,
    #     "error_type": "Omission"
    # },
    {
        "original_text": "който може да сортира клетките по тип: малък печатен чип",
        "translated_text": "",
        "correct_text": "that can sort cells by type: small printed chip",
        "start_index_orig": 75,
        "end_index_orig": 131,
        "start_index_translation": 86,
        "end_index_translation": 108,
        "error_type": "Omission"
    }
    ]
}
]

In [40]:
final = "<span>" + translation + "</span>"
final = translation
start = 51
end = 108

colors = {
    "Incorrect Subject": "#00A0F0",
    "Omission": "#59c00aba",
    "Incomplete Sentence": "#D3365A",
  }

# zIndex = len(spans[0]["errors"])
zIndex = 0
offset = 0

for error in spans[0]["errors"]:
    start = error["start_index_translation"]
    end = error["end_index_translation"]
    color = colors[error["error_type"]]



    Ltag = "<span class='highlight' style='background-color: " + color + "; padding: " + str(zIndex) + "vh 0vw " + str(zIndex) + "vh 0vw; zIndex: " + str(zIndex) + "'>"
    Rtag = "</span>"
    
    # Must go left to right if we use this ordering of offset

    # split = final.split(translation[start + offset:end + offset],1)
    # final = split[0] + Ltag + translation[start:end] + Rtag + split[1]
    
    # split = final.split("")


    final = final[:start + offset] + Ltag + final[start + offset:end + offset] + Rtag + final[end + offset:]

    offset += len(Ltag) + len(Rtag)

    zIndex += 1

    print("<span>" + final + "</span>")


<span><span class='highlight' style='background-color: #00A0F0; padding: 0vh 0vw 0vh 0vw; zIndex: 0'>Student</span>s from Stanford University Medical School announced Monday the invention of a new diagnostic tool that can sort cells by type of small printed chip</span>
<span><span class='highlight' style='background-color: #00A0F0; padding: 0vh 0vw 0vh 0vw; zIndex: 0'>Student</span>s from Stanford University Medical School an<span class='highlight' style='background-color: #D3365A; padding: 1vh 0vw 1vh 0vw; zIndex: 1'>nounced Monday the invention of a new diagnostic tool tha</span>t can sort cells by type of small printed chip</span>
<span><span class='highlight' style='background-color: #00A0F0; padding: 0vh 0vw 0vh 0vw; zIndex: 0'>Student</span>s from Stanford University Medical School an<span class='highlight' style='background-color: #D3365A; padding: 1vh 0vw 1vh 0vw; zIndex: 1'>nounced Monday the invention of a new diag<span class='highlight' style='background-color: #59c00aba; pa

In [37]:
translation[2:7]

t = translation[:4] + "AHHHH" + translation[4:]
t
offset = len("AHHHH")
translation[2:7]
t[2 + offset : 7 + offset]

'HHent'

In [45]:

def errorSpanHighlighter(translation, spans, color_mappings):
    final = translation

    zIndex = 0
    offset = 0

    for error in spans[0]["errors"]:
        # init
        start = error["start_index_translation"]
        end = error["end_index_translation"]
        color = color_mappings[error["error_type"]]

        # tags
        Ltag = "<span class='highlight' style='background-color: " + color + "; padding: " + str(zIndex) + "vh 0vw " + str(zIndex) + "vh 0vw; zIndex: " + str(zIndex) + "'>"
        Rtag = "</span>"
        
        # Algo
        # Must go left to right if we use this ordering of offset
        final = final[:start + offset] + Ltag + final[start + offset:end + offset] + Rtag + final[end + offset:]
        offset += len(Ltag) + len(Rtag)
        zIndex += 1

    return "<span>" + final + "</span>"



colors = {
    "Incorrect Subject": "#00A0F0",
    "Omission": "#59c00aba",
    "Incomplete Sentence": "#D3365A",
}

errorSpanHighlighter(translation, spans, colors)

"<span><span class='highlight' style='background-color: #00A0F0; padding: 0vh 0vw 0vh 0vw; zIndex: 0'>Student</span>s from Stanford University Medical School an<span class='highlight' style='background-color: #D3365A; padding: 1vh 0vw 1vh 0vw; zIndex: 1'>nounced Monday the invention of a new diag<span class='highlight' style='background-color: #59c00aba; padding: 2vh 0vw 2vh 0vw; zIndex: 2'>nostic tool tha</span></span>t can sort cells by type of small printed chip</span>"