NLLB

In [31]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load model and tokenizer
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



In [32]:
def translate(source_text, source_lang, target_lang):

    # Kinyarwanda Tokenizer
    tokenizer.src_lang = source_lang

    # Tokenize the input text
    inputs = tokenizer(source_text, return_tensors="pt")

    # Generate the translation according to target language specified
    translated_tokens = model.generate(
        **inputs, forced_bos_token_id=tokenizer.lang_code_to_id[target_lang], max_length=30
    )

    # Decode the translated tokens for translated text
    translated_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    return translated_text

In [3]:
translate("Didn't get here being careful", "en", "zho_Hans")

the `lang_code_to_id` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder` this attribute will be removed in `transformers` v4.38


'不过,我没有到这里来,要小心.'

In [35]:
translate("Students from Stanford University Medical School announced Monday the invention of a new diagnostic tool that can sort cells by type of small printed chip",  "en", "zho_Hans")

'斯坦福大学医学院的学生周一宣布发明了一种新的诊断工具,可以根据小型印花芯片的'

MQM Score

In [6]:
from huggingface_hub import snapshot_download
# pip install unbabel-comet is the one for the below line, otherwise it's pip install comet or pip install comet-ml I don't know
from comet import download_model, load_from_checkpoint
# Download the model
token = "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"
mqm_model_path = snapshot_download(repo_id="Unbabel/wmt23-cometkiwi-da-xl", use_auth_token=token)



# Load the model from the checkpoint
mqm_model = load_from_checkpoint(mqm_model_path + '\checkpoints\model.ckpt')


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

Encoder model frozen.


In [63]:
data = [{
    "src": "Didn't get here being careful",
    "mt":  "不过,我没有到这里来,要小心.",
    }]

mqm_model.predict(data, batch_size=1, gpus=1)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 1/1 [00:01<00:00,  1.21s/it]


Prediction([('scores', [0.15611198544502258]),
            ('system_score', 0.15611198544502258)])

Fine-Grained Error Detection

In [None]:
data = [{
    "src": "Didn't get here being careful",
    "mt":  "不过,我没有到这里来,要小心.",
    }]

mqm_model.predict(data, batch_size=1, gpus=1)

In [1]:
from comet import download_model, load_from_checkpoint
# This line comes with quite the issues, we don't know the full solution yet but do 
# pip install "unbabel-comet>=2.2.0"

In [9]:
from comet import download_model, load_from_checkpoint

FGED_model_path = download_model("Unbabel/XCOMET-XL")
FGED_model = load_from_checkpoint(FGED_model_path)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Encoder model frozen.
C:\Users\mekae\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\pytorch_lightning\core\saving.py:177: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [3]:
import torch
import os
import json

# Set the precision to 'medium' for a balance between performance and precision
torch.set_float32_matmul_precision('medium')

from dotenv import load_dotenv
load_dotenv()

# Set up OpenAI Client
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
from openai import OpenAI
client = OpenAI(api_key= OPENAI_API_KEY)

In [64]:
data = [{
    "src": "Talking about risks, I take those. Didn't get here being careful",
    # "mt":  "談到風險，我承擔這些風險。沒到這裡小心點",
    "mt":  "谈到风险,我会承担这些风险.我没有来这里是谨慎的.",

    # "src": "Students from Stanford University Medical School announced Monday the invention of a new diagnostic tool that can sort cells by type of small printed chip",
    # "mt":  "史丹佛大學醫學院的學生週一宣布發明了一種新的診斷工具，可以根據小型印刷晶片的類型對細胞進行分類",
    
    # "src": "23 should be all on my back",
    # "mt":  "23岁应该是我背后的.",
    
    
    # "src": "Well rounded searching for the right outlet",
    # "mt":  "即使一个男人倒下,我也会变得更强大. 圆满寻找合适的输出口.",

    # "src": "The government is investing in infrastructure projects to boost the economy and create jobs",
    # "mt":  "政府正在投資基礎設施項目，以促進經濟增長並創造就業機會",
    }]


FGED_model.eval()
with torch.no_grad(): 
    FGED_model_output = FGED_model.predict(data, batch_size=1, gpus=1)

print(FGED_model_output)

# Segment-level scores
print (FGED_model_output.scores)

# System-level score
print (FGED_model_output.system_score)

# Score explanation (error spans)
print (FGED_model_output.metadata.error_spans)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  1.98it/s]


Prediction([('scores', [0.4809541404247284]), ('system_score', 0.4809541404247284), ('metadata', Prediction([('src_scores', [0.33455660939216614]), ('mqm_scores', [1.0]), ('error_spans', [[]])]))])
[0.4809541404247284]
0.4809541404247284
[[]]


In [85]:
response = client.chat.completions.create(
  model="gpt-3.5-turbo-16k",
  messages=[
    {
      "role": "system",
      "content": [
        {
          "text": "You are an expert in multilingual translations. Given MQM scores, corrected text, error spans, you will determine a classification for the error out of ONLY one of the following: Addition of Text ,Negation Errors ,Mask In-filling ,Named Entity (NE) Errors ,Number (NUM) Errors ,Hallucinations\n\nThen turn that into this format by correctly indicating the start and end corresponding indices\n\n{\r\n    \"errors\": [\r\n      {\r\n        \"original_text\": \"Учените\",\r\n        \"translated_text\": \"Students\",\r\n        \"correct_text\": \"Scientists\",\r\n        \"start_index_orig\": 0,\n\r\n        \"end_index_orig\": 7,\r\n        \"start_index_translation\": 0,\r\n        \"end_index_translation\": 7,\r\n        \"error_type\": \"Incorrect Subject\"\r\n      } \n    ]\r\n  }\r\n\r\n",
          "type": "text"
        }
      ]
    },
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": str(data[0]) + "\n\n" + str(FGED_model_output.system_score) + " " + str(FGED_model_output.metadata.error_spans)
        }
      ]
    },
  ],
  temperature=1,
  max_tokens=256,
  top_p=1,
  frequency_penalty=0,
  presence_penalty=0
)

json_object = json.loads(response.choices[0].message.content)
print(json.dumps(json_object, indent=4))


with open('response.json', 'w') as f:
    json.dump(json_object, f, indent=4)

HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{
    "errors": [
        {
            "original_text": "\u8c28\u614e\u7684.",
            "translated_text": "careful.",
            "correct_text": "careful",
            "start_index_orig": 1,
            "end_index_orig": 5,
            "start_index_translation": 6,
            "end_index_translation": 13,
            "error_type": "Mask In-filling"
        }
    ]
}


In [16]:
color_mappings = {
    "Addition of Text": "#FF5733",
    "Negation Errors": "#00A0F0",
    "Mask In-filling": "#59c00a",
    "Named Entity (NE) Errors": "#D3365A",
    "Number (NUM) Errors": "#8B4513",
    "Hallucinations": "#800080",
    "No Error": "#2f3472"
}

# final = translation
final = "Cause they don't want to see you win dawg"

zIndex = 0
offset = 0

for error in spans["errors"]:
    # init
    start = error["start_index_translation"]
    end = error["end_index_translation"]
    if error["error_type"] in color_mappings: 
        color = color_mappings[error["error_type"]]
    else:
        color = "#FFFFFF"

    id = zIndex

    # tags
    Ltag = "<span class='highlight' id='highlight-" + str(id) + "' style='background-color: " + color + "; padding: " + str(zIndex) + "vh 0vw " + str(zIndex) + "vh 0vw; zIndex: " + str(zIndex) + "'>"
    Rtag = "</span>"
    
    # Algo
    # Must go left to right if we use this ordering of offset
    final = final[:start + offset] + Ltag + final[start + offset:end + offset] + Rtag + final[end + offset:]
    offset += len(Ltag) + len(Rtag)
    zIndex += 1

print("<span>" + final + "</span>")

<span>Cause the<span class='highlight' id='highlight-0' style='background-color: #800080; padding: 0vh 0vw 0vh 0vw; zIndex: 0'>y do</span>n't want to see you win dawg</span>


In [10]:

# Example data
data = [
    {
        # "src": "Students from Stanford University Medical School announced Monday the invention of a new diagnostic tool that can sort cells by type of small printed chip",
        # "mt":  "史丹佛大學醫學院的學生週一宣布發明了一種新的診斷工具，可以根據小型印刷晶片的類型對細胞進行分類",
        
        "src": "Cause they don't want to see you win dawg",
        "mt":  "因为他们不想看到你赢得比赛.",
    },
    # Add more data pairs as needed
]

# Run prediction
predictions = FGED_model.predict(data, batch_size=8, gpus=1)
predictions
# # Process and print detailed output
# for prediction in predictions:
#     print("Quality Score:", prediction['score'])
#     for error in prediction['error_spans']:
#         print(f"Text: {error['text']}, Confidence: {error['confidence']}, Severity: {error['severity']}, Type: {error['type']}")


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 1/1 [00:02<00:00,  2.20s/it]


Prediction([('scores', [0.8951457142829895]),
            ('system_score', 0.8951457142829895),
            ('metadata',
             Prediction([('src_scores', [0.921981692314148]),
                         ('mqm_scores', [0.800000011920929]),
                         ('error_spans',
                          [[{'text': '赢得比赛',
                             'confidence': 0.41127336025238037,
                             'severity': 'major',
                             'start': 9,
                             'end': 13}]])]))])

In [23]:
# for prediction in predictions:
    # print(prediction)

predictions

Prediction([('scores', [0.9770506620407104]),
            ('system_score', 0.9770506620407104),
            ('metadata',
             Prediction([('src_scores', [0.9818598628044128]),
                         ('mqm_scores', [0.9599999785423279]),
                         ('error_spans',
                          [[{'text': '小型印刷晶片',
                             'confidence': 0.3709107041358948,
                             'severity': 'minor',
                             'start': 31,
                             'end': 37}]])]))])

In [16]:
error_span_mappings = {
    "lexical_errors": {
        "Addition of Text": "Extra content added that was not present in the source text.",
        "Negation Errors": "Incorrect handling of negations, changing the meaning of the sentence.",
        "Mask In-filling": "Issues where masked parts are filled incorrectly.",
        "Named Entity (NE) Errors": "Incorrect translation or handling of named entities like names, places, etc.",
        "Number (NUM) Errors": "Errors related to numbers, such as incorrect translation of quantities.",
        "Hallucinations": "Pathological translations where content is detached from the source."
    },
    "error_severity": {
        "OK": "Correct translation with no issues.",
        "MIN": "Small errors that might affect fluency or slightly alter the meaning.",
        "MAJ": "Significant errors that affect the meaning or readability of the translation.",
        "CRIT": "Severe errors that render the translation incorrect or highly misleading."
    }
}

In [12]:
import json

with open('sample_error_span.json', encoding='utf-8') as f:
        data = json.load(f)

data

{'errors': [{'original_text': 'Учените',
   'translated_text': 'Students',
   'correct_text': 'Scientists',
   'start_index_orig': 0,
   'end_index_orig': 7,
   'start_index_translation': 0,
   'end_index_translation': 7,
   'error_type': 'Incorrect Subject'},
  {'original_text': 'който може да сортира клетките по тип: малък печатен чип',
   'translated_text': '',
   'correct_text': 'that can sort cells by type: small printed chip',
   'start_index_orig': 75,
   'end_index_orig': 131,
   'start_index_translation': 86,
   'end_index_translation': 108,
   'error_type': 'Omission'},
  {'original_text': 'изобретяването на нов диагностичен инструмент',
   'translated_text': 'the invention of a new diagnostic tool that can sort cells by a type of small printed',
   'correct_text': 'the invention of a new diagnostic tool',
   'start_index_orig': 43,
   'end_index_orig': 75,
   'start_index_translation': 51,
   'end_index_translation': 108,
   'error_type': 'Incomplete Sentence'}]}

Error Span Scoring

Highlighting

In [3]:
translation = "Students from Stanford University Medical School announced Monday the invention of a new diagnostic tool that can sort cells by type of small printed chip"

spans = [
{
    "errors": [
    {
        "original_text": "Учените",
        "translated_text": "Students",
        "correct_text": "Scientists",
        "start_index_orig": 0,
        "end_index_orig": 7,
        "start_index_translation": 0,
        "end_index_translation": 7,
        "error_type": "Incorrect Subject"
    },
    {
        "original_text": "изобретяването на нов диагностичен инструмент",
        "translated_text": "the invention of a new diagnostic tool that can sort cells by a type of small printed",
        "correct_text": "the invention of a new diagnostic tool",
        "start_index_orig": 43,
        "end_index_orig": 75,
        "start_index_translation": 51,
        "end_index_translation": 108,
        "error_type": "Incomplete Sentence"
    },
    # {
    #     "original_text": "който може да сортира клетките по тип: малък печатен чип",
    #     "translated_text": "",
    #     "correct_text": "that can sort cells by type: small printed chip",
    #     "start_index_orig": 75,
    #     "end_index_orig": 131,
    #     "start_index_translation": 52,
    #     "end_index_translation": 106,
    #     "error_type": "Omission"
    # },
    {
        "original_text": "който може да сортира клетките по тип: малък печатен чип",
        "translated_text": "",
        "correct_text": "that can sort cells by type: small printed chip",
        "start_index_orig": 75,
        "end_index_orig": 131,
        "start_index_translation": 86,
        "end_index_translation": 108,
        "error_type": "Omission"
    }
    ]
}
]

In [15]:
spans = [{
    "errors": [
        {
            "original_text": "\u8ac7\u5230\u98a8\u96aa\uff0c\u6211\u627f\u64d4\u9019\u4e9b\u98a8\u96aa\u3002",
            "translated_text": "Talking about risks, I take those.",
            "correct_text": "Talking about risks, I take these risks.",
            "start_index_orig": 0,
            "end_index_orig": 20,
            "start_index_translation": 0,
            "end_index_translation": 20,
            "error_type": "Addition of Text"
        }
    ]
}]


spans = {
    'errors': [
        {
            'original_text': '赢得比赛', 
            'translated_text': 'win the game', 
            'correct_text': 'win the game',
            'start_index_orig': 9,
            'end_index_orig': 13,
            'start_index_translation': 9,
            'end_index_translation': 13,
            'error_type': 'Hallucinations'
        }
    ]
}

In [40]:
translation = translate("Students from Stanford University Medical School announced Monday the invention of a new diagnostic tool that can sort cells by type of small printed chip",  "en", "zho_Hans")
spans = json_object

In [69]:
spans

for error in spans:
    print(spans[error]) 

[{'original_text': '小型印刷晶片', 'translated_text': 'small printed chip', 'correct_text': 'small printed chip', 'start_index_orig': 31, 'end_index_orig': 37, 'start_index_translation': 31, 'end_index_translation': 37, 'error_type': 'No Error'}]


In [60]:
FGED_model_path

'C:\\Users\\mekae\\.cache\\huggingface\\hub\\models--Unbabel--XCOMET-XL\\snapshots\\baa17625e541fe87c4c0010616e35eab12c864f7\\checkpoints\\model.ckpt'

In [1]:
translation[2:7]

t = translation[:4] + "AHHHH" + translation[4:]
t
offset = len("AHHHH")
translation[2:7]
t[2 + offset : 7 + offset]

NameError: name 'translation' is not defined

In [4]:
def errorSpanHighlighter(translation, spans, color_mappings):
    final = translation

    zIndex = 0
    offset = 0

    for error in spans["errors"]:
        # init
        start = error["start_index_translation"]
        end = error["end_index_translation"]
        color = color_mappings[error["error_type"]]

        # tags
        Ltag = "<span class='highlight' style='background-color: " + color + "; padding: " + str(zIndex) + "vh 0vw " + str(zIndex) + "vh 0vw; zIndex: " + str(zIndex) + "'>"
        Rtag = "</span>"
        
        # Algo
        # Must go left to right if we use this ordering of offset
        final = final[:start + offset] + Ltag + final[start + offset:end + offset] + Rtag + final[end + offset:]
        offset += len(Ltag) + len(Rtag)
        zIndex += 1

    return "<span>" + final + "</span>"


colors = {
    "Addition of Text": "#FF5733",
    "Negation Errors": "#00A0F0",
    "Mask In-filling": "#59c00a",
    "Named Entity (NE) Errors": "#D3365A",
    "Number (NUM) Errors": "#8B4513",
    "Hallucinations": "#800080",
    "No Error": "#FFFFFF"
}

errorSpanHighlighter(translation, spans, colors)

# spans

TypeError: list indices must be integers or slices, not str

In [68]:

def errorSpanHighlighter(translation, spans, color_mappings):
    final = translation

    zIndex = 0
    offset = 0

    for error in spans[0]["errors"]:
        # init
        start = error["start_index_translation"]
        end = error["end_index_translation"]
        color = color_mappings[error["error_type"]]

        # tags
        Ltag = "<span class='highlight' style='background-color: " + color + "; padding: " + str(zIndex) + "vh 0vw " + str(zIndex) + "vh 0vw; zIndex: " + str(zIndex) + "'>"
        Rtag = "</span>"
        
        # Algo
        # Must go left to right if we use this ordering of offset
        final = final[:start + offset] + Ltag + final[start + offset:end + offset] + Rtag + final[end + offset:]
        offset += len(Ltag) + len(Rtag)
        zIndex += 1

    return "<span>" + final + "</span>"



colors = {
    "Incorrect Subject": "#00A0F0",
    "Omission": "#59c00aba",
    "Incomplete Sentence": "#D3365A",
}

errorSpanHighlighter(translation, spans, colors)

KeyError: 0

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

from openai import OpenAI
client = OpenAI(api_key= OPENAI_API_KEY)

In [83]:
# Feeding the model (OpenAI API) the original translation meta descriptors and prompting for a new/rephrased/alternate translation
def rephrase(previous_translations, original_source_text, source_language, target_language):

  # Construct the prompt for rephrased/alternate translation
  prompt = f'''Please provide an alternative translation to and rephrase: "{previous_translations}"
          The original text was {original_source_text}.
          The source language is {source_language} and the target language is {target_language}.'''


  response = client.chat.completions.create(
    # model="gpt-4o-2024-05-13",
    model="gpt-3.5-turbo",
    messages=[
    {
      "role": "system",
      "content": [
        {
          "type": "text",
          "text": f"Please provide an alternative translation to and rephrase: \"{previous_translations}\"\r\n          The original text was {original_source_text}.\r\n          The source language is {source_language} and the target language is {target_language}. Give only the translation"
        }
      ]
    }
  ],
    temperature=1,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
  )

  return response.choices[0].message.content

# Get alternative translation for demo text
previous_translations = [
  "The weather is great today, let's take a walk in the park together.",
  "Today's weather is wonderful, shall we go for a walk in the park?",
  "It's a beautiful day today, how about we go for a stroll in the park?",
]

# Probably want to use the Translation memory/storage for avoiding repeat translations and get unique results each time

In [9]:
original_source_text = "今天的天气非常好，我们一起去公园散步吧。"
source_language = "zh"
target_language = "en"


alternative_translation = rephrase(previous_translations, original_source_text, source_language, target_language)
previous_translations.append(alternative_translation)
print(alternative_translation)

"The weather is very nice today, let's go for a walk in the park together."


In [10]:
for x in previous_translations:
    print(x)

The weather is great today, let's take a walk in the park together.
Today's weather is wonderful, shall we go for a walk in the park?
It's a beautiful day today, how about we go for a stroll in the park?
The weather is fantastic today, let's go for a walk in the park together.
"The weather is very nice today, let's go for a walk in the park together."


In [12]:
# Few-shot context changing via user input
def edit_context(context, original_text, source_language, target_language, few_shot_examples):

  response = client.chat.completions.create(
    # model="gpt-4o-2024-05-13",
    model="gpt-3.5-turbo",
    messages=[
    {
      "role": "system",
      "content": [
        {
          "type": "text",
          "text": f'''Translate "{original_text}" from the {source_language} language, into the {target_language} language
                      using the following context style: "{context}".
                      Here are some examples of the style being used: {few_shot_examples}
                      '''
        }
      ]
    }
  ],
    temperature=1,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
  )

  return response.choices[0].message.content


print("CHINESE TO ENGLISH EXAMPLE")

# Poetic Example
examples = '''"Light speeds away, in a blink it will fly."
              "Stars reach from afar, with light from eons bold."
           '''

print(edit_context("A poetic two line rhyming style", "光纤照上去变成黑光纤了", "zh", "en", examples), "\n\n")

# Confused and Inquisitive Example
examples = '''"How does it work so well when it's so unclear?"
              "Why did it begin so suddenly, what triggered it?"
            '''

print(edit_context("A confused and inquisitive style", "光纤照上去变成黑光纤了", "zh", "en", examples), "\n\n")



# English to Chinese Example
print("ENGLISH TO CHINESE EXAMPLE")

# Poetic Example
examples = '''"光逝如飞，瞬息即逝。"
              "星光遥来，古光犹豪。"
           '''

print(edit_context("A poetic two line rhyming style", "Lose track of time I'm bugging", "en", "zh", examples), "\n\n")

# Confused and Inquisitive Example
examples = '''"当它如此不清楚时，它如何运作得这么好？"
              "为什么它开始得这么突然，是什么触发了它？"
            '''

print(edit_context("A confused and inquisitive style", "Lose track of time I'm bugging", "en", "zh", examples))


CHINESE TO ENGLISH EXAMPLE
Fiber optic cable, in the dark it fades away, lost in night's play. 


Why did the fiber optics turn black when the light hit them? 


ENGLISH TO CHINESE EXAMPLE
时光荏苒，忘我迷离。 


我完全搞不清时间，我烦死了。
