## Command-line argument used to start the local llama.cpp server
python -m llama_cpp.server --host 0.0.0.0 --model Meta-Llama-3.1-8B-Instruct.Q8_0.gguf --n_ctx 30000 --n_gpu_layers 30

In [1]:
from openai import OpenAI
from collections import defaultdict as dd
import pandas as pd
# Client for the locally served model (see the server command at the top of the notebook).
# NOTE(review): the api_key value looks like a placeholder — confirm the local server does not check it.
client = OpenAI(
    api_key="cltl",
    base_url="http://localhost:8000/v1",
)


In [2]:
def query_LLM(model_client, prompt, temp=0.6):
    """Send ``prompt`` as a single system message and return the reply text.

    Args:
        model_client: Object exposing ``chat.completions.create`` (this
            notebook passes an ``OpenAI`` client).
        prompt: Text used as the ``content`` of the only message sent.
        temp: Value forwarded as ``temperature``; defaults to 0.6.

    Returns:
        The ``message.content`` of the first choice in the completion.
    """
    request = {
        "model": "local-model",  # this field is currently unused
        "messages": [{"role": "system", "content": prompt}],
        "temperature": temp,
        "stream": False,
    }
    response = model_client.chat.completions.create(**request)
    first_choice = response.choices[0]
    return first_choice.message.content

In [3]:
# Load the two pickled DataFrames that the evaluation cells below work on.
# Raw strings keep the Windows-style backslashes literal: in a normal string
# sequences such as '\C' and '\s' are invalid escapes that newer Python
# versions warn about. The resulting path values are unchanged.
# NOTE: unpickling can execute arbitrary code — only load pickle files from a
# trusted source.
path_to_corpus = r'.\Corpora\corpus_with_MT_translations.pkl'
corpus_df = pd.read_pickle(path_to_corpus)
path_to_syntethic = r'.\Corpora\sythetic_with_MT_translations.pkl'
synthetic_df = pd.read_pickle(path_to_syntethic)

In [4]:
# Ask the LLM for an MQM-style error annotation of every machine translation in
# corpus_df and store the raw reply text in a new 'Llama_Evaluation' column.
corpus_df['Llama_Evaluation'] = ''

# Iterate over the frame's own index labels rather than a hand-maintained
# counter. A manual counter only matches label-based lookups when the index is
# consecutive integers starting at the hard-coded value; otherwise the lookup
# raises KeyError or .loc appends a new row for the missing label.
for row_label, row in corpus_df.iterrows():
    dutch_sent = str(row["Dutch_Sentence"])
    translated_sentence = str(row['Translated_Dutch'])
    # The prompt only uses the source and the machine translation; the
    # reference translation column is not part of it.
    prompt = f"""
[System message] You are a professional Dutch-English translator.
[Instruction message] You are a professional Dutch-English translator. Your task is to identify translation errors from a pair of the source Dutch sentence and the target English sentence. 
Please identify a maximum of 5 errors, assigning an error type for each with a severity scale from 1 (least sever) to 5 (most severe) using
the MQM annotation scheme. Please consider the following criteria for identifying errors:
Accuracy: when the target translation does not accurately represent the source 
Omission: when the target translation is missing content present in the source text, identify what was omitted.
Fluency: issues with punctuation, spelling, grammar, register and inconsistency.
Style: when the translation is grammatically correct but uses unnatural or awkward language.
Terminology: inappropriate or inconsistent use of terms
Locale convention: issues with formatting

Please provide the output in JSON format including the following keys for each error: 
['error type'(value:Accuracy, Omission, Fluency, Style, Terminology, Locale convention), 'error span index'(start and end index of the marked sentence in the target sentence) ,'marked text'(the identified error in the target sentence,'severity'(1-5))]
Do this for {dutch_sent} as source and {translated_sentence} as target.
Not all sentence necessarily contain errors.
[Requirement] output shall only consist of text in the prescribed JSON format
"""
    query_answer = query_LLM(client,prompt,temp=0.1)
    print(query_answer)
    # Write each reply immediately so earlier results are kept if a later
    # request fails.
    corpus_df.loc[row_label,'Llama_Evaluation'] = query_answer


{
  "errors": [
    {
      "error_type": "Accuracy",
      "error_span_index": {"start": 0, "end": 3},
      "marked_text": "Many African women go home with a fistle",
      "severity": 5
    },
    {
      "error_type": "Terminology",
      "error_span_index": {"start": 4, "end": 11},
      "marked_text": "and without a baby",
      "severity": 1
    }
  ]
}
{
  "errors": [
    {
      "error_type": "Accuracy",
      "error_span_index": {"start": 0, "end": 7},
      "marked_text": "Do not consider us as",
      "severity": 3
    },
    {
      "error_type": "Terminology",
      "error_span_index": {"start": 8, "end": 24},
      "marked_text": "de moppentrommel van de unief.",
      "severity": 5
    }
  ]
}
{
  "errors": [
    {
      "error_type": "Accuracy",
      "error_span_index": {"start": 0, "end": 1},
      "marked_text": "There is a patient who has been helped with our techniques.",
      "severity": 2
    },
    {
      "error_type": "Omission",
      "error_span_index": {"

In [5]:
# Display corpus_df to inspect the 'Llama_Evaluation' column filled in above.
corpus_df

Unnamed: 0,Dutch_Sentence,English_Sentence,Dutch Sentence Length,English Sentence Length,Translated_Dutch,BLEU_score,Llama_Evaluation
115,Als iedereen een koffiepauze zou houden in dit...,If everybody had a coffee break in this sort o...,17,19,If everyone had a coffee break in this kind of...,32.912563,"{\n ""errors"": [\n {\n ""error_type"": ""..."
116,Sommigen waren zo wanhopig dat ze hun kleren u...,Ive seen people so desperate that they take so...,16,24,Some were so desperate that they took off thei...,14.551430,"{\n ""errors"": [\n {\n ""error_type"": ""..."
117,Ik heb heel veel mensen naar het ziekenhuis ge...,"Ive taken so many people to hospital, but we c...",16,17,"I've taken a lot of people to the hospital, bu...",11.451997,"{\n ""errors"": [\n {\n ""error_type"": ""..."
118,Het is belangrijk dat er rekening gehouden wor...,It is important that the frequency of this phe...,32,37,It is important that this phenomenon is taken ...,22.497813,"[\n {\n ""error type"": ""Accuracy"",\n ""er..."
119,Daarenboven evalueren vrouwen het dansen van s...,"In addition, women rate dances by symmetrical ...",32,27,"In addition, women are evaluating the dancing ...",9.571600,"[\n {\n ""error type"": ""Accuracy"",\n ""er..."
...,...,...,...,...,...,...,...
310,Ik werk er mee aan een Europees actieplan voor...,I am working on a European action plan for tel...,20,19,I am working on a European action plan for tel...,68.880746,"{\n ""errors"": [\n {\n ""error_type"": ""..."
311,Nederland is in de eerste plaats een groot lab...,"The Netherlands is, first and foremost, a huge...",12,14,The Netherlands is first and foremost a great ...,36.086385,"{\n ""errors"": [\n {\n ""error_type"": ""..."
312,Wij zijn als het ware het oog en oor van Vlaan...,In a sense were the eyes and ears of Flanders ...,14,11,"We are, as it were, the eye-to-eye of Vlaander...",7.768563,"{\n ""errors"": [\n {\n ""error_type"": ""..."
313,De Nederlandse staatsveiligheid kon zich niet ...,The Dutch State Security Service couldnt belie...,15,16,The Dutch state security couldn't imagine that...,38.989389,"{\n ""errors"": [\n {\n ""error_type"": ""..."


In [7]:
# Ask the LLM for an MQM-style error annotation of every machine translation in
# synthetic_df and store the raw reply text in a new 'Llama_Evaluation' column.
synthetic_df['Llama_Evaluation'] = ''

# Iterate over the frame's own index labels rather than a counter started at a
# hard-coded label (previously 315). The counter only works while the index is
# consecutive integers beginning at exactly that value; otherwise the lookup
# raises KeyError or .loc appends a new row for the missing label.
for row_label, row in synthetic_df.iterrows():
    dutch_sent = str(row["Dutch_Sentence"])
    translated_sentence = str(row['Translated_Dutch'])
    # The prompt only uses the source and the machine translation; the
    # reference translation column is not part of it.
    prompt = f"""
[System message] You are a professional Dutch-English translator.
[Instruction message] You are a professional Dutch-English translator. Your task is to identify translation errors from a pair of the source Dutch sentence and the target English sentence. 
Please identify a maximum of 5 errors, assigning an error type for each with a severity scale from 1 (least sever) to 5 (most severe) using
the MQM annotation scheme. Please consider the following criteria for identifying errors:
Accuracy: when the target translation does not accurately represent the source 
Omission: when the target translation is missing content present in the source text, identify what was omitted.
Fluency: issues with punctuation, spelling, grammar, register and inconsistency.
Style: when the translation is grammatically correct but uses unnatural or awkward language.
Terminology: inappropriate or inconsistent use of terms
Locale convention: issues with formatting

Please provide the output in JSON format including the following keys for each error: 
['error type'(value:Accuracy, Omission, Fluency, Style, Terminology, Locale convention), 'error span index'(start and end index of the marked sentence in the target sentence) ,'marked text'(the identified error in the target sentence,'severity'(1-5))]
Do this for {dutch_sent} as source and {translated_sentence} as target.
Not all sentence necessarily contain errors.
[Requirement] output shall only consist of text in the prescribed JSON format
"""
    query_answer = query_LLM(client,prompt,temp=0.1)
    print(query_answer)
    # Write each reply immediately so earlier results are kept if a later
    # request fails.
    synthetic_df.loc[row_label,'Llama_Evaluation'] = query_answer

{
  "errors": [
    {
      "error_type": "Accuracy",
      "error_span_index": {"start": 0, "end": 4},
      "marked_text": "One call a day keeps the arts busy.",
      "severity": 5
    },
    {
      "error_type": "Terminology",
      "error_span_index": {"start": 7, "end": 10},
      "marked_text": "the arts",
      "severity": 3
    }
  ]
}
{
  "errors": [
    {
      "error_type": "Accuracy",
      "error_span_index": {"start": 0, "end": 6},
      "marked_text": "Money is the root of all evil.",
      "severity": 5
    },
    {
      "error_type": "Terminology",
      "error_span_index": {"start": 7, "end": 13},
      "marked_text": "the root of all evil",
      "severity": 3
    }
  ]
}
{
  "errors": [
    {
      "error_type": "Accuracy",
      "error_span_index": {"start": 0, "end": 1},
      "marked_text": "foolish",
      "severity": 3
    },
    {
      "error_type": "Terminology",
      "error_span_index": {"start": 4, "end": 11},
      "marked_text": "bad consultant",
   

In [8]:
# Persist both frames, including the new 'Llama_Evaluation' column, as pickles.
# Raw strings keep the Windows-style backslashes literal: in a normal string
# sequences such as '\C' and '\s' are invalid escapes that newer Python
# versions warn about. The resulting path values are unchanged.
corpus_df.to_pickle(r'.\Corpora\corpus_with_MT_translations_with_evaluation.pkl')
synthetic_df.to_pickle(r'.\Corpora\sythetic_with_MT_translations_with_evaluation.pkl')