In [6]:
#imports
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch
from torch.optim import AdamW
import sacrebleu
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [7]:
#loading the data set
df = pd.read_csv('train1.csv')
print(df.head())

#use 'Tagged Sentence' for training input and 'English Translation' as the target otherwise replace Tagged Sentence with Tagalog if you want to train it without the tagged pronouns or uncomment the line below
#tagalog_sentences = df['Tagalog']

tagalog_sentences = df['Tagged Sentence']
english_translations = df['English Translation']


                           Tagalog Sentence  \
0                        Siya ay nagbabasa.   
1                        Sila ay nagbabasa.   
2  Si Maria ay nagluluto. Siya ay kumakain.   
3       Si Juan ay tumatakbo. Siya ay uhaw.   
4                       Siya ay nagkakamot.   

                                     Tagged Sentence  \
0  Siya [singular, third-person, gender-neutral] ...   
1          Sila [plural, third-person] ay nagbabasa.   
2  Si Maria ay nagluluto. Siya [singular, third-p...   
3  Si Juan ay tumatakbo. Siya [singular, third-pe...   
4  Siya [singular, third-person, gender-neutral] ...   

                English Translation  \
0                 They are reading.   
1                 They are reading.   
2  Maria is cooking. She is eating.   
3   Juan is running. He is thirsty.   
4                 They are itching.   

                           Pronoun Annotation  
0  Pronoun = "they," singular, gender-neutral  
1                    Pronoun = "they," plural  
2

In [8]:

#loading BART tokenizer and tokenizing inputs(tagalog_sentences) and targets (english_translations)
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
inputs = tokenizer(list(tagalog_sentences), return_tensors='pt', padding=True, truncation=True, max_length=512)
targets = tokenizer(list(english_translations), return_tensors='pt', padding=True, truncation=True, max_length=512)

#checking the tokenized input and target examples:
#print(inputs['input_ids'][0])
#print(targets['input_ids'][0])




In [9]:
#data are split into training and validation sets (80% train, 20% for validation)
train_inputs, val_inputs, train_targets, val_targets = train_test_split(inputs['input_ids'], targets['input_ids'], test_size=0.2, random_state=42)

In [10]:
#creating datasets and dataloaders from inputs and targets 
train_dataset = TensorDataset(train_inputs, train_targets)
val_dataset = TensorDataset(val_inputs, val_targets)
#dataloaders for batching
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)



In [12]:
#(i always get an error if i set cuda alone that's why i made this if else cpu)
#if CUDA (GPU) is available
if torch.cuda.is_available():
    device = torch.device('cuda')  #GPU is available, use it
else:
    device = torch.device('cpu')  #GPU not available, use CPU

#load pre-trained BART model
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

#defining optimizer Adamw
optimizer = AdamW(model.parameters(), lr=5e-5)



In [14]:
#training
#can increase more epochs if needed
epochs = 3
best_val_loss = float('inf')
patience = 3 
patience_counter = 0

for epoch in range(epochs):
    
    model.train()
    train_loss = 0
    for batch in train_loader:
        input_ids, target_ids = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, labels=target_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loss += loss.item()
    
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {train_loss / len(train_loader)}")

    #validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, target_ids = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, labels=target_ids)
            val_loss += outputs.loss.item()
    val_loss /= len(val_loader)
    print(f"Validation Loss: {val_loss}")

    #early stopping to prevent overfitting by the model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        model.save_pretrained('./best_fine_tuned_bart')  
    else:
        patience_counter += 1
        if patience_counter >= patience:
    #if model reaches the patience threshold without improvement, stop training
            print("Early stopping initiated!")
            break

#saving the final model after training
model.save_pretrained('./fine_tuned_bart')  
tokenizer.save_pretrained('./fine_tuned_bart') 


Epoch 1/3, Training Loss: 4.787105048289065


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


Validation Loss: 1.9432356879115105
Epoch 2/3, Training Loss: 1.1138243577519402


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


Validation Loss: 0.741957537829876
Epoch 3/3, Training Loss: 0.698284062205768


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


Validation Loss: 0.7312694564461708


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('./fine_tuned_bart\\tokenizer_config.json',
 './fine_tuned_bart\\special_tokens_map.json',
 './fine_tuned_bart\\vocab.json',
 './fine_tuned_bart\\merges.txt',
 './fine_tuned_bart\\added_tokens.json')

In [16]:
#evaluation
def evaluate_metrics(model, val_loader):
    model.eval()
    predictions = []
    references = []
    #tokenized references for word-level comparisons
    tokenized_references = []
    #tokenized predictions for word-level comparisons
    tokenized_predictions = []  
    
    with torch.no_grad():
        for batch in val_loader:
            input_ids, target_ids = [b.to(device) for b in batch]
            #can choose to have different num_beams, i have tried wih 7, 10 but kept it at 5.
            generated_ids = model.generate(input_ids=input_ids, max_length=50, num_beams=5, early_stopping=True)
            decoded_preds = [tokenizer.decode(g, skip_special_tokens=True) for g in generated_ids]
            decoded_refs = [tokenizer.decode(t, skip_special_tokens=True) for t in target_ids]
            
            predictions.extend(decoded_preds)
            references.extend(decoded_refs)
            
            #tokenizing for word-level metrics
            for ref in decoded_refs:
                tokenized_references.append(ref.split())
            for pred in decoded_preds:
                tokenized_predictions.append(pred.split())
    
    #calculate BLEU score (for discussion purposes)
    bleu = sacrebleu.corpus_bleu(predictions, [[r] for r in references])
    print(f"BLEU Score: {bleu.score}")

    #the tokenized references are lists of words and I decided to flatten them into a single list
    #for word-level comparisons (forevaluation of precision, recall, F1 score, accuracy)
    #flatten tokenized_references and tokenized_predictions for word-level evaluation
    flat_references = []
    flat_predictions = []
    
    #flatten tokenized_references into flat_references
    for ref in tokenized_references:
        for word in ref:
            flat_references.append(word)

    #flatten tokenized_predictions into flat_predictions
    #similar to references, tokenized predictions need to be flattened for individual word comparisons
    for pred in tokenized_predictions:
        for word in pred:
            flat_predictions.append(word)
    
    #making sure flat_references and flat_predictions have equal lengths
    #making sure the accuracy, precision, recall and F1 scores are calculated correctly
    #this concerns whether the two lists have different lengths then they cannot be directly compared for evaluation
    min_length = min(len(flat_references), len(flat_predictions))
    flat_references = flat_references[:min_length]
    flat_predictions = flat_predictions[:min_length]
    
    #calculating precision, recall, F1, and accuracy
    precision, recall, f1, _ = precision_recall_fscore_support(flat_references, flat_predictions, average='weighted', zero_division=0)
    accuracy = accuracy_score(flat_references, flat_predictions)

#overall evaluation scores:
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
# Call the evaluation function
evaluate_metrics(model, val_loader)



BLEU Score: 75.98356856515926
Precision: 0.0629
Recall: 0.0630
F1 Score: 0.0620
Accuracy: 0.0630


In [17]:
#testing example Tagalog sentences with input on the left and expected output on the right (with accuracy calculation)
test_sentences = [
    ("Siya ay nagluluto.", "They(sg) are cooking."),               
    ("Sila ay nagtatrabaho.", "They(pl) are working."),
    ("Si Maria ay nagsusulat.", "Maria is writing."),
    ("Si Juan ay naglalaro ng basketball.", "Juan is playing basketball."),
    ("Siya ay natutulog.", "They(sg) are sleeping."),
    ("Siya ay nag-aaral ng Ingles.", "They(sg) are studying English."),
    ("Sila ay nagbabasa ng aklat.", "They(pl) are reading a book."),
    ("Siya ay kumakain ng almusal.", "They(sg) are eating breakfast."),
    ("Si Pedro ay naglilinis ng kotse.", "Pedro is cleaning the car."),
    ("Nagaaral siya ng hapon.", "They(sg) are learning Japanese"),
    ("Si Ana ay nagpapatugtog ng piano.", "Ana is playing the piano.")
]

for test_sentence, expected_translation in test_sentences:
    test_inputs = tokenizer(test_sentence, return_tensors='pt', truncation=True, max_length=128).to(device)

    #generating and decoding translation 
    #can change num_beams
    translated_ids = model.generate(test_inputs['input_ids'], max_length=50, num_beams=5, early_stopping=True)
    translated_sentence = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    
    #word-level accuracy calculation
    expected_words = expected_translation.split()
    translated_words = translated_sentence.split()
    correct_words = 0
    for ew, tw in zip(expected_words, translated_words):
        if ew == tw:
            correct_words += 1
    accuracy = correct_words / max(len(expected_words), len(translated_words)) * 100  
    print(f"Original: {test_sentence}")
    print(f"Expected Translation: {expected_translation}")
    print(f"Generated Translation: {translated_sentence}")
    print(f"Word-Level Accuracy: {accuracy:.2f}%")
    print("=" * 50)


Original: Siya ay nagluluto.
Expected Translation: They(sg) are cooking.
Generated Translation: They are buying fruits.
Word-Level Accuracy: 25.00%
Original: Sila ay nagtatrabaho.
Expected Translation: They(pl) are working.
Generated Translation: They are buying fruits.
Word-Level Accuracy: 25.00%
Original: Si Maria ay nagsusulat.
Expected Translation: Maria is writing.
Generated Translation: Maria is cooking a dish. She is good.
Word-Level Accuracy: 25.00%
Original: Si Juan ay naglalaro ng basketball.
Expected Translation: Juan is playing basketball.
Generated Translation: Juan is playing basketball.
Word-Level Accuracy: 100.00%
Original: Siya ay natutulog.
Expected Translation: They(sg) are sleeping.
Generated Translation: They are buying fruits.
Word-Level Accuracy: 25.00%
Original: Siya ay nag-aaral ng Ingles.
Expected Translation: They(sg) are studying English.
Generated Translation: They are studying math.
Word-Level Accuracy: 50.00%
Original: Sila ay nagbabasa ng aklat.
Expected

In [18]:
#testing example Tagalog sentences with input on the left and expected output on the right (with accuracy and bleu calculation)
test_sentences = [
    ("Siya ay nagluluto.", "They are cooking."),
    ("Sila ay nagtatrabaho.", "They are working."),
    ("Si Maria ay nagsusulat.", "Maria is writing."),
    ("Si Juan ay naglalaro ng basketball.", "Juan is playing basketball."),
    ("Siya ay natutulog.", "They are sleeping."),
    ("Siya ay nag-aaral ng Ingles.", "They are studying English."),
    ("Sila ay nagbabasa ng aklat.", "They are reading a book."),
    ("Siya ay kumakain ng almusal.", "They are eating breakfast."),
    ("Si Pedro ay naglilinis ng kotse.", "Pedro is cleaning the car."),
    ("Si Ana ay nagpapatugtog ng piano.", "Ana is playing the piano.")
]


for test_sentence, expected_translation in test_sentences:
    #tokenizing the test sentence
    test_inputs = tokenizer(test_sentence, return_tensors='pt', truncation=True, max_length=128).to(device)

    #generating and decoding translation 
    translated_ids = model.generate(test_inputs['input_ids'], max_length=50, num_beams=7, early_stopping=True)
    translated_sentence = tokenizer.decode(translated_ids[0], skip_special_tokens=True)

    #word-level accuracy
    expected_words = expected_translation.split()
    translated_words = translated_sentence.split()
    correct_words = 0
    for ew, tw in zip(expected_words, translated_words):
        if ew == tw:
            correct_words += 1
    accuracy = correct_words / max(len(expected_words), len(translated_words)) * 100  

    #BLEU score 
    bleu = sacrebleu.corpus_bleu([translated_sentence], [[expected_translation]])
    print(f"Original: {test_sentence}")
    print(f"Expected Translation: {expected_translation}")
    print(f"Generated Translation: {translated_sentence}")
    print(f"Word-Level Accuracy: {accuracy:.2f}%")
    print(f"Sentence BLEU Score: {bleu.score:.2f}")
    print("=" * 50) 



Original: Siya ay nagluluto.
Expected Translation: They are cooking.
Generated Translation: They are buying fruits.
Word-Level Accuracy: 50.00%
Sentence BLEU Score: 23.64
Original: Sila ay nagtatrabaho.
Expected Translation: They are working.
Generated Translation: They are buying fruits.
Word-Level Accuracy: 50.00%
Sentence BLEU Score: 23.64
Original: Si Maria ay nagsusulat.
Expected Translation: Maria is writing.
Generated Translation: Maria is cooking a dish. She is good.
Word-Level Accuracy: 25.00%
Sentence BLEU Score: 9.29
Original: Si Juan ay naglalaro ng basketball.
Expected Translation: Juan is playing basketball.
Generated Translation: Juan is playing basketball.
Word-Level Accuracy: 100.00%
Sentence BLEU Score: 100.00
Original: Siya ay natutulog.
Expected Translation: They are sleeping.
Generated Translation: They are buying fruits.
Word-Level Accuracy: 50.00%
Sentence BLEU Score: 23.64
Original: Siya ay nag-aaral ng Ingles.
Expected Translation: They are studying English.
Ge