In [None]:
import os

# Root folder of the corpus; the loading cell below looks for 'es-*' sub-folders in it.
data_path = r"C:\Users\shang\Desktop\clean"

# Sanity check: report a missing folder, otherwise show what the folder contains.
if not os.path.exists(data_path):
    print(f"{data_path} does not exist")
else:
    print(os.listdir(data_path))
    print("The file exists")

In [None]:
import os

def _read_stripped_lines(path):
    """Read a UTF-8 text file and return its lines.

    Leading/trailing whitespace of the whole file is removed before splitting
    on newlines, so blank lines at the very start or end are not returned.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().strip().split('\n')


# Maps a region folder name (one starting with 'es-') to its list of
# [english, spanish] sentence pairs.
region_data = {}

if not os.path.exists(data_path):
    # Report the problem instead of silently leaving region_data empty.
    print(f"{data_path} does not exist")
else:
    for folder_name in sorted(os.listdir(data_path)):
        folder_full_path = os.path.join(data_path, folder_name)

        # Only directories named 'es-*' are treated as regions.
        if not (os.path.isdir(folder_full_path) and folder_name.startswith('es-')):
            continue

        path_en = os.path.join(folder_full_path, 'all.en')
        path_es = os.path.join(folder_full_path, 'all.es')

        if not (os.path.exists(path_en) and os.path.exists(path_es)):
            print(f"Skipping {folder_name}: all.en / all.es not found")
            continue

        lines_en = _read_stripped_lines(path_en)
        lines_es = _read_stripped_lines(path_es)

        # The two files are paired line by line, so differing line counts mean
        # the pairing cannot be trusted; say so rather than dropping the region quietly.
        if len(lines_en) != len(lines_es):
            print(f"Skipping {folder_name}: {len(lines_en)} English lines "
                  f"vs {len(lines_es)} Spanish lines")
            continue

        # Keep only pairs where both sides contain text.
        region_data[folder_name] = [
            [en.strip(), es.strip()]
            for en, es in zip(lines_en, lines_es)
            if en.strip() and es.strip()
        ]


print("regions:", list(region_data.keys()))

In [None]:
import re

# Reserved vocabulary indices; they match the first three entries of Lang.index2word.
SOS_token = 0  # start-of-sentence marker, fed to the decoder as its first input
EOS_token = 1  # end-of-sentence marker, appended to every encoded sentence
UNK_token = 2  # index used by the first indexesFromSentence for out-of-vocabulary words
MAX_LENGTH = 20  # sentences must have fewer tokens than this; also the attention width

class Lang:
    """Vocabulary for one language: word <-> index maps plus word frequencies.

    Indices 0, 1 and 2 are reserved for the "SOS", "EOS" and "UNK" markers, so
    the first real word receives index 3. The markers appear only in
    ``index2word``; they are not entries of ``word2index``.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2: "UNK"}
        self.n_words = 3  # the three reserved markers are already counted

    def addSentence(self, sentence):
        """Register every space-separated word of ``sentence``.

        Empty tokens (produced by an empty sentence or by consecutive spaces)
        are ignored so the empty string never becomes a vocabulary entry.
        """
        for word in sentence.split(' '):
            if word:
                self.addWord(word)

    def addWord(self, word):
        """Add ``word`` with the next free index, or bump its count if already known."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

def normalizeString(s):
    """Lower-case ``s`` and reduce it to space-separated words and punctuation tokens.

    Steps:
      1. lower-case and trim;
      2. put a space on both sides of each of ``. ! ? ¿ ¡ ,`` so that opening
         marks (``¿``, ``¡``) are split from the word that follows them, just
         as closing marks are split from the word before them;
      3. replace every run of characters other than ASCII letters, the Spanish
         letters ``á é í ó ú ü ñ`` and the punctuation above with one space
         (``ü`` is kept so words such as "pingüino" are not broken apart);
      4. collapse whitespace and trim.

    Returns the normalized string; it may be empty if nothing survives step 3.
    """
    s = s.lower().strip()
    s = re.sub(r"([.!?¿¡,])", r" \1 ", s)
    s = re.sub(r"[^a-zA-ZáéíóúüñÁÉÍÓÚÜÑ.!?¿¡,]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

def filterPair(p):
    """Return True when both sentences of pair ``p`` have fewer than MAX_LENGTH tokens.

    Tokens are counted by splitting ``p[0]`` and ``p[1]`` on single spaces.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Use CUDA when PyTorch reports it as available, otherwise the CPU; the tensor
# helpers and models below are created on this device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"device: {device}")

def indexesFromSentence(lang, sentence):
    """Translate a space-separated sentence into a list of vocabulary indices.

    Words that are not in ``lang.word2index`` are encoded as ``UNK_token``.
    """
    vocab = lang.word2index
    return [vocab[token] if token in vocab else UNK_token
            for token in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of indices terminated by EOS_token.

    Returns a ``torch.long`` tensor on ``device`` with one row per index
    (shape ``(n_indices + 1, 1)``, the extra row being EOS_token).
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)

def tensorsFromPair(pair, input_lang, output_lang):
    """Encode both sides of ``pair`` and return ``(input_tensor, target_tensor)``.

    ``pair[0]`` is encoded with ``input_lang`` and ``pair[1]`` with ``output_lang``.
    """
    source_text, target_text = pair[0], pair[1]
    return (tensorFromSentence(input_lang, source_text),
            tensorFromSentence(output_lang, target_text))

In [None]:
class EncoderRNN(nn.Module):
    """One-token-at-a-time encoder: embedding -> dropout -> ``nn.RNN``.

    ``forward`` handles a single token per call; the caller loops over the
    sentence and threads the hidden state from one call to the next.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        """Build the layers for a vocabulary of ``input_size`` entries.

        ``hidden_size`` is used both as the embedding width and the RNN state width.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.rnn = nn.RNN(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode one token index and return ``(output, new_hidden)``."""
        # Reshape the embedding to (seq_len=1, batch=1, hidden_size) for the RNN.
        token_vector = self.embedding(input).view(1, 1, -1)
        token_vector = self.dropout(token_vector)
        rnn_output, new_hidden = self.rnn(token_vector, hidden)
        return rnn_output, new_hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size) on ``device``."""
        state_shape = (1, 1, self.hidden_size)
        return torch.zeros(*state_shape, device=device)

In [None]:
class AttnDecoderRNN(nn.Module):
    """One-token-at-a-time decoder with attention over the encoder outputs.

    Attention weights are produced by a linear layer with ``max_length``
    outputs, so ``encoder_outputs`` passed to ``forward`` must have exactly
    ``max_length`` rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        """Build the layers for an output vocabulary of ``output_size`` entries."""
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        # Scores one weight per encoder position from [embedded token ; hidden state].
        self.attn = nn.Linear(2 * hidden_size, max_length)
        # Mixes [embedded token ; attended context] back down to hidden_size.
        self.attn_combine = nn.Linear(2 * hidden_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Returns ``(log_probs, new_hidden, attn_weights)`` where ``log_probs``
        is the log-softmax over the output vocabulary for this step.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Attention distribution over the max_length encoder positions.
        attn_scores = self.attn(torch.cat((embedded[0], hidden[0]), 1))
        attn_weights = F.softmax(attn_scores, dim=1)

        # Weighted sum of the encoder outputs.
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        rnn_input = self.attn_combine(torch.cat((embedded[0], context[0]), 1))
        rnn_input = F.relu(rnn_input.unsqueeze(0))
        rnn_output, hidden = self.rnn(rnn_input, hidden)

        log_probs = F.log_softmax(self.out(rnn_output[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
        state_shape = (1, 1, self.hidden_size)
        return torch.zeros(*state_shape, device=device)

In [None]:
def validate_epoch(encoder, decoder, validation_pairs, input_lang, output_lang, criterion):
    """Score the model on ``validation_pairs`` without updating any weights.

    Decoding is free-running: the decoder's own top prediction is fed back as
    the next input, and a sentence stops at the first predicted EOS_token or
    once the target length is reached. Loss and accuracy are averaged over the
    decoding steps actually taken.

    Both models are put in eval mode for the duration and switched back to
    train mode before returning.

    Returns:
        (avg_loss, avg_acc); both are 0 when no step was scored.
    """
    encoder.eval()
    decoder.eval()

    loss_sum = 0
    n_correct = 0
    n_scored = 0

    with torch.no_grad():
        for pair in validation_pairs:
            source = tensorFromSentence(input_lang, pair[0])
            target = tensorFromSentence(output_lang, pair[1])

            n_source = source.size(0)
            n_target = target.size(0)

            # Encode token by token, collecting outputs in a fixed-size buffer.
            enc_hidden = encoder.initHidden()
            enc_outputs = torch.zeros(MAX_LENGTH, encoder.hidden_size, device=device)
            for pos in range(n_source):
                enc_out, enc_hidden = encoder(source[pos], enc_hidden)
                enc_outputs[pos] = enc_out[0, 0]

            dec_input = torch.tensor([[SOS_token]], device=device)
            dec_hidden = enc_hidden

            for step in range(n_target):
                dec_out, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_outputs)
                expected = target[step]

                loss_sum += criterion(dec_out, expected).item()

                _, best = dec_out.topk(1)
                n_correct += int(best.item() == expected.item())
                n_scored += 1

                # Feed the prediction back in; stop once the model emits EOS.
                dec_input = best.squeeze().detach()
                if dec_input.item() == EOS_token:
                    break

    encoder.train()
    decoder.train()

    if n_scored > 0:
        return loss_sum / n_scored, n_correct / n_scored
    return 0, 0

In [None]:
import random

def train_step(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) sentence pair.

    With probability 0.5 the decoder is teacher-forced (the true target token
    is fed as the next input); otherwise it is fed its own top prediction and
    decoding stops as soon as that prediction is EOS_token.

    Returns:
        (loss, accuracy), both divided by the full target length even when
        decoding stopped early.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Fixed-size buffer of encoder outputs; rows past input_length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0
    correct_tokens = 0

    for pos in range(input_length):
        step_output, encoder_hidden = encoder(input_tensor[pos], encoder_hidden)
        encoder_outputs[pos] = step_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < 0.5

    for step in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        expected = target_tensor[step]

        _, predicted = decoder_output.topk(1)
        if predicted.item() == expected.item():
            correct_tokens += 1

        loss += criterion(decoder_output, expected)

        if use_teacher_forcing:
            # Next input is the ground-truth token.
            decoder_input = expected
        else:
            # Next input is the model's own guess; stop once it emits EOS.
            decoder_input = predicted.squeeze().detach()
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length, correct_tokens / target_length

In [None]:
from tqdm import tqdm
from torch import optim

def train_specific_region_and_return_data(region_name, n_epochs=5, learning_rate=0.001, save_dir="models"):
    """Train an encoder/decoder on one region's sentence pairs and save a checkpoint.

    The region's raw pairs are normalized, filtered to sentences with fewer
    than MAX_LENGTH tokens, shuffled and split 80/20 into train and held-out
    pairs. Each epoch runs a fixed number of single-pair training steps on
    randomly drawn training pairs, then validates on the first 200 held-out pairs.

    Args:
        region_name: key into ``region_data``.
        n_epochs: number of training epochs.
        learning_rate: learning rate for both Adam optimizers.
        save_dir: directory where ``model_<region_name>.pt`` is written
            (created if missing).

    Returns:
        ``(encoder, decoder, input_lang, output_lang, test_pairs)``, or
        ``None`` when the region is unknown or has no usable training pairs.
    """
    if region_name not in region_data:
        print(f"Error: region {region_name} not found")
        return None

    # exist_ok avoids the separate existence check (and the race it implies).
    os.makedirs(save_dir, exist_ok=True)

    input_lang = Lang("eng")
    output_lang = Lang("spa")

    # NOTE(review): the vocabularies are built from every kept pair, including
    # the ones later held out for testing - confirm this is intended.
    clean_pairs = []
    for en, es in region_data[region_name]:
        clean_en = normalizeString(en)
        clean_es = normalizeString(es)
        # A sentence that normalizes to nothing (e.g. digits only) carries no
        # tokens; training on it would teach "emit EOS immediately".
        if not clean_en or not clean_es:
            continue
        if len(clean_en.split()) < MAX_LENGTH and len(clean_es.split()) < MAX_LENGTH:
            clean_pairs.append([clean_en, clean_es])
            input_lang.addSentence(clean_en)
            output_lang.addSentence(clean_es)

    random.shuffle(clean_pairs)
    val_split = int(len(clean_pairs) * 0.8)

    train_pairs = clean_pairs[:val_split]
    test_pairs = clean_pairs[val_split:]

    # random.choice below raises IndexError on an empty list; fail the same
    # way as an unknown region instead.
    if not train_pairs:
        print(f"Error: region {region_name} has no usable training pairs")
        return None

    hidden_size = 256
    encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
    decoder = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    steps_per_epoch = 5000

    for epoch in range(1, n_epochs + 1):
        epoch_loss = 0
        epoch_acc = 0

        with tqdm(total=steps_per_epoch, unit="step", desc=f"Epoch {epoch}") as pbar:
            for i in range(steps_per_epoch):
                pair = random.choice(train_pairs)
                input_tensor = tensorFromSentence(input_lang, pair[0])
                target_tensor = tensorFromSentence(output_lang, pair[1])

                loss, acc = train_step(input_tensor, target_tensor, encoder, decoder,
                                       encoder_optimizer, decoder_optimizer, criterion)
                epoch_loss += loss
                epoch_acc += acc
                # Running averages over the steps completed so far.
                pbar.set_postfix({'loss': f'{epoch_loss/(i+1):.3f}', 'acc': f'{epoch_acc/(i+1):.3f}'})
                pbar.update(1)

        # Validate after the progress bar has been closed.
        val_loss, val_acc = validate_epoch(
            encoder, decoder, test_pairs[:200], input_lang, output_lang, criterion
        )

        print(f"Epoch {epoch} | Train Loss: {epoch_loss/steps_per_epoch:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    print(f"{region_name} training completed!")

    save_path = os.path.join(save_dir, f"model_{region_name}.pt")

    # NOTE(review): the Lang instances are stored as objects, so loading this
    # checkpoint presumably needs the Lang class to be defined - confirm on
    # the loading side.
    checkpoint = {
        'encoder_state_dict': encoder.state_dict(),
        'decoder_state_dict': decoder.state_dict(),
        'input_lang': input_lang,
        'output_lang': output_lang,
        'hidden_size': hidden_size
    }

    torch.save(checkpoint, save_path)
    print(f"Model saved to: {save_path}")

    return encoder, decoder, input_lang, output_lang, test_pairs

In [None]:
import evaluate

def generate_translations(encoder, decoder, test_pairs, input_lang, output_lang):
    """Greedy-decode a translation for every pair in ``test_pairs``.

    Both models are switched to eval mode (and left there). For each pair the
    source ``pair[0]`` is encoded, then up to MAX_LENGTH tokens are decoded,
    always taking the top-scoring word and stopping at EOS_token.

    Returns:
        (sources, references, predictions): three parallel lists of strings,
        where ``references`` holds ``pair[1]`` and ``predictions`` the decoded
        words joined by single spaces.
    """
    encoder.eval()
    decoder.eval()

    sources, references, predictions = [], [], []

    with torch.no_grad():
        for pair in tqdm(test_pairs):
            src_text, ref_text = pair[0], pair[1]

            source = tensorFromSentence(input_lang, src_text)

            # Encode token by token into a fixed-size buffer.
            enc_hidden = encoder.initHidden()
            enc_outputs = torch.zeros(MAX_LENGTH, encoder.hidden_size, device=device)
            for pos in range(source.size(0)):
                enc_out, enc_hidden = encoder(source[pos], enc_hidden)
                enc_outputs[pos] = enc_out[0, 0]

            dec_input = torch.tensor([[SOS_token]], device=device)
            dec_hidden = enc_hidden

            words = []
            for _ in range(MAX_LENGTH):
                dec_out, dec_hidden, _attn = decoder(dec_input, dec_hidden, enc_outputs)
                _, best = dec_out.topk(1)
                word_id = best.item()

                if word_id == EOS_token:
                    break
                words.append(output_lang.index2word[word_id])

                # The chosen word becomes the next decoder input.
                dec_input = best.squeeze().detach()

            sources.append(src_text)
            references.append(ref_text)
            predictions.append(' '.join(words))

    return sources, references, predictions

import evaluate

# Metric objects already loaded through ``evaluate.load``, keyed by metric name.
_METRIC_CACHE = {}


def _get_metric(name):
    """Load the ``evaluate`` metric ``name`` on first use and reuse it afterwards."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(sources, references, predictions):
    """Score ``predictions`` against ``references`` with BLEU, chrF, METEOR and COMET.

    Args:
        sources: source sentences (used by COMET only).
        references: one reference translation per prediction.
        predictions: system translations.

    Returns:
        dict with keys 'BLEU', 'chrF', 'METEOR' and 'COMET'; an empty dict
        when there are no predictions to score.

    Each score is also printed. Metric objects are loaded once and cached, so
    calling this function for many regions does not reload them every time.
    """
    results = {}

    # Nothing to score: return early instead of handing empty lists to the metrics.
    if not predictions:
        print("No predictions to score")
        return results

    metric_bleu = _get_metric("sacrebleu")
    metric_chrf = _get_metric("chrf")
    metric_meteor = _get_metric("meteor")
    metric_comet = _get_metric("comet")

    # sacrebleu and chrf are given a list of references per prediction.
    formatted_refs = [[r] for r in references]

    bleu_res = metric_bleu.compute(predictions=predictions, references=formatted_refs)
    results['BLEU'] = bleu_res['score']
    print(f"BLEU: {results['BLEU']:.2f}")

    chrf_res = metric_chrf.compute(predictions=predictions, references=formatted_refs)
    results['chrF'] = chrf_res['score']
    print(f"chrF: {results['chrF']:.2f}")

    meteor_res = metric_meteor.compute(predictions=predictions, references=references)
    results['METEOR'] = meteor_res['meteor']
    print(f"METEOR: {results['METEOR']:.4f}")

    comet_res = metric_comet.compute(predictions=predictions, references=references, sources=sources)
    results['COMET'] = comet_res['mean_score']
    print(f"COMET: {results['COMET']:.4f}")

    return results

In [None]:
def indexesFromSentence(lang, sentence):
    """Translate a space-separated sentence into vocabulary indices, skipping unknown words.

    This redefines the earlier helper of the same name in this notebook, so it
    is the version in effect once this cell has run: words missing from
    ``lang.word2index`` are dropped here rather than mapped to ``UNK_token``.
    """
    vocab = lang.word2index
    return [vocab[token] for token in sentence.split(' ') if token in vocab]

def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of indices terminated by EOS_token.

    Same contract as the earlier definition in this notebook; because it is
    defined after the second ``indexesFromSentence``, running this cell rebinds
    the name again. Returns a ``torch.long`` tensor on ``device`` with one row
    per index, the last row being EOS_token.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    as_row = torch.tensor(token_ids, dtype=torch.long, device=device)
    return as_row.view(-1, 1)

In [None]:
# Regions to train and evaluate, one independent model per region.
target_regions = [
    'es-SV', 'es-PE', 'es-NI', 'es-DO', 'es-EC',
    'es-PA', 'es-PR', 'es-UY', 'es-CO', 'es-CR',
    'es-VE', 'es-AR', 'es-HN', 'es-CL'
]

# Maps region name -> dict of metric scores returned by compute_metrics.
final_results = {}

for region in target_regions:
    trained = train_specific_region_and_return_data(
        region, n_epochs=5, learning_rate=0.001
    )

    # The trainer returns None (after printing why) when it cannot train a
    # region; unpacking that would raise TypeError and abort the whole run.
    if trained is None:
        print(f"Skipping {region}: no trained model returned")
        continue

    encoder, decoder, lang_in, lang_out, test_data = trained

    # Evaluate on at most the first 200 held-out pairs.
    eval_subset = test_data[:200]

    srcs, refs, preds = generate_translations(encoder, decoder, eval_subset, lang_in, lang_out)
    scores = compute_metrics(srcs, refs, preds)

    for metric, score in scores.items():
        # Non-numeric scores are shown as 0.0.
        val = score if isinstance(score, (int, float)) else 0.0
        print(f"{metric:<10}: {val:.4f}")

    final_results[region] = scores

# Summary table of the two headline metrics per region.
print(f"{'Region':<10} {'BLEU':<10} {'COMET':<10}")
for region, scores in final_results.items():
    bleu = scores.get('BLEU', 0)
    comet = scores.get('COMET', 0)
    print(f"{region:<10} {bleu:.2f}       {comet:.4f}")
