In [1]:
import nltk
from nltk.translate import AlignedSent, IBMModel1, bleu_score
from nltk.tokenize import word_tokenize
import os
import random
import numpy as np
from collections import defaultdict
import torch

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

In [2]:
data_path = 'data/clean'

def load_data(lang_dir):
    """Loads and tokenizes data from a language directory."""
    en_path = os.path.join(data_path, lang_dir, 'all.en')
    es_path = os.path.join(data_path, lang_dir, 'all.es')
    
    if not os.path.exists(en_path) or not os.path.exists(es_path):
        return None
        
    with open(en_path, 'r', encoding='utf-8') as f:
        en_lines = f.readlines()
    with open(es_path, 'r', encoding='utf-8') as f:
        es_lines = f.readlines()
        
    # Basic tokenization
    en_tokens = [word_tokenize(line.strip().lower()) for line in en_lines]
    es_tokens = [word_tokenize(line.strip().lower()) for line in es_lines]
    
    # Filter out empty lines
    data = []
    for en, es in zip(en_tokens, es_tokens):
        if en and es:
            data.append((en, es))
            
    return data

In [3]:
def train_and_evaluate(lang_dir):
    print(f"Processing {lang_dir}...")
    data = load_data(lang_dir)
    if not data:
        print(f"Skipping {lang_dir} (files not found or empty)")
        return None
        
    # Split data
    random.seed(42)
    random.shuffle(data)
    split_idx = int(len(data) * 0.8)
    train_data = data[:split_idx]
    test_data = data[split_idx:]
    
    print(f"  Training on {len(train_data)} sentences...")
    aligned_corpus = [AlignedSent(es, en) for en, es in train_data]
    
    # Train IBM Model 1
    ibm1 = IBMModel1(aligned_corpus, 10)

    # Clean model for saving (remove lambdas)
    if isinstance(ibm1.translation_table, defaultdict):
        ibm1.translation_table.default_factory = None
        for v in ibm1.translation_table.values():
            if isinstance(v, defaultdict):
                v.default_factory = None

    # Save model
    model_path = os.path.join('saved_models', f'{lang_dir}_ibm1.pt')
    torch.save(ibm1, model_path)
    print(f"  Model saved to {model_path}")

    translation_dict = {}

    src_vocab = set()
    for en, _ in train_data:
        src_vocab.update(en)
    
    s_to_t_probs = defaultdict(list)
    
    for t in ibm1.translation_table:
        for s in ibm1.translation_table[t]:
            prob = ibm1.translation_table[t][s]
            if prob > 1e-6:
                s_to_t_probs[s].append((t, prob))
                
    for s in s_to_t_probs:
        # Sort by prob descending
        best_t = sorted(s_to_t_probs[s], key=lambda x: x[1], reverse=True)[0][0]
        translation_dict[s] = best_t
    
    print("  Evaluating...")
    references = []
    hypotheses = []
    
    for en, es in test_data:
        trans = []
        for word in en:
            if word in translation_dict:
                trans.append(translation_dict[word])
            else:
                trans.append(word)
        
        references.append([es])
        hypotheses.append(trans)
        
    score = bleu_score.corpus_bleu(references, hypotheses)
    print(f"  BLEU Score: {score:.4f}")
    return score

In [4]:
# Run for all datasets
subdirs = [d for d in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, d))]
results = {}

for d in sorted(subdirs):
    score = train_and_evaluate(d)
    if score is not None:
        results[d] = score
        
print("\nFinal Results:")
for lang, score in results.items():
    print(f"{lang}: {score:.4f}")

Processing es-AR...
  Training on 115077 sentences...


AttributeError: Can't get local object 'IBMModel.reset_probabilities.<locals>.<lambda>'

In [7]:
results

{'es-AR': 0.1366027570679373,
 'es-CL': 0.13660496465381294,
 'es-CO': 0.13511849107394103,
 'es-CR': 0.13659905452238438,
 'es-DO': 0.13661045964864169,
 'es-EC': 0.13511849107394103,
 'es-HN': 0.13660496465381294,
 'es-NI': 0.13660496465381294,
 'es-PA': 0.13660496465381294,
 'es-PE': 0.13511849107394103,
 'es-PR': 0.13511849107394103,
 'es-SV': 0.13511849107394103,
 'es-UY': 0.13659001270328908,
 'es-VE': 0.13511849107394103}