In [1]:
import nltk
from nltk.translate import AlignedSent, IBMModel1, bleu_score
from nltk.tokenize import word_tokenize
import os
import random
import numpy as np
from collections import defaultdict
# import pickle
import dill as pickle

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

In [2]:
data_path = 'data/clean'

def load_data(lang_dir):
    """Loads and tokenizes data from a language directory."""
    en_path = os.path.join(data_path, lang_dir, 'all.en')
    es_path = os.path.join(data_path, lang_dir, 'all.es')
    
    if not os.path.exists(en_path) or not os.path.exists(es_path):
        return None
        
    with open(en_path, 'r', encoding='utf-8') as f:
        en_lines = f.readlines()
    with open(es_path, 'r', encoding='utf-8') as f:
        es_lines = f.readlines()
        
    # Basic tokenization
    en_tokens = [word_tokenize(line.strip().lower()) for line in en_lines]
    es_tokens = [word_tokenize(line.strip().lower()) for line in es_lines]
    
    # Filter out empty lines
    data = []
    for en, es in zip(en_tokens, es_tokens):
        if en and es:
            data.append((en, es))
            
    return data

In [None]:
def train(train_data, iterations=10):
    """Trains IBM Model 1 on the given training data.
    
    Args:
        train_data: List of (source_tokens, target_tokens) tuples
        iterations: Number of EM iterations (default: 10)
    
    Returns:
        Trained IBMModel1 instance
    """
    aligned_corpus = [AlignedSent(target, source) for source, target in train_data]
    ibm1 = IBMModel1(aligned_corpus, iterations)

    if isinstance(ibm1.translation_table, defaultdict):
        ibm1.translation_table.default_factory = None
        for v in ibm1.translation_table.values():
            if isinstance(v, defaultdict):
                v.default_factory = None
    
    return ibm1

def build_translation_dict(model):
    """Builds a translation dictionary from the trained model.
    
    Args:
        model: Trained IBMModel1 instance
    
    Returns:
        Dictionary mapping source words to target words
    """
    translation_dict = {}
    s_to_t_probs = defaultdict(list)
    
    # Extract translation probabilities
    for t in model.translation_table:
        for s in model.translation_table[t]:
            prob = model.translation_table[t][s]
            if prob > 1e-6:
                s_to_t_probs[s].append((t, prob))
    
    # For each source word, pick the target word with highest probability
    for s in s_to_t_probs:
        best_t = sorted(s_to_t_probs[s], key=lambda x: x[1], reverse=True)[0][0]
        translation_dict[s] = best_t
    
    return translation_dict


def translate(model, source_tokens):
    """Translates source tokens using the trained model.
    
    Args:
        model: Trained IBMModel1 instance
        source_tokens: List of source language tokens
    
    Returns:
        List of target language tokens
    """
    translation_dict = build_translation_dict(model)
    translated = []
    
    for word in source_tokens:
        if word in translation_dict:
            translated.append(translation_dict[word])
        else:
            translated.append(word)  # Keep original if no translation found
    
    return translated


def evaluate(model, test_data):
    """Evaluates the model on test data using BLEU score.
    
    Args:
        model: Trained IBMModel1 instance
        test_data: List of (source_tokens, target_tokens) tuples
    
    Returns:
        BLEU score (float)
    """
    references = []
    hypotheses = []
    
    for source, target in test_data:
        translated = translate(model, source)
        references.append([target])
        hypotheses.append(translated)
    
    score = bleu_score.corpus_bleu(references, hypotheses)
    return score


def save_model(model, model_path):
    """Saves the trained model to disk using pickle.
    
    Args:
        model: Trained IBMModel1 instance
        model_path: Path where to save the model
    """
    # Create directory if it doesn't exist
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model saved to {model_path}")


In [8]:
def train_and_evaluate(lang_dir, save_model_flag=False):
    """Convenience function to train and evaluate a model for a language directory.
    
    Args:
        lang_dir: Language directory name
        save_model_flag: Whether to save the trained model (default: False)
    
    Returns:
        BLEU score (float) or None if data not found
    """
    print(f"Processing {lang_dir}...")
    data = load_data(lang_dir)
    if not data:
        print(f"Skipping {lang_dir} (files not found or empty)")
        return None
        
    # Split data
    random.seed(42)
    random.shuffle(data)
    split_idx = int(len(data) * 0.8)
    train_data = data[:split_idx]
    test_data = data[split_idx:]
    
    print(f"  Training on {len(train_data)} sentences...")
    model = train(train_data, iterations=10)
    
    # Save model if requested
    if save_model_flag:
        model_path = os.path.join('models', f'{lang_dir}_ibm1.pkl')
        save_model(model, model_path)
    
    # print("  Evaluating...")
    # score = evaluate(model, test_data)
    # print(f"  BLEU Score: {score:.4f}")
    return # score


# Run for all datasets
subdirs = [d for d in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, d))]
results = {}

for d in sorted(subdirs):
    score = train_and_evaluate(d, True)
    # if score is not None:
        # results[d] = score
        
print("\nFinal Results:")
# for lang, score in results.items():
    # print(f"{lang}: {score:.4f}")

Processing es-AR...


  Training on 115077 sentences...
Model saved to models/es-AR_ibm1.pkl
Processing es-CL...
  Training on 115077 sentences...
Model saved to models/es-CL_ibm1.pkl
Processing es-CO...
  Training on 115076 sentences...
Model saved to models/es-CO_ibm1.pkl
Processing es-CR...
  Training on 115077 sentences...
Model saved to models/es-CR_ibm1.pkl
Processing es-DO...
  Training on 115077 sentences...
Model saved to models/es-DO_ibm1.pkl
Processing es-EC...
  Training on 115076 sentences...
Model saved to models/es-EC_ibm1.pkl
Processing es-HN...
  Training on 115077 sentences...
Model saved to models/es-HN_ibm1.pkl
Processing es-NI...
  Training on 115077 sentences...
Model saved to models/es-NI_ibm1.pkl
Processing es-PA...
  Training on 115077 sentences...
Model saved to models/es-PA_ibm1.pkl
Processing es-PE...
  Training on 115076 sentences...
Model saved to models/es-PE_ibm1.pkl
Processing es-PR...
  Training on 115076 sentences...
Model saved to models/es-PR_ibm1.pkl
Processing es-SV...

TimeoutError: [Errno 60] Operation timed out

In [None]:
results