In [None]:
import nltk
# Fetch the NLTK resources this notebook reads: the WordNet data (plus the
# 'omw-1.4' package), the Penn Treebank sample, and the universal tagset
# mapping requested by tagged_sents(tagset='universal').
for resource in ('wordnet', 'omw-1.4', 'treebank', 'universal_tagset'):
    nltk.download(resource)

from nltk.corpus import wordnet as wn
from nltk.corpus import treebank
from collections import defaultdict
import random
import json

# Open-class POS labels mapped to the WordNet constants used to enumerate
# their synsets.
wordnet_pos_map = {
    'NOUN': wn.NOUN,
    'VERB': wn.VERB,
    'ADJ': wn.ADJ,
    'ADV': wn.ADV
}

# POS labels whose words are harvested from the tagged Treebank corpus
# instead of WordNet.
ud_pos_list = ['DET', 'PRON', 'ADP', 'CONJ', 'NUM', 'INTJ']

# POS label -> set of lower-cased candidate words.
wordnet_words = defaultdict(set)
# Maximum number of words kept per POS in the final sample.
limit_per_pos = 100

for pos_name, wn_pos in wordnet_pos_map.items():
    collected = wordnet_words[pos_name]
    for synset in wn.all_synsets(wn_pos):
        lemma_names = (
            lemma.name().replace("_", " ").lower() for lemma in synset.lemmas()
        )
        # isalpha() rejects anything containing a space, digit or hyphen, so
        # multi-word lemmas (whose underscores became spaces) are dropped.
        collected.update(name for name in lemma_names if name.isalpha())
        # Stop once the pool is twice the final sample size; the check runs
        # after each whole synset, so the pool may slightly overshoot.
        if len(collected) >= limit_per_pos * 2:
            break

# POS label -> set of lower-cased words harvested from the Treebank sample.
treebank_words = defaultdict(set)

# Two token-aligned views of the same corpus: universal tags for the labels in
# ud_pos_list, and the raw Penn Treebank tags so interjections can be found.
# NLTK's universal tagset has no 'INTJ' label, so without the raw view the
# 'INTJ' bucket would always stay empty.
tagged_sents = treebank.tagged_sents(tagset='universal')
ptb_tagged_sents = treebank.tagged_sents()

# Collect a pool twice the size of the final per-POS sample.
pool_target = limit_per_pos * 2

for universal_sent, ptb_sent in zip(tagged_sents, ptb_tagged_sents):
    for (word, tag), (_, ptb_tag) in zip(universal_sent, ptb_sent):
        # Penn Treebank marks interjections as 'UH'; relabel those as 'INTJ'
        # and keep the universal tag for everything else.
        tag = 'INTJ' if ptb_tag == 'UH' else tag.upper()
        if tag in ud_pos_list and word.isalpha():
            treebank_words[tag].add(word.lower())
    # Early exit once every requested POS has a full pool; otherwise the whole
    # corpus is scanned.
    if all(len(treebank_words[t]) >= pool_target for t in ud_pos_list):
        break

# POS label -> alphabetically sorted random sample of at most limit_per_pos
# words. No seed is set in this cell, so the sample differs between runs.
combined_pos_words = {}

# WordNet-derived tags are processed first, then the Treebank-derived ones,
# which fixes the key order of the resulting JSON object.
sampling_plan = (
    (wordnet_words, list(wordnet_words)),
    (treebank_words, ud_pos_list),
)

for source, tags in sampling_plan:
    for pos in tags:
        pool = list(source[pos])
        # A pool smaller than the limit is taken in full.
        picked = random.sample(pool, min(limit_per_pos, len(pool)))
        picked.sort()
        combined_pos_words[pos] = picked

# Persist the sampled word lists for the translation step.
with open("english_words_by_pos.json", "w", encoding="utf-8") as f:
    json.dump(combined_pos_words, f, ensure_ascii=False, indent=2)

print(f"✅ Done. File saved with up to {limit_per_pos} words per POS: english_words_by_pos.json")


In [None]:
!pip install transformers sentencepiece --quiet

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import pandas as pd
from tqdm import tqdm
import json
import os

# Load the per-POS English word lists stored in english_words_by_pos.json.
with open("english_words_by_pos.json", encoding="utf-8") as words_file:
    english_words = json.load(words_file)

# Tokenizer and seq2seq model are loaded from the same checkpoint name.
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Device argument for the pipeline: 0 when CUDA is available, otherwise -1
# (presumably the pipeline's "first GPU" / "CPU" convention — confirm against
# the transformers documentation).
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Output column label -> target language code passed to the translator.
lang_code_map = dict(
    FRA="fra_Latn",
    BAM="bam_Latn",
    WOL="wol_Latn",
)

# Single translation pipeline with a fixed English source; the target language
# is supplied per call.
translator = pipeline(
    "translation",
    model=model,
    tokenizer=tokenizer,
    src_lang="eng_Latn",
    device=device,
)

def translate_batch(batch, tgt_lang):
    """Translate a batch of strings with the module-level ``translator``.

    Translation is best-effort: if anything goes wrong, the failure is
    reported on stdout and the whole batch is filled with ``"NA"`` so the
    caller's columns keep their length.

    Args:
        batch: List of source strings sent to the pipeline in one call.
        tgt_lang: Target language code forwarded as ``tgt_lang``.

    Returns:
        A list with one string per input: the ``translation_text`` of each
        pipeline result, or ``"NA"`` for every input if the call failed.
    """
    if not batch:
        # Nothing to translate; skip the pipeline call entirely.
        return []

    try:
        translations = translator(batch, tgt_lang=tgt_lang, max_length=128)
        return [t['translation_text'] for t in translations]
    except Exception as exc:
        # Keep the run alive, but make the failure visible so a broken model
        # or language code is not mistaken for genuine "NA" output.
        print(
            f"[translate_batch] {tgt_lang}: batch of {len(batch)} failed "
            f"({type(exc).__name__}: {exc}); filling with 'NA'"
        )
        return ["NA"] * len(batch)

# Number of words sent to the translation pipeline per call.
BATCH_SIZE = 8

# One DataFrame per POS, later placed side by side in the final table.
final_blocks = []

for pos, words in english_words.items():
    print(f"\n🔵 Translating POS: {pos}")
    df = pd.DataFrame({"ENG": words})

    for lang_code, tgt_lang in lang_code_map.items():
        translations = []
        for start in tqdm(range(0, len(words), BATCH_SIZE), desc=f"{pos} → {lang_code}"):
            batch = words[start:start + BATCH_SIZE]
            translations.extend(translate_batch(batch, tgt_lang))
        df[lang_code] = translations

    # Suffix every column, ENG included, with the POS label so the columns
    # stay unique once the blocks are concatenated.
    df.columns = [f"{col}_{pos}" for col in df.columns]
    final_blocks.append(df)

# Export: the blocks are joined column-wise on their row index.
final_df = pd.concat(final_blocks, axis=1)
final_df.to_csv("multilingual_pos_dataset.csv", index=False, encoding="utf-8-sig")
print("\n✅ Final CSV saved: multilingual_pos_dataset.csv")
