In [1]:
import pandas as pd

# Read the tab-separated sentence table and the translation-link table.
# Column names are supplied explicitly through `names`.
sentences = pd.read_csv(
    "sentences.csv",
    sep="\t",
    names=["id", "lang", "text"],
)
links = pd.read_csv(
    "links.csv",
    sep="\t",
    names=["id", "translation_id"],
)

In [2]:
# English always comes first, followed by the four most frequent
# non-English language codes in the sentence table.
top_langs = ['eng'] + (
    sentences['lang']
    .value_counts()
    .drop('eng')
    .head(4)
    .index
    .tolist()
)
# Keep only the sentences written in one of the selected languages.
sent_top = sentences.loc[sentences['lang'].isin(top_langs)]


In [3]:
# English sentences form the spine of the wide table: one row per English id.
df_eng = (
    sent_top.loc[sent_top['lang'] == 'eng', ['id', 'text']]
    .rename(columns={'text': 'eng'})
)
df_translations = {}
df_wide = df_eng.copy()
for lang in top_langs[1:]:
    # Sentences of the target language, keyed so they line up with links.translation_id.
    lang_sent = (
        sent_top.loc[sent_top['lang'] == lang, ['id', 'text']]
        .rename(columns={'id': 'translation_id', 'text': lang})
    )
    # Inner join: keep only links whose translation side is a sentence in `lang`.
    # At this point links["id"] is the source sentence of any language; the
    # restriction to English ids happens in the join against df_wide below.
    links_lang = links.merge(lang_sent, on='translation_id', how='inner')

    # A source sentence may have several translations in `lang`; keep the first per id.
    links_lang = links_lang.groupby('id')[lang].first().reset_index()
    df_translations[lang] = links_lang

    # Inner join on the English id, so rows lacking a translation in `lang` are dropped.
    df_wide = df_wide.merge(links_lang, on='id', how='inner')

In [4]:
# Show the aligned sentences without the id column (df_wide itself is not modified).
df_wide.drop(columns=["id"])

Unnamed: 0,eng,rus,ita,epo,kab
0,Let's try something.,Давайте что-нибудь попробуем!,Proviamo qualcosa!,Ni provu ion!,Aha ad neɛreḍ kra.
1,I have to go to sleep.,Мне пора идти спать.,Devo andare a dormire.,Mi devas enlitiĝi.,Ilaq ad ruḥeɣ ad gneɣ.
2,Today is June 18th and it is Muiriel's birthday!,"Сегодня 18 июня, и это день рождения Мюриэл!",Oggi è il 18 giugno ed è il compleanno di Muir...,Hodiaŭ estas la dek-oka de junio kaj la naskiĝ...,Ass-a d wis 18 deg yunyu yerna d amulli n Muir...
3,"The password is ""Muiriel"".",Пароль «Muiriel».,"La parola d'accesso è ""Muiriel"".","La pasvorto estas ""Muiriel"".","Awal uffir, d ""Muriel""."
4,I will be back soon.,Я скоро вернусь.,Torno subito.,Mi baldaŭ revenos.,Ad d-uɣaleɣ zik.
...,...,...,...,...,...
9844,Go and call the guests.,Иди позови гостей.,Va' e chiama gli ospiti.,Iru voki la gastojn.,Ruḥ ssiwel-d i yinebgawen-nni.
9845,Need some help?,Нужна помощь?,Serve aiuto?,Ĉu vi bezonas helpon?,Tesriḍ i kra n tallelt?
9846,Wikipedia is a free encyclopedia.,Википедия - свободная энциклопедия.,Wikipedia è un'enciclopedia libera.,Vikipedio estas libera enciklopedio.,"Wikipedia, d tasanayt tilellit."
9847,Some think teaching is an easy profession.,"Некоторые думают, что преподавание - лёгкая пр...",Certuni credono che insegnare sia una professi...,"Iuj opinias, ke instrui estas facila profesio.",Kra ttɣilin yeshel ad tiliḍ d aselmad.


In [14]:
import numpy as np

In [29]:
import random
def create_multilingual_dataset(df, num_segments = 3, num_samples=100, seed=None):
    """Build language-tagged sequences from a wide translation table.

    For every example one random row of ``df`` is picked, and ``num_segments``
    segments are produced from that same row. Each segment takes the row's
    text in a randomly chosen language column and wraps it as
    ``<lang> text </lang>``. The segments are concatenated without a separator.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table of aligned sentences. The first column is skipped; every
        remaining column name is treated as a language code.
    num_segments : int, default 3
        Number of tagged segments per example. Languages are drawn with
        replacement, so the same language can appear more than once.
    num_samples : int, default 100
        Number of examples to generate.
    seed : int or None, default None
        When given, a private ``numpy.random.RandomState(seed)`` is used so the
        output is reproducible. When ``None`` the global ``numpy.random`` state
        is used, as before.

    Returns
    -------
    pandas.Series
        Series named ``"sequence"`` holding ``num_samples`` strings.

    Raises
    ------
    ValueError
        If examples are requested but ``df`` has no rows.
    """
    # Take the language columns from the frame that was passed in. Reading the
    # notebook-level df_wide here made the function fail on a fresh kernel and
    # use the wrong columns for any other frame.
    langs = df.columns.to_list()[1:]
    if num_samples > 0 and len(df) == 0:
        raise ValueError("df has no rows to sample from")

    # np.random and RandomState expose the same randint/choice API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    ds = []
    for _ in range(num_samples):
        # One row per example: all of its segments render the same row.
        row = df.iloc[rng.randint(0, len(df))]
        segments = []
        for _ in range(num_segments):
            # Randomly choose a language and wrap the text in language-specific tags.
            lang = rng.choice(langs)
            segments.append(f"<{lang}> {row[lang]} </{lang}>")
        # Concatenate the segments to form a single example.
        ds.append("".join(segments))
    return pd.Series(ds, name="sequence")

In [30]:
# Generate 100 three-segment examples and display the first one.
create_multilingual_dataset(df_wide, num_segments=3, num_samples=100).iloc[0]

'<epo> Tomo scias, ke vi mensogas. </epo><kab> Yeẓṛa Tom belli teskiddibeḍ. </kab><ita> Tom sa che stai mentendo. </ita>'