In [4]:
import pandas as pd
import random
import numpy as np

# Read the two tab-separated tables. Column names are supplied explicitly,
# so the first row of each file is read as data rather than as a header.
sentences = pd.read_csv(
    "sentences.csv",
    sep="\t",
    names=["id", "lang", "text"],
)
links = pd.read_csv(
    "links.csv",
    sep="\t",
    names=["id", "translation_id"],
)

In [30]:
# Keep English plus the most frequent other languages (up to num_top_langs of them).
# English is placed first in top_langs regardless of its frequency rank.
num_top_langs = 24
top_langs = sentences['lang'].value_counts().drop('eng').head(num_top_langs).index.tolist()
top_langs.insert(0, 'eng')

# Only keep sentences that are strictly longer than min_chars characters.
min_chars = 40
sent_top = sentences[(sentences['lang'].isin(top_langs)) & (sentences['text'].str.len() > min_chars)]

# Group the filtered frame once instead of re-scanning it for every language.
# Keys follow the order of top_langs, and a language with no sufficiently long
# sentence still gets an (empty) list, so the mapping always covers top_langs.
texts_by_lang = {lang: texts.tolist() for lang, texts in sent_top.groupby('lang', sort=False)['text']}
lang_groups = {lang: texts_by_lang.get(lang, []) for lang in top_langs}
sent_top

Unnamed: 0,id,lang,text
78,81,deu,Heute ist der 18. Juni und das ist der Geburts...
79,82,deu,"Herzlichen Glückwunsch zum Geburtstag, Muiriel!"
85,89,deu,"Ich weiß einfach nicht, was ich sagen soll."
89,94,deu,Aus irgendeinem Grund hat das Mikrofon gerade ...
91,96,deu,Die Ausbildung in dieser Welt enttäuscht mich.
...,...,...,...
12647237,13127532,eng,"I told them they'd be miserable, too, if they ..."
12647238,13127533,eng,"Don't call Tom ""Mary."" That's not his name any..."
12647239,13127534,eng,I hate to be the one to say I told you so.
12647258,13127554,epo,Neniu estas perfekta. Ĉies postaĵo ja havas fe...


In [29]:
def split_sentence(sentence):
    """Split a sentence into two halves.

    Sentences with at least two whitespace-delimited words are split at the
    word level: the first ``len(words) // 2`` words form the first half and
    the remaining words form the second half. Words are re-joined with single
    spaces, so runs of whitespace are collapsed.

    Sentences with zero or one whitespace-delimited token (for example text in
    a script that does not separate words with spaces) are split at the
    character level instead, so the first half is not left empty.

    Args:
        sentence: The text to split.

    Returns:
        A ``(first_half, second_half)`` tuple of strings.
    """
    words = sentence.split()
    if len(words) >= 2:
        mid = len(words) // 2
        return " ".join(words[:mid]), " ".join(words[mid:])

    # A word-level split would put the whole token in the second half and
    # leave the first half empty, so fall back to splitting by characters.
    token = words[0] if words else ""
    mid = len(token) // 2
    return token[:mid], token[mid:]

In [19]:
def create_multilingual_dataset(lang_groups, num_segments=3, num_samples=100, needle_pos=0, seed=None):
    """Build synthetic multilingual "needle in a haystack" sequences.

    Every example is the concatenation (with no separator) of ``num_segments``
    language-tagged segments of the form ``<lang> text </lang>``, followed by a
    query. For each segment a language and then a sentence are drawn at random.
    The segment at index ``needle_pos`` contains only the first half of its
    sentence (as returned by ``split_sentence``); the second half, wrapped in
    the same language tags, is appended after the haystack as the query.

    Args:
        lang_groups: Mapping from language code to a sequence of sentences.
            Languages whose sequence is empty are never sampled.
        num_segments: Number of tagged segments per example.
        num_samples: Number of examples to generate.
        needle_pos: Zero-based index of the segment that is split. If it does
            not match any index in ``range(num_segments)``, no segment is
            split and the query part is empty.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the result is reproducible; when ``None`` the global
            ``random`` module state is used.

    Returns:
        A ``pandas.Series`` named ``"sequence"`` holding ``num_samples`` strings.

    Raises:
        IndexError: If a sample has to be drawn but no language has any
            sentence available.
    """
    rng = random if seed is None else random.Random(seed)
    # Drawing from a language with no sentences would make the sentence draw
    # fail, so only languages that actually have sentences are candidates.
    available_langs = [lang for lang, texts in lang_groups.items() if len(texts) > 0]

    ds = []
    for _ in range(num_samples):
        segments = []
        needle_query = ""
        for i in range(num_segments):
            lang = rng.choice(available_langs)
            sentence = rng.choice(lang_groups[lang])
            if i == needle_pos:
                # The haystack keeps the first half; the second half becomes the query.
                a, b = split_sentence(sentence)
                segment = f"<{lang}> {a} </{lang}>"
                needle_query = f"<{lang}> {b} </{lang}>"
            else:
                segment = f"<{lang}> {sentence} </{lang}>"
            segments.append(segment)
        haystack = "".join(segments)
        ds.append(haystack + needle_query)

    return pd.Series(ds, name="sequence")

In [25]:
# Generate 100 three-segment examples with the needle in the first segment
# and display the first one.
create_multilingual_dataset(
    lang_groups,
    num_segments=3,
    num_samples=100,
    needle_pos=0,
).iloc[0]

"<eng> Oklahoma's panhandle used to </eng><rus> Том сказал мне, что он хочет это сделать. </rus><kab> Amek ihi ara ad naru tilelli s tbengalit? </kab><eng> be part of Texas. </eng>"