In [5]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('eng-kir.csv', header=None, delimiter='\t')

# Function to count words in a string
def word_count(text):
    return len(str(text).split())

# Filter rows where either column has more than 10 words
df_filtered = df[(df[0].apply(word_count) <= 10)]

In [14]:
from sklearn.model_selection import train_test_split

train_df, eval_df = train_test_split(df_filtered, test_size=0.2, random_state=42)

# Save the split DataFrames if needed
train_df.to_csv('./engkir_train_en-sw.csv', index=False)
eval_df.to_csv('./engkir_eval_en-sw.csv', index=False)

print("Data split into 80% training and 20% evaluation.")

Data split into 80% training and 20% evaluation.


In [16]:
eval_df

Unnamed: 0,0,1
2158,Then the word of the Lord came to Jeremiah:,Ico ni co catumye ijambo ry’Uhoraho riza kuri ...
27909,And they stayed there a long time with the dis...,Bamarana n’abigishwa igihe kitari gito.
19909,"Then the Lord said to Aaron,","Uhoraho abarira Aroni,"
16978,“You shall not give false testimony against yo...,Ntukāgirize ibinyoma mugenzawe.
18197,Then Bildad the Shuhite replied:,"Maze Biludadi w’i Shuhi arishura,"
...,...,...
21292,"“Son of man, take up a lament concerning Tyre.","riti Nawe, mwana w’umuntu, cura intimba uririr..."
22634,"“Go, tell them to return to their tents.","Ati Genda ubabgire, uti Ni mwisubirire mu mahe..."
26602,"to deny people their rights before the Most High,",Canke kugoreka urubanza rw’ umuntu mu nyonga z...
8355,"His division numbers 53,400.",ingabo zāharūwe zabo zar’ ibihumbi mirongwitan...


In [15]:
train_df

Unnamed: 0,0,1
27201,"Mizpah, Kephirah, Mozah,","na Misipe na Kefira na Mosa,"
28861,Again his Jewish opponents picked up stones to...,"Bun’ Abayuda bongera gutora amabuye, ngo bayam..."
13830,What do workers gain from their toil?,Mbeg’ ūkora aronka inyungu ki mu vyamutamije?
783,"As for Rehabiah, from his sons: Ishiah was the...","ku vya Rehebiya, mu bana biwe Ishiya ni we yar..."
769,"The first lot fell to Jehoiarib, the second to...","Ubupfindo bga mbere bgerekana Yehoyaribu, ubga..."
...,...,...
17503,and forty silver bases—two under each frame.,na zo azicurira ibitereko vyazo mirongwine mw ...
21168,Again the word of the Lord came to me:,"Nukw ijambo ry’Uhoraho rinzako,"
10074,"in Hazar Shual, in Beersheba and its settlements,",n’i Hasari‐shuwali n’i Beri‐sheba no mu bisaga...
24073,The farmer sows the word.,Umubivyi abiba ijambo ry’Imana.


In [9]:
df_filtered.to_csv('./engkir_train_en-sw.csv', index=False)

In [17]:
from datasets import load_dataset, concatenate_datasets

dataset = load_dataset("csv", data_files='./engkir_filtered_en-sw.csv')

Generating train split: 1832 examples [00:00, 261983.12 examples/s]


In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['0', '1'],
        num_rows: 1832
    })
})

In [43]:
list_of_tuples = list(train_df.itertuples(index=False, name=None))

In [44]:
import json

data = {
    "data": [
        {"translation": {"en": src, "sw": tgt}} for src, tgt in list_of_tuples
    ]
}

In [45]:
data['data']

[{'translation': {'en': 'Mizpah, Kephirah, Mozah,',
   'sw': 'na Misipe na Kefira na Mosa,'}},
 {'translation': {'en': 'Again his Jewish opponents picked up stones to stone him,',
   'sw': 'Bun’ Abayuda bongera gutora amabuye, ngo bayamutere.'}},
 {'translation': {'en': 'What do workers gain from their toil?',
   'sw': 'Mbeg’ ūkora aronka inyungu ki mu vyamutamije?'}},
 {'translation': {'en': 'As for Rehabiah, from his sons: Ishiah was the first.',
   'sw': 'ku vya Rehebiya, mu bana biwe Ishiya ni we yar’ umukuru;'}},
 {'translation': {'en': 'The first lot fell to Jehoiarib, the second to Jedaiah,',
   'sw': 'Ubupfindo bga mbere bgerekana Yehoyaribu, ubga kabiri Yedaya;'}},
 {'translation': {'en': 'Ahinadab son of Iddo—in Mahanaim;',
   'sw': 'na Ahinadabu mwene Ido yatoza ivy’ i Mahanayimu;'}},
 {'translation': {'en': 'The Israelites left Rameses and camped at Sukkoth.',
   'sw': 'Nukw Abisirayeli ni kwo kuvuduka i Ramesesi, basagaza i Sukoti.'}},
 {'translation': {'en': 'We submitted

In [46]:
output_filename = 'translation_train_en-sw.json'

# Write to JSON file
with open(output_filename, 'w') as outfile:
    json.dump(data, outfile, ensure_ascii=False, indent=2)

In [32]:
dataset = load_dataset("json", data_files='translation_train_en-sw.json', field="data")
print(dataset)


Generating train split: 1465 examples [00:00, 211301.77 examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1465
    })
})





In [31]:
def getlangs(fn):
    pair = fn.split(".")[0].split("_")[len(fn.split(".")[0].split("_")) - 1]
    print(pair)

    return [pair.split("-")[0], pair.split("-")[1]]

In [33]:
srcl, tgtl = getlangs('translation_train_en-sw.json')

print(f"{srcl}, {tgtl}")

en-sw
en, sw


In [34]:
def preproc_func(examples):
    ins = [ex[srcl] for ex in examples['translation']]
    outs = [ex[tgtl] for ex in examples['translation']]

    print(ins)
    print(outs)

    # result = tokenizer(ins, max_length=128, padding=True, truncation=True)

    # with tokenizer.as_target_tokenizer():
    #     labels = tokenizer(outs, max_length=128, padding=True, truncation=True)

    # result['labels'] = labels['input_ids']

    # return result

ready_data = dataset['train'].map(preproc_func, batched=True, remove_columns=['translation'])

# return ready_data

Map: 100%|██████████| 1465/1465 [00:00<00:00, 266036.95 examples/s]

['na Misipe na Kefira na Mosa,', 'Bun’ Abayuda bongera gutora amabuye, ngo bayamutere.', 'Mbeg’ ūkora aronka inyungu ki mu vyamutamije?', 'ku vya Rehebiya, mu bana biwe Ishiya ni we yar’ umukuru;', 'Ubupfindo bga mbere bgerekana Yehoyaribu, ubga kabiri Yedaya;', 'na Ahinadabu mwene Ido yatoza ivy’ i Mahanayimu;', 'Nukw Abisirayeli ni kwo kuvuduka i Ramesesi, basagaza i Sukoti.', 'Twakomeye amashi Abanyegiputa, N’Abashuri kugira ngo turonke utwo kurya two kwisama.', 'Bukeye ijambo ry’Uhoraho riza kuri Salomo,', 'Abizeye bose bāba hamwe, basangira ivyabo vyose:', 'Ivyo ni vyo vyiza vyemerwa mu nyonga z’Imana Umukiza wacu,', 'na Obalu na Abimayeli na Sheba', 'n’ abantu ibihumbi mirongwitatu na bibiri vy’abakobga batigeze kumenya icitwa umugabo.', 'Uwemeye ugushinga intahe kwiwe aba ateye igikumu ngw Imana n’iy’ ukuri.', 'Muramukishe bene Data kugumbirana kwera.', 'Abaherezi bakuru bamurega vyinshi.', 'kugira ngo ndariserure nk’uko nkwiye kuvuga.', 'n’i Yeshuwa n’i Molada n’i Beti‐peleti,'




In [37]:
from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer

model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")

  return self.fget.__get__(instance, owner)()
