In [1]:
import json
import gzip

In [8]:
for lang in ["cs", "de", "en", "es", "fi", "fr", "it", "nl", "pl", "pt", "sv"]:
    data = json.load(gzip.open(f'data/{lang}.json.gz', 'rt'))
    print(lang, len(data))

cs 67017
de 67017
en 67017
es 67017
fi 67017
fr 67017
it 67017
nl 67017
pl 67017
pt 67017
sv 67017


In [7]:
data = json.load(gzip.open(f'bactrian/cs.json.gz', 'rt'))

In [2]:
for lang in ["cs", "fi", "bg"]:
    data = json.load(open(f'alpaca/alpaca_data_cleaned.{lang}.json'))
    print(lang, len(data))

cs 51760
fi 51760
bg 51760


In [2]:
import pandas as pd

In [3]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json
import pandas as pd
import gzip

def json_to_parquet(json_path, parquet_path):
    # Load the JSON data
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Create a DataFrame
    df = pd.DataFrame(data, columns=["instruction", "input", "output"]).sample(frac=0.3)

    # Save as Parquet
    df.to_parquet(parquet_path, index=False)


In [3]:
for lang in ['bg', 'cs', 'fi']:
    json_to_parquet(f'/scratch/cs/small_lm/sft/alpaca/alpaca_data_cleaned.{lang}.json',
                    f'/scratch/cs/small_lm/sft/processed/alpaca_{lang}.parquet')

In [5]:
import os
import pyarrow.parquet as pq

def count_rows_in_parquet_files(folder_path):
    print(f"{'File':<40} {'Rows':>10}")
    print("-" * 52)
    
    for filename in os.listdir(folder_path):
        if filename.endswith(".parquet"):
            file_path = os.path.join(folder_path, filename)
            try:
                metadata = pq.ParquetFile(file_path).metadata
                num_rows = metadata.num_rows
                print(f"{filename:<40} {num_rows:>10}")
            except Exception as e:
                print(f"{filename:<40} ERROR: {e}")

In [6]:
count_rows_in_parquet_files('/scratch/cs/small_lm/sft/processed')

File                                           Rows
----------------------------------------------------
bactrian_fr.parquet                           10053
bactrian_pl.parquet                           10053
bactrian_pt.parquet                           10053
bactrian_de.parquet                           10053
alpaca_bg.parquet                             15528
alpaca_fi.parquet                             15528
bactrian_sv.parquet                           10053
bactrian_fi.parquet                           10053
bactrian_nl.parquet                           10053
bactrian_en.parquet                           10053
bactrian_it.parquet                           10053
bactrian_es.parquet                           10053
alpaca_cs.parquet                             15528
bactrian_cs.parquet                           10053


In [3]:
def json_gz_to_parquet(json_gz_path, parquet_path):
    # Load JSON data from a gzip file
    with gzip.open(json_gz_path, 'rt', encoding='utf-8') as f:
        data = json.load(f)

    data = [{k: d[k] for k in ['instruction', 'input', 'output']} for d in data]

    df = pd.DataFrame(data).sample(frac=0.15)

    # Save as Parquet
    df.to_parquet(parquet_path, index=False)

In [4]:
for lang in ["cs", "de", "en", "es", "fi", "fr", "it", "nl", "pl", "pt", "sv"]:
    json_gz_to_parquet(f'/scratch/cs/small_lm/sft/bactrian/{lang}.json.gz',
                    f'/scratch/cs/small_lm/sft/processed/bactrian_{lang}.parquet')

In [10]:
df = pd.read_parquet('/scratch/cs/small_lm/sft/aya_collection_language_split/bulgarian/validation-00000-of-00001.parquet')

In [7]:
df = pd.read_parquet('processed/aya_bg.parquet')

In [16]:
instructions = df['instruction'].values

In [10]:
from transformers import AutoTokenizer


tokenizer_path = "/scratch/cs/small_lm/ConvertedTokenizer"  # Replace with your actual path
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
lens = []
for i in instructions:
    lens.append(len(tokenizer.tokenize(i)))

Token indices sequence length is longer than the specified maximum sequence length for this model (11024 > 2048). Running this sequence through the model will result in indexing errors


In [19]:
df['tokens_inst'] = df['instruction'].apply(lambda x: len(tokenizer.tokenize(x)))

In [20]:
df['tokens_out'] = df['output'].apply(lambda x: len(tokenizer.tokenize(x)))

In [21]:
df['sum'] = df['tokens_out'] + df['tokens_inst']

In [26]:
df['sum'].min()

9

In [29]:
df[df['sum'] > 2000]

Unnamed: 0,instruction,output,input,tokens_inst,tokens_out,sum
2708,Имам въпрос за теб: No. No. No. в сезона Назва...,В заключение се разкрива вторият член на екипа...,,11024,30,11054
4054,"Колежи, които преразглеждат стойността на помо...","Първо. Вашингтон, окръг Колумбия Второ. Универ...",,2087,207,2294
4299,"Напиши статия, базирана на това резюме: Разбе...","Бадминтонът, подобно на тениса, е спорт с раке...",,70,2138,2208
4394,Да обобщим тази статия: Кранкбайтовете са най...,"Риба с примамки, когато искаш да покриеш много...",,2063,90,2153
36194,Като се имат предвид тези параграфи за Големит...,Около 2018 г.,,2417,9,2426


In [13]:
import numpy as np

In [15]:
np.mean(lens)

73.285

In [1]:
import pandas as pd
from transformers import AutoTokenizer
from tqdm import tqdm
from fasttext.FastText import _FastText
from datatrove.io import cached_asset_path_or_download

  from .autonotebook import tqdm as notebook_tqdm


In [21]:

langs = ['it', 'pl', 'fi', 'es', 'pt', 'fr', 'sv'] #'bg', 'cs', 'nl', 'de', 'en', 'el', 

for LANG in langs:
    print(LANG)
    model_file = cached_asset_path_or_download(
        "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin",
        namespace="lid",
        subfolder="ft176",
        desc="fast-text language identifier model",
    )
    model = _FastText(model_file)

    # Config
    parquet_path = f"/scratch/cs/small_lm/sft/processed/aya_{LANG}.parquet"  # replace with your actual file
    tokenizer_path = "/scratch/cs/small_lm/ConvertedTokenizer"  # path to tokenizer folder
    max_tokens = 2000

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # Special tokens (assumes already added in tokenizer config)
    BOS = "<|start_of_sequence|>"
    EOS = "<|end_of_sequence|>"
    IM_START = "<|im_start|>"
    IM_END = "<|im_end|>"

    # Load parquet
    df = pd.read_parquet(parquet_path)

    # Fill NaNs with empty strings for safety
    df = df.fillna("")

    # Construct the prompt format
    def format_row(row):
        instruction_part = row["instruction"]
        input_part = row["input"]
        output_part = row["output"]
        
        if input_part:
            formatted = (
                f"{BOS}{IM_START}\n"
                f"{instruction_part}\n{input_part}\n"
                f"{IM_END}\n"
                f"{output_part}"
                f"{EOS}"
            )
        else:
            formatted = (
                f"{BOS}{IM_START}\n"
                f"{instruction_part}\n"
                f"{IM_END}\n"
                f"{output_part}"
                f"{EOS}"
            )
        return formatted

    # Apply formatting
    tqdm.pandas(desc="Formatting rows")
    df["text"] = df.progress_apply(format_row, axis=1)

    # Tokenize and count tokens
    print("Before preprocessing:", str(df.shape[0]))
    tqdm.pandas(desc="Counting tokens")
    df["num_tokens"] = df["text"].progress_apply(lambda x: len(tokenizer.encode(x)))
    df = df[df['num_tokens'] < 2000]

    # Filter not full instructions
    df.drop(df[(df['dataset_name'].isin(['Xlel_wd-inst (T)', 'HotpotQA (T)'])) & (df['instruction'].str.endswith(':'))].index, inplace=True)

    # Bad quality data
    df = df[df['dataset_name'] != 'NQ-Open (T)']

    df['lang'] = df['instruction'].apply(lambda x: model.predict(x.replace('\n', ' '))[0][0])
    df = df[df.lang == f'__label__{LANG}']


    # # Drop the num_tokens column if not needed
    df = df.drop(columns=["num_tokens", "lang", "dataset_name"])
    print("After preprocessing:", str(df.shape[0]))
    df.to_parquet(f"/scratch/cs/small_lm/sft/cleaned/aya_{LANG}.parquet" )

# # Save result
# output_path = parquet_path.replace(".parquet", "_formatted.parquet")
# df.to_parquet(output_path, index=False)

# print(f"Processed and saved to {output_path}, rows remaining: {len(df)}")


it


Formatting rows: 100%|██████████| 77817/77817 [00:00<00:00, 95817.23it/s] 


Before preprocessing: 77817


Counting tokens:   3%|▎         | 2593/77817 [00:03<00:47, 1584.84it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (9341 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 77817/77817 [00:25<00:00, 3097.92it/s]


After preprocessing: 72809
pl


Formatting rows: 100%|██████████| 76830/76830 [00:00<00:00, 87866.05it/s] 


Before preprocessing: 76830


Counting tokens:   3%|▎         | 2540/76830 [00:03<00:48, 1541.45it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (9809 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 76830/76830 [00:22<00:00, 3454.83it/s]


After preprocessing: 72360
fi


Formatting rows: 100%|██████████| 78800/78800 [00:00<00:00, 99678.31it/s] 


Before preprocessing: 78800


Counting tokens:   3%|▎         | 2258/78800 [00:03<01:37, 783.78it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2355 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 78800/78800 [00:23<00:00, 3422.54it/s]


After preprocessing: 73797
es


Formatting rows: 100%|██████████| 77458/77458 [00:00<00:00, 95423.87it/s] 


Before preprocessing: 77458


Counting tokens:   3%|▎         | 2378/77458 [00:03<01:08, 1099.03it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (9286 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 77458/77458 [00:24<00:00, 3134.87it/s]


After preprocessing: 72909
pt


Formatting rows: 100%|██████████| 75722/75722 [00:00<00:00, 99950.37it/s] 


Before preprocessing: 75722


Counting tokens:   4%|▎         | 2697/75722 [00:03<00:41, 1767.59it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (9531 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 75722/75722 [00:23<00:00, 3235.08it/s]


After preprocessing: 70785
fr


Formatting rows: 100%|██████████| 85702/85702 [00:00<00:00, 98387.78it/s] 


Before preprocessing: 85702


Counting tokens:   1%|          | 503/85702 [00:00<00:31, 2678.28it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (9681 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 85702/85702 [00:45<00:00, 1888.05it/s]


After preprocessing: 79447
sv


Formatting rows: 100%|██████████| 72653/72653 [00:00<00:00, 94840.32it/s] 


Before preprocessing: 72653


Counting tokens:   3%|▎         | 2221/72653 [00:03<01:42, 687.01it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2111 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 72653/72653 [00:22<00:00, 3266.29it/s]


After preprocessing: 68110


In [122]:
df = pd.read_parquet('/scratch/cs/small_lm/sft/processed/alpaca_bg.parquet')

In [17]:
df.iloc[11588]['output']

'Ето един от начините да обърнете реда на масив, използвайки JavaScript: \n\n?js\nconst arr [3, 2, 1];\nconst reversedArr и arr.reverse();\n\nconsole.log(reversedArr); // [1, 2, 3]\n. . .'

In [19]:
df[df.instruction.str.contains('код')]

Unnamed: 0,instruction,input,output,text
2,Рефактор на следния код.,"num_list [1, 2, 3, 4, 5]\nза брой в num_list:\...","Ето един от начините да рефакторирате кода, за...",<|start_of_sequence|><|im_start|>\nРефактор на...
6,Напишете код за съхранение и достъп до стойнос...,,"Ето един пример за Python код, който изпълнява...",<|start_of_sequence|><|im_start|>\nНапишете ко...
23,Как можем да вдъхновим повече хора да станат е...,,За да вдъхновим хората да станат експерти по к...,<|start_of_sequence|><|im_start|>\nКак можем д...
88,Създаване на кодекс за поведение за младежки с...,,Кодексът за поведение очертава стандартите на ...,<|start_of_sequence|><|im_start|>\nСъздаване н...
155,Създайте предизвикателство за кодиране за прог...,,Козилно предизвикателство: Последователност на...,<|start_of_sequence|><|im_start|>\nСъздайте пр...
...,...,...,...,...
15332,"Генерирайте код, за да добавите фонов градиент...",,"Ето примерен код, който добавя линеен фонов гр...",<|start_of_sequence|><|im_start|>\nГенерирайте...
15363,Пренаписване на кода без използване на за цикъл,"списък 1 [1, 2, 3]\nсписък 2 и [ ]\nза i в спи...",Можете да изпълните същата задача по по-кратък...,<|start_of_sequence|><|im_start|>\nПренаписван...
15431,Пренаписване на следната част от кода в по-чет...,ако (i?? ? ? ? ? 1-y;if(z)x? 3;else x ? 4;,Ето една по-четлива версия на предоставения ко...,<|start_of_sequence|><|im_start|>\nПренаписван...
15447,Разработване на обучителна програма за препода...,,Добре дошли в нашата програма за обучение по к...,<|start_of_sequence|><|im_start|>\nРазработван...


In [None]:

LANG = 'cs'

model_file = cached_asset_path_or_download(
    "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin",
    namespace="lid",
    subfolder="ft176",
    desc="fast-text language identifier model",
)
model = _FastText(model_file)

# Config
parquet_path = f"/scratch/cs/small_lm/sft/processed/alpaca_{LANG}.parquet"  # replace with your actual file
tokenizer_path = "/scratch/cs/small_lm/ConvertedTokenizer"  # path to tokenizer folder
max_tokens = 2000

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Special tokens (assumes already added in tokenizer config)
BOS = "<|start_of_sequence|>"
EOS = "<|end_of_sequence|>"
IM_START = "<|im_start|>"
IM_END = "<|im_end|>"

# Load parquet
df = pd.read_parquet(parquet_path)

# Fill NaNs with empty strings for safety
df = df.fillna("")

# Construct the prompt format
def format_row(row):
    instruction_part = row["instruction"]
    input_part = row["input"]
    output_part = row["output"]
    
    if input_part:
        formatted = (
            f"{BOS}{IM_START}\n"
            f"{instruction_part}\n{input_part}\n"
            f"{IM_END}\n"
            f"{output_part}"
            f"{EOS}"
        )
    else:
        formatted = (
            f"{BOS}{IM_START}\n"
            f"{instruction_part}\n"
            f"{IM_END}\n"
            f"{output_part}"
            f"{EOS}"
        )
    return formatted


# Apply formatting
tqdm.pandas(desc="Formatting rows")
df["text"] = df.progress_apply(format_row, axis=1)

# Tokenize and count tokens
print("Before preprocessing:", str(df.shape[0]))
tqdm.pandas(desc="Counting tokens")
df["num_tokens"] = df["text"].progress_apply(lambda x: len(tokenizer.encode(x)))
df = df[df['num_tokens'] < 2000]
df = df.drop(columns=["num_tokens"])
df = df[~df.instruction.str.contains('kód')]
df.to_parquet(f"/scratch/cs/small_lm/sft/cleaned/alpaca_{LANG}.parquet" )
print("After preprocessing:", str(df.shape[0]))


Formatting rows: 100%|██████████| 15528/15528 [00:00<00:00, 86570.85it/s]


Before preprocessing: 15528


Counting tokens:  86%|████████▌ | 13311/15528 [00:07<00:01, 1974.69it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2338 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 15528/15528 [00:08<00:00, 1734.62it/s]


After preprocessing: 15373


In [38]:
print(df.loc[15155]['output'])

„“
def kalkulačka (num1, num2, obsluha):
    pokud operátor == „+“:
        zpětné číslice 1 + num2
    operátor elif == „-“:
        zpětné ot.:1 – num2
    operátor elif == „*“:
        zpětné ot.:1 * num2
    operátor elifu == „/“:
        návratová num1 / num2
    další:
        návrat Žádný

# Příklad:
print(kalkulační kalkulačka(5, 2,*))) # 10
„“
Toto je základní funkce kalkulačky v jazyce Python. Můžete mu předat dvě čísla a operátor a vrátí se mu výsledek výpočtu. Podporovanými operátory jsou +, -, * a /. Pokud je předán nepodporovaný operátor, funkce se vrátí None.


In [36]:
df[df.instruction.str.contains('kód')]

Unnamed: 0,instruction,input,output,text
8,"Vytvořte kód, který vytiskne 10x10 násobící ta...",,"Zde je kód Pythonu, který může generovat násob...",<|start_of_sequence|><|im_start|>\nVytvořte kó...
59,Vytvořte unikátní pětimístný kód.,,Zde je unikátní pětimístný kód: 87349,<|start_of_sequence|><|im_start|>\nVytvořte un...
207,Generuje se vzorek platného kódu HTML.,,Zde je ukázka platného HTML kódu: \n\n„`html\n...,<|start_of_sequence|><|im_start|>\nGeneruje se...
415,Generovat úryvek kódu pro třídění pole ve vzes...,,"Jasně! Zde je úryvek kódu v jazyce Python, kte...",<|start_of_sequence|><|im_start|>\nGenerovat ú...
624,Generování HTML kódu pro tlačítko.,,Zde je příklad jednoduchého HTML kódu pro vytv...,<|start_of_sequence|><|im_start|>\nGenerování ...
...,...,...,...,...
14696,"Jaké jsou některé způsoby, jak se rychle nauči...",,"Existuje několik účinných způsobů, jak se rych...",<|start_of_sequence|><|im_start|>\nJaké jsou n...
14739,Aktualizujte kód pro opravu chyby.,a = 6\nb = 5\nc = a + b\ntisk(d),"Chyba v tomto kódu je, že proměnná 'd' není de...",<|start_of_sequence|><|im_start|>\nAktualizujt...
14910,"Opravit kód tak, aby běhal bez vytvoření chyby",veřejná třída Demo {\n veřejné statické voi...,"Zde je opravená verze kódu, která bude běžet b...",<|start_of_sequence|><|im_start|>\nOpravit kód...
15152,Jaký je obecný postup psaní kódu?,,Obecný proces psaní kódu zahrnuje několik krok...,<|start_of_sequence|><|im_start|>\nJaký je obe...


In [45]:
for LANG in ["cs", "de", "en", "es", "fi", "fr", "it", "nl", "pl", "pt", "sv"]:
    model_file = cached_asset_path_or_download(
        "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin",
        namespace="lid",
        subfolder="ft176",
        desc="fast-text language identifier model",
    )
    model = _FastText(model_file)

    # Config
    parquet_path = f"/scratch/cs/small_lm/sft/processed/bactrian_{LANG}.parquet"  # replace with your actual file
    tokenizer_path = "/scratch/cs/small_lm/ConvertedTokenizer"  # path to tokenizer folder
    max_tokens = 2000

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # Special tokens (assumes already added in tokenizer config)
    BOS = "<|start_of_sequence|>"
    EOS = "<|end_of_sequence|>"
    IM_START = "<|im_start|>"
    IM_END = "<|im_end|>"

    # Load parquet
    df = pd.read_parquet(parquet_path)

    # Fill NaNs with empty strings for safety
    df = df.fillna("")

    # Construct the prompt format
    def format_row(row):
        instruction_part = row["instruction"]
        input_part = row["input"]
        output_part = row["output"]
        
        if input_part:
            formatted = (
                f"{BOS}{IM_START}\n"
                f"{instruction_part}\n{input_part}\n"
                f"{IM_END}\n"
                f"{output_part}"
                f"{EOS}"
            )
        else:
            formatted = (
                f"{BOS}{IM_START}\n"
                f"{instruction_part}\n"
                f"{IM_END}\n"
                f"{output_part}"
                f"{EOS}"
            )
        return formatted


    # Apply formatting
    tqdm.pandas(desc="Formatting rows")
    df["text"] = df.progress_apply(format_row, axis=1)

    # Tokenize and count tokens
    print("Before preprocessing:", str(df.shape[0]))
    tqdm.pandas(desc="Counting tokens")
    df["num_tokens"] = df["text"].progress_apply(lambda x: len(tokenizer.encode(x)))
    df = df[df['num_tokens'] < 2000]
    df = df.drop(columns=["num_tokens"])

    df.to_parquet(f"/scratch/cs/small_lm/sft/cleaned/bactrian_{LANG}.parquet" )
    print("After preprocessing:", str(df.shape[0]))

Formatting rows: 100%|██████████| 10053/10053 [00:00<00:00, 84995.17it/s]


Before preprocessing: 10053


Counting tokens:   8%|▊         | 791/10053 [00:00<00:04, 2054.20it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2909 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 10053/10053 [00:04<00:00, 2107.06it/s]


After preprocessing: 10043


Formatting rows: 100%|██████████| 10053/10053 [00:00<00:00, 89511.61it/s]


Before preprocessing: 10053


Counting tokens:   7%|▋         | 710/10053 [00:00<00:05, 1614.62it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (4452 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 10053/10053 [00:05<00:00, 1862.66it/s]


After preprocessing: 10043


Formatting rows: 100%|██████████| 10053/10053 [00:00<00:00, 84328.82it/s]


Before preprocessing: 10053


Counting tokens:   6%|▌         | 582/10053 [00:00<00:04, 1995.78it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2467 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 10053/10053 [00:05<00:00, 1772.80it/s]


After preprocessing: 10044


Formatting rows: 100%|██████████| 10053/10053 [00:00<00:00, 101053.15it/s]


Before preprocessing: 10053


Counting tokens:   2%|▏         | 206/10053 [00:00<00:04, 2039.52it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (3199 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 10053/10053 [00:04<00:00, 2041.45it/s]


After preprocessing: 10039


Formatting rows: 100%|██████████| 10053/10053 [00:00<00:00, 95469.48it/s]


Before preprocessing: 10053


Counting tokens:  13%|█▎        | 1293/10053 [00:00<00:04, 2132.27it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2781 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 10053/10053 [00:04<00:00, 2188.07it/s]


After preprocessing: 10041


Formatting rows: 100%|██████████| 10053/10053 [00:00<00:00, 84156.64it/s]


Before preprocessing: 10053


Counting tokens:   4%|▎         | 362/10053 [00:00<00:05, 1786.63it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2576 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 10053/10053 [00:04<00:00, 2087.04it/s]


After preprocessing: 10039


Formatting rows: 100%|██████████| 10053/10053 [00:00<00:00, 93392.56it/s]


Before preprocessing: 10053


Counting tokens:   7%|▋         | 702/10053 [00:00<00:04, 2313.29it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2485 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 10053/10053 [00:04<00:00, 2271.86it/s]


After preprocessing: 10042


Formatting rows: 100%|██████████| 10053/10053 [00:00<00:00, 98010.38it/s]


Before preprocessing: 10053


Counting tokens:   0%|          | 0/10053 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2535 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 10053/10053 [00:04<00:00, 2112.04it/s]


After preprocessing: 10042


Formatting rows: 100%|██████████| 10053/10053 [00:00<00:00, 91404.27it/s]


Before preprocessing: 10053


Counting tokens:   6%|▋         | 633/10053 [00:00<00:04, 2147.34it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (3918 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 10053/10053 [00:04<00:00, 2182.45it/s]


After preprocessing: 10044


Formatting rows: 100%|██████████| 10053/10053 [00:00<00:00, 102460.93it/s]


Before preprocessing: 10053


Counting tokens:  11%|█         | 1126/10053 [00:00<00:04, 2222.41it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2542 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 10053/10053 [00:04<00:00, 2160.01it/s]


After preprocessing: 10044


Formatting rows: 100%|██████████| 10053/10053 [00:00<00:00, 102338.33it/s]


Before preprocessing: 10053


Counting tokens:   2%|▏         | 194/10053 [00:00<00:05, 1936.29it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2591 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 10053/10053 [00:04<00:00, 2298.94it/s]


After preprocessing: 10039


In [3]:
import json

In [5]:
with open('/scratch/cs/small_lm/sft/lima/train.jsonl') as fh:
    lines = [json.loads(line.strip()) for line in fh.readlines()]

In [14]:
BOS = "<|start_of_sequence|>"
EOS = "<|end_of_sequence|>"
IM_START = "<|im_start|>"
IM_END = "<|im_end|>"

def format_row(row):
    instruction_part = row[0]
    input_part = None
    output_part = row[1]
    
    if input_part:
        formatted = (
            f"{BOS}{IM_START}\n"
            f"{instruction_part}\n{input_part}\n"
            f"{IM_END}\n"
            f"{output_part}"
            f"{EOS}"
        )
    else:
        formatted = (
            f"{BOS}{IM_START}\n"
            f"{instruction_part}\n"
            f"{IM_END}\n"
            f"{output_part}"
            f"{EOS}"
        )
    return formatted

In [26]:
texts = [format_row(row['conversations']) for row in lines]
instructions = [row['conversations'][0] for row in lines]
outputs = [row['conversations'][1] for row in lines]
inputs = ['' for row in lines]

In [32]:
df = pd.DataFrame({'instruction': instructions, 'input': inputs, 
                   'output': outputs, 'text' : texts})

In [33]:
tokenizer_path = "/scratch/cs/small_lm/ConvertedTokenizer"  # path to tokenizer folder
max_tokens = 2000

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

tqdm.pandas(desc="Counting tokens")
df["num_tokens"] = df["text"].progress_apply(lambda x: len(tokenizer.encode(x)))

Counting tokens:  17%|█▋        | 171/1030 [00:00<00:01, 851.49it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2584 > 2048). Running this sequence through the model will result in indexing errors
Counting tokens: 100%|██████████| 1030/1030 [00:01<00:00, 697.32it/s]


In [34]:
df = df[df["num_tokens"] <= 2048]

In [36]:
df.drop(columns=['num_tokens'], inplace=True)

In [37]:
df.to_parquet('/scratch/cs/small_lm/sft/cleaned/lima_en.parquet', index=False)

In [10]:
for line in lines:
    if len(line['conversations']) != 2:
        print(len(line['conversations']))

4
6
4
4
6
20
8
8
6
8
6
4
4
6
6
4
4
6
4
6
4
5
4
6
4
4
4
4
4
6


In [40]:
df = pd.read_parquet('./code/train-00000-of-00001.parquet')

In [42]:
df = df[['instruction', 'response']]

In [45]:
df.rename(columns={'response':'output'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'response':'output'}, inplace=True)


In [47]:
df['input'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['input'] = ''


In [48]:
def format_row(row):
    instruction_part = row["instruction"]
    input_part = row["input"]
    output_part = row["output"]
    
    if input_part:
        formatted = (
            f"{BOS}{IM_START}\n"
            f"{instruction_part}\n{input_part}\n"
            f"{IM_END}\n"
            f"{output_part}"
            f"{EOS}"
        )
    else:
        formatted = (
            f"{BOS}{IM_START}\n"
            f"{instruction_part}\n"
            f"{IM_END}\n"
            f"{output_part}"
            f"{EOS}"
        )
    return formatted


# Apply formatting
tqdm.pandas(desc="Formatting rows")
df["text"] = df.progress_apply(format_row, axis=1)

# Tokenize and count tokens
print("Before preprocessing:", str(df.shape[0]))
tqdm.pandas(desc="Counting tokens")
df["num_tokens"] = df["text"].progress_apply(lambda x: len(tokenizer.encode(x)))

Formatting rows: 100%|██████████| 50661/50661 [00:00<00:00, 96199.18it/s] 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = df.progress_apply(format_row, axis=1)


Before preprocessing: 50661


Counting tokens: 100%|██████████| 50661/50661 [00:30<00:00, 1642.15it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["num_tokens"] = df["text"].progress_apply(lambda x: len(tokenizer.encode(x)))


In [52]:
print(df['text'].iloc[4])

<|start_of_sequence|><|im_start|>
Write a Python function that takes two lists `A` and `B`, each representing a matrix, and returns their sum in the form of a new list. The matrices are of the same size, and each element of the output matrix is the sum of the corresponding elements in `A` and `B`.
<|im_end|>
You can create a Python function that takes two lists `A` and `B` as input. Both lists represent matrices of the same size. Each element of the output matrix is the sum of the corresponding elements in `A` and `B`.

Here's how you could do it:

```python
def matrix_sum(A, B):
    rows = len(A)
    cols = len(A[0])

    C = []
    for i in range(rows):
        row = []
        for j in range(cols):
            row.append(A[i][j] + B[i][j])
        C.append(row)

    return C
```<|end_of_sequence|>


In [53]:
df = df[df['num_tokens'] < 2000]

In [55]:
df.drop(columns=['num_tokens'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['num_tokens'], inplace=True)


In [56]:
df.to_parquet('/scratch/cs/small_lm/sft/cleaned/bigcode_code.parquet', index=False)

In [42]:
df['lang'] = df['instruction'].apply(lambda x: model.predict(x.replace('\n', ' '))[0][0])

In [43]:
df[df['lang'] != '__label__sv']

Unnamed: 0,instruction,input,output,text,num_tokens,lang
42,Lista tre egenskaper hos en effektiv ledare.,,1. Visionär: En effektiv ledare har en klar oc...,<|start_of_sequence|><|im_start|>\nLista tre e...,205,__label__no
70,Lista ut minst 5 stadier av sorg.,,1. Förnekelse: Det första stadiet av sorg kan ...,<|start_of_sequence|><|im_start|>\nLista ut mi...,221,__label__no
79,Lista elementen i binomialnomenklaturen.,,Binomialnomenklaturen består av två delar:\n\n...,<|start_of_sequence|><|im_start|>\nLista eleme...,100,__label__es
96,Räkna ut 5 plus 2,,7,<|start_of_sequence|><|im_start|>\nRäkna ut 5 ...,19,__label__fr
157,Klassificera dessa quarterbackar som antingen ...,,Joe Montana och John Elway har vunnit Superbow...,<|start_of_sequence|><|im_start|>\nKlassificer...,64,__label__no
...,...,...,...,...,...,...
9823,Identifiera alla adjektiv i denna mening,Den klarblå himlen lugnade,"två adjektiv: ""klarblå"" och ""lugnade"".",<|start_of_sequence|><|im_start|>\nIdentifiera...,44,__label__it
9866,Kategorisera denna produkt i rätt bransch.,Den nya AppPlatform kommer att tillåta utveckl...,Bransch: Teknik/IT.,<|start_of_sequence|><|im_start|>\nKategoriser...,53,__label__id
9892,Beskriv Lie Algebra,I matematik är en enkel Lie-grupp en sammankop...,Lie-algebror är en skattejakt relaterad till L...,<|start_of_sequence|><|im_start|>\nBeskriv Lie...,538,__label__no
9895,Generera utdata givet programkoden.,num1 = 2\nnum2 = 4\nprint(nummer1 + nummer2),Det kommer att bli ett fel eftersom variabelna...,<|start_of_sequence|><|im_start|>\nGenerera ut...,116,__label__no


In [99]:
df = df[(df['num_tokens'] >= 40)]

In [100]:
df

Unnamed: 0,instruction,output,input,text,num_tokens
0,Cosa avrebbe cambiato il risultato delle elezi...,"Nella loro prima riunione dopo le elezioni, i ...",,<|start_of_sequence|><|im_start|>system\nYou a...,251
1,Qual è lo scopo dell'arsenale nucleare Data la...,L'attacco nucleare è la capacità delle forze n...,,<|start_of_sequence|><|im_start|>system\nYou a...,354
2,I binari dannosi sono venuti da? Dato il conte...,"Nell'ottobre 2013, la società ha annunciato la...",,<|start_of_sequence|><|im_start|>system\nYou a...,126
3,Qual è la dimensione della nube di inquinament...,Una densa ondata di smog è iniziata nella part...,,<|start_of_sequence|><|im_start|>system\nYou a...,532
4,Qual è il primo motivo elencato per i dati di ...,"L'hardware, il software e le specifiche di ret...",,<|start_of_sequence|><|im_start|>system\nYou a...,137
...,...,...,...,...,...
77812,"Completa la seguente frase: Inizialmente, dura...",GP di Spagna dove Pérez e Ocon hanno finito ri...,,<|start_of_sequence|><|im_start|>system\nYou a...,439
77813,Scrivi una continuazione per questo paragrafo ...,"Nel 1995 Vinogradov, che è stato consigliere d...",,<|start_of_sequence|><|im_start|>system\nYou a...,215
77814,Completa la seguente frase: Il congresso è sta...,dichiarò l'indipendenza delle Province Unite d...,,<|start_of_sequence|><|im_start|>system\nYou a...,139
77815,Completa la seguente frase: Le riprese princip...,"Toronto International Film Festival, e ha iniz...",,<|start_of_sequence|><|im_start|>system\nYou a...,242


In [101]:
from datatrove.io import cached_asset_path_or_download

In [None]:
from fasttext.FastText import _FastText
from datatrove.io import cached_asset_path_or_download

model_file = cached_asset_path_or_download(
    "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin",
    namespace="lid",
    subfolder="ft176",
    desc="fast-text language identifier model",
)
model = _FastText(model_file)

In [65]:
model.predict('Hello world')[0][0]

'__label__en'

In [102]:
df['lang'] = df['instruction'].apply(lambda x: model.predict(x.replace('\n', ' '))[0][0])

In [103]:
df[df.lang != '__label__it']

Unnamed: 0,instruction,output,input,text,num_tokens,lang
3534,"John Wayne, Maureen O' Hara, Barry Fitzgerald,...",La domanda è: The Quiet Man è un film del 1952...,,<|start_of_sequence|><|im_start|>system\nYou a...,262,__label__en
3670,"Domanda: Se P (A) = 2/15, P (B) = 4/15, e P (A...",P (B) = P (A) /P (A) PB ((((A) = (6/15) / (((2...,,<|start_of_sequence|><|im_start|>system\nYou a...,152,__label__en
5560,In quale contea si trova l'Aviation Hall of Fa...,Contea di Bergen,,<|start_of_sequence|><|im_start|>system\nYou a...,49,__label__en
5720,"Chi e' piu' grande, Khloé Kardashian o George ...",George Aghajanian,,<|start_of_sequence|><|im_start|>system\nYou a...,47,__label__en
6026,"Chi e' piu' grande, Nick Cave o Elize Ryd?",Nick Cave,,<|start_of_sequence|><|im_start|>system\nYou a...,42,__label__en
...,...,...,...,...,...,...
67301,Complete the following phrase: 1991: Francesc...,2006: Sydney Pollack - 2007: Martin Scorsese -...,,<|start_of_sequence|><|im_start|>system\nYou a...,253,__label__de
67321,Write a continuation for this paragraph - Nell...,"figurava nella rosa dell'Andrea Doria, sodaliz...",,<|start_of_sequence|><|im_start|>system\nYou a...,72,__label__en
67328,Write a continuation for this paragraph - In o...,", svolto a Bruxelles e a Londra tra luglio e a...",,<|start_of_sequence|><|im_start|>system\nYou a...,123,__label__en
71999,Completare la frase seguente: Sucedió en Jalis...,Cristeros) è un film messicano del 1947 scritt...,,<|start_of_sequence|><|im_start|>system\nYou a...,114,__label__es


In [None]:
df = df[df.lang == '__label__sv']

In [74]:
df.shape

(71350, 6)

In [1]:
import pandas as pd

In [6]:
df = pd.read_parquet('cleaned/aya_bg.parquet')

In [9]:
df.iloc[0].text

'<|start_of_sequence|><|im_start|>\nКакво би променило резултата от изборите? Като се има предвид предишния въпрос, напишете контекст, който съдържа отговора. Може да е от 1 до 20 изречения. Контекст:\n<|im_end|>\nНа първото си заседание след изборите членовете на парламента на групата решават да приемат официално името "Лейбъристката партия" (15 февруари 1906 г.). Киър Харди, който е изиграл водеща роля в създаването на партията, е избран за председател на Парламентарната лейбъристка партия (в действителност, лидерът), въпреки че само с един глас над Дейвид Шакълтън след няколко гласувания. В ранните години на партията Независимата работна партия (НЛП) осигурява голяма част от активистката си база, тъй като партията няма индивидуални членове до 1918 г., но действа като конгломерат от свързани органи. Фабианското общество осигури голяма част от интелектуалния стимул за партията. Едно от първите действия на новото либерално правителство беше да отмени решението на Таф Вейл.<|end_of_sequ

In [1]:
import os
import pandas as pd

# Define the folder with parquet files
input_folder = "/scratch/cs/small_lm/sft/cleaned"
output_folder = os.path.join(input_folder, "subsample")

# Create output directory if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Loop over all parquet files in the folder
for filename in os.listdir(input_folder):
    if filename.endswith(".parquet"):
        file_path = os.path.join(input_folder, filename)
        print(f"Processing {file_path}...")

        # Read the Parquet file
        df = pd.read_parquet(file_path)

        # Sample 10% of the data (without replacement)
        df_sampled = df.sample(frac=0.1, random_state=42)

        # Save to the sampled subfolder with the same name
        output_path = os.path.join(output_folder, filename)
        df_sampled.to_parquet(output_path, index=False)

        print(f"Saved sampled file to {output_path}")


Processing /scratch/cs/small_lm/sft/cleaned/aya_bg.parquet...
Saved sampled file to /scratch/cs/small_lm/sft/cleaned/subsample/aya_bg.parquet
Processing /scratch/cs/small_lm/sft/cleaned/bactrian_fr.parquet...
Saved sampled file to /scratch/cs/small_lm/sft/cleaned/subsample/bactrian_fr.parquet
Processing /scratch/cs/small_lm/sft/cleaned/aya_fi.parquet...
Saved sampled file to /scratch/cs/small_lm/sft/cleaned/subsample/aya_fi.parquet
Processing /scratch/cs/small_lm/sft/cleaned/bactrian_pl.parquet...
Saved sampled file to /scratch/cs/small_lm/sft/cleaned/subsample/bactrian_pl.parquet
Processing /scratch/cs/small_lm/sft/cleaned/bactrian_pt.parquet...
Saved sampled file to /scratch/cs/small_lm/sft/cleaned/subsample/bactrian_pt.parquet
Processing /scratch/cs/small_lm/sft/cleaned/aya_de.parquet...
Saved sampled file to /scratch/cs/small_lm/sft/cleaned/subsample/aya_de.parquet
Processing /scratch/cs/small_lm/sft/cleaned/aya_nl.parquet...
Saved sampled file to /scratch/cs/small_lm/sft/cleaned/s

In [4]:
# Machine translation

In [6]:
dataset = load_dataset("tatoeba", lang1="bg", lang2="en")

Downloading data: 100%|██████████| 520k/520k [00:00<00:00, 14.5MB/s]
Generating train split: 17823 examples [00:00, 35207.23 examples/s]


In [14]:
dataset['train'][1200]

{'id': '1200',
 'translation': {'bg': 'Майк и Кен са приятели.',
  'en': 'Mike and Ken are friends.'}}

In [None]:
translation_prompts = [
    "Translate this text from {source_lang} to {target_lang}.",
    "Can you convert the following from {source_lang} into {target_lang}?",
    "I'd like this translated from {source_lang} to {target_lang}.",
    "Please translate the following content from {source_lang} into {target_lang}.",
    "How do you say this in {target_lang} if it’s originally in {source_lang}?",
    "I need a {target_lang} translation of this {source_lang} text.",
    "Could you help me translate this from {source_lang} to {target_lang}?",
    "Give me the {target_lang} version of this {source_lang} sentence.",
    "Turn this {source_lang} sentence into {target_lang}.",
    "Rewrite this {source_lang} input in {target_lang}.",
    "Provide a {target_lang} translation for the following {source_lang} paragraph.",
    "How would you express this {source_lang} phrase in {target_lang}?",
    "Switch this text from {source_lang} to {target_lang}, please.",
    "I want to understand this {source_lang} message in {target_lang}.",
    "Translate this message out of {source_lang} and into {target_lang}.",
    "Could you please convert this from {source_lang} to {target_lang} language?",
    "Give me a translation from {source_lang} into {target_lang}.",
    "What's the {target_lang} equivalent of this {source_lang} text?",
    "Can I get a {target_lang} translation of what's written here in {source_lang}?",
    "Help me translate from {source_lang} to {target_lang}, please.",
    "Translate from {source_lang} to {target_lang}.",
    "{source_lang} → {target_lang}",
    "From {source_lang} to {target_lang}.",
    "Convert {source_lang} to {target_lang}.",
    "{source_lang} into {target_lang}"
]


In [1]:
import pandas as pd

In [11]:
df = pd.read_parquet('/scratch/cs/small_lm/sft/cleaned/aya_en.parquet')

In [12]:
print(df.text[400])

<|start_of_sequence|><|im_start|>
Shailyn: Hey, John. I haven't seen you in a while. How have you been?

John: Hey, Shailyn. I've been good. How about you?

Shailyn: I'm doing well, thanks for asking. I actually have some big news to share with you.

John: Oh, yeah? What's that?

Shailyn: I've finally quit smoking cigarettes! It's been almost two weeks now and I'm feeling great about it.

John: That's amazing! Congratulations! I'm really proud of you for making that decision and sticking to it.

Shailyn: Thank you, John. It's been a long time coming but I'm glad I finally did it.

John: So, what made you decide to quit?

Shailyn: I think I just reached a point where I was tired of being addicted to something that was slowly killing me. And I realized that I didn't want to smoke for the rest of my life. So, I decided to quit and never look back.

John: That's a really smart decision. And I know it wasn't easy for you to do. But you did it and you should be really proud of yourself.

Sha