# Fine-tune gamma carbonic anhydrases 4.2.1.1

In [9]:
# Read source file: keep every raw line (trailing newlines included) in `data`.
# The `with` statement closes the handle when the block exits.
with open('/agh/projects/noelia/NLP/zymCTRL/fine-tune/4.2.1.1/gamma/gamma_filtered_final.fasta', 'r') as fn:
    data = list(fn)

In [10]:
# Put sequences into dictionary:
#   header line (including the leading '>') -> ['4.2.1.1', seq_line_1, seq_line_2, ...]
sequences = {}
name = None  # header of the record currently being read; reset so a re-run never reuses a stale value
for line in data:
    stripped = line.strip()
    if not stripped:
        # Blank lines carry no sequence data.
        continue
    if stripped.startswith('>'):
        # A FASTA header opens a new record; the first list element is the EC label.
        name = stripped
        sequences[name] = ['4.2.1.1']
        continue
    if name is None:
        raise ValueError("FASTA input contains sequence data before the first '>' header line")
    sequences[name].append(stripped)

In [11]:
# Print the list stored for every record (EC label followed by the sequence lines).
for value in sequences.values():
    print(value)

['4.2.1.1', 'MTIYQLGDATPEIDASSYIAESANVIGKVRIQALASIWFGVTIRGDNEYITIGANSNVQE', 'AAVLHTDMGFPMVIGNNVTVGHQAMLHGCTIGDGALIGIGAVVMNGATIGKGCLVGAGAL', 'VTEGKHFDDHMLIIGAPAKAVRPLTSAEITRLEGNADAYVSRGQLFKTQLKKIG']
['4.2.1.1', 'MRCSAARQSRTPRTQEYDRILDIGDPGLSPRLTFDPDDDVAAPYVASGARPRIAILREQG', 'VNGQVEMAAAFDRAGFAAFDVHMSDIIAGRISLADFSGFAACGGFSYGDVLGAGEGWAKS', 'ILFNPRARQEFANFFARPDAFALGVCNGCQMMAALRELIPGAAHWPHFVRNKSEQFEARF', 'VLVEVTRSPSLFFAGMEGSRLPVATAHGEGYAEFHDAAALAAAQPFVALRFVDHRGAATD', 'VYPYNANGSPQGITGLTTADGRFTILMPHPERVFRSVQLSWHPRRAGGEDSPVDAHVQEC', 'AHVAGLTTLCGIGGHRAARIAMTLTVYAIAGITPVVDPTAYVHPTAILIGDVIVGAGVYI', 'GPSASLRGDFGRLEVRAGANIQDNCVLHGFPGTDTIVEEDGHIGHGAVLHGCIVQRNALV', 'GMNAVVNDNAVIGESAMVAAMAFVKAGFIVPPRTLVAGRAGDASCAR']
['4.2.1.1', 'MPLFSFEGRKPSIDESAYVFPNATIIGDVRIGKEVWIGPGAVLRGDYGTIVVGDYSAIED', 'NVVVHARPGETTKIGNHVTVGHLSVIHTPAIADWVVIGMGAVISDFAKVGEWSAIGEGAV', 'VKNNSEIPAESIAVGVPAHVVGKVSQEYKNLWTGYKANYNSFTRRYRNNLLRLDQ']
['4.2.1.1', 'MSAGIYTLGERRLVTAGDDFYIAPGAQVIGSVVLGAGASLWFNCVLRADDERIEVGSGSN', 'VQDGSVIHADPGVPT

In [12]:
# Collapse each record's sequence lines into a single string,
# keyed by "<first list element>;<header line>".
processed_sequences = {
    f"{record[0]};{header}": ''.join(record[1:])
    for header, record in sequences.items()
}

In [13]:
# Pass sequences to list and shuffle their order randomly (important!)
import random

# Fixed seed so the shuffled order (and everything derived from it further down)
# is reproducible across kernel restarts. A dedicated Random instance is used so
# the global `random` state is left untouched.
SHUFFLE_SEED = 42
sequences_list = list(processed_sequences.items())
random.Random(SHUFFLE_SEED).shuffle(sequences_list)

#### Grouping

In [14]:
import transformers
from transformers import AutoTokenizer

# Keyword arguments forwarded to AutoTokenizer.from_pretrained.
tokenizer_kwargs = dict(
    cache_dir='.',
    use_fast=True,
    revision="main",
    use_auth_token=None,
)
tokenizer = AutoTokenizer.from_pretrained(
    '/agh/projects/noelia/NLP/zymCTRL/dataset_preparation/tokenizer',
    **tokenizer_kwargs,
)
# Logger of the tokenization base module; used later to capture its warnings.
tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

In [15]:
# The objective is to build strings that, once tokenized, span at most 1024 tokens.
# For each sequence we store a (length, untokenized string) pair.
# NOTE(review): the length is computed as len(sequence), i.e. it assumes one token per
# sequence character -- confirm against the tokenizer vocabulary.
print("procesing dataset")
separator = '<sep>'  # loop-invariant, so defined once
control_code_lengths = {}  # label -> token count of "<label><sep>", tokenized once per distinct label
processed_dataset = []
for i in sequences_list:
    label = i[0].split(';')[0]
    sequence = i[1].strip()

    # length of the control code
    if label not in control_code_lengths:
        control_code_lengths[label] = len(tokenizer(label+separator)['input_ids'])
    control_code_length = control_code_lengths[label]
    available_space = 1021 - control_code_length # It is not 1024 because '<|endoftext|>', and start and end

    # Option 1: the sequence is larger than the available space (3-4% of sequences)
    if len(sequence) > available_space:
        # in this case the sequence does not fit with the start/end tokens:
        # it is truncated and written without <start>/<end>
        truncated = sequence[:available_space]
        total_length = control_code_length + len(truncated) + 1
        seq = f"{label}{separator}{truncated}<|endoftext|>"

    # Option 2 & 3: The sequence fits in the block_size space with or without padding
    else:
        # +3 accounts for <start>, <end> and <|endoftext|>
        total_length = control_code_length + len(sequence) + 3
        seq = f"{label}{separator}<start>{sequence}<end><|endoftext|>"

    processed_dataset.append((total_length, seq))

procesing dataset


In [16]:
# Group sequences into 1024 chunks
def grouper(iterable, block_size=1024):
    """Greedily concatenate (length, text) items into groups of at most `block_size`.

    Items are consumed in order. An item is appended to the current group while the
    running sum of lengths stays <= block_size; otherwise the current group is
    yielded and the item starts a new one. The very first item is always accepted,
    even if its length alone exceeds block_size.

    Args:
        iterable: iterable of indexable pairs where item[0] is the length
            contributed by the item and item[1] is its text.
        block_size: maximum summed length per group (default 1024, the value
            that was previously hard-coded).

    Yields:
        str: the concatenated text of each group. A trailing empty group is
        not yielded, so an empty iterable yields nothing.
    """
    group = ''
    total_sum = 0
    is_first = True
    for item in iterable:
        length, text = item[0], item[1]
        if is_first or total_sum + length <= block_size:
            group += text
            total_sum += length
        else:
            # Current group is full: emit it and start a new one with this item.
            yield group
            group = text
            total_sum = length
        is_first = False
    if group:
        yield group

In [17]:
print("grouping processed dataset")
# Number the groups produced by grouper() starting from 1.
grouped_dataset = {
    chunk_id: chunk
    for chunk_id, chunk in enumerate(grouper(processed_dataset), start=1)
}

grouping processed dataset


In [18]:
# Display the grouped chunks (dict of chunk index -> concatenated string).
grouped_dataset

{1: '4.2.1.1<sep><start>MIGKNVQTDFSSKVCDPVIDPTAYVHPMGAVIGNVIIGKNVFVSPFASVRGDEGQPLHVGDDSNVQDGVIIHALETEEHGKPIEKNLMEVGGKKYAVYVGNRVSLAHQVQIHGPAVVLDDSFIGMKSLVFRAKVGKGCVIEPGCILMGVSVPDGRYVPAGTVLRDQAVADKLPAHHRRIPLKNLNKGVVPREQGPGERVRGTKIVDSRQPTGRKKSPFILPSGEERGIFYSFFEFLLW<end><|endoftext|>4.2.1.1<sep><start>MDLPVPSALGPLWSLDGISPQIAPDAWIAPTAVLIGHVEIGSQASIWFGCILRGDTNLIRIGARSNIQDGSVLHVNVGDGMACLIGEDVTVGHMAIVHAATLHDRAFVAMSAVVLDGAVIETGGVLAAGAVLTPGKRIGAGELWAGTPARLVRVLGEEERAKFAMTAPAYVQNAMRFRAASACAAADSKGSPFSTKKKLNRVPQPQWSQSVSRCNWRKQARFPANPARLAFCVCACAPPAPHTAASAQVSRSGTARTPAPTRARPTLARA<end><|endoftext|>4.2.1.1<sep><start>MTVSSFEGKTPVIGEGAWVHPKAEVIGDVTIGARCWVGPGACVRGDYGTIVLGDCCAVEDNVVVHARPGEKCTIGSWVTLGHGCVVHGVVALGDYAVVGMNAVVSDWAEVGEWGMVAEGAVVPQGAVVPAARVAAGVPARLLEREVDEDYREVWRGFKQVYVGLCDRYREGYSPVPEP<end><|endoftext|>4.2.1.1<sep><start>MAIYQLGEHAPRLDAGAWVAPSASVIGRVELGFDASVWWGAVLRGDNDPLVIGARSNIQDGAVVHTDAGVPMVIGEGVTVGHQVMLHGCTIGDGALVGIQAVVLNGAKIGRNCLVGAGSLVTEGKEFPDGALVMGSPARVVRMLTPGQIAGINGIAQHYIDNARRYAAGLKADRMSSAG

In [19]:
# Write one grouped chunk per line, without padding.
# NOTE: the following cell reopens this same path in 'w' mode and rewrites it
# with padded chunks, so this file content is only an intermediate state.
# The context manager guarantees the handle is closed even if a write fails.
with open("./4.2.1.1/gamma/gamma_processed.txt",'w') as fn:
    for value in grouped_dataset.values():
        fn.write(value)
        fn.write("\n")

In [20]:
# Rewrite the processed file, right-padding every chunk with "<pad>" up to 1024 tokens.
# The context manager guarantees the handle is closed even if tokenization or a write fails.
with open("./4.2.1.1/gamma/gamma_processed.txt",'w') as fn:
    for value in grouped_dataset.values():
        # If a chunk tokenizes to more than 1024 ids, padding_len is negative and
        # "<pad>"*padding_len is the empty string, i.e. the chunk is written unpadded.
        padding_len = 1024 - len(tokenizer(value)['input_ids'])
        padding = "<pad>"*padding_len
        padded = value+padding
        # Sanity check: print the token count of the padded chunk.
        print(len(tokenizer(padded)['input_ids']))
        fn.write(padded)
        fn.write("\n")

1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024
1024


#### Tokenization

In [29]:
# adapted from the trainer file
# Settings consumed by the load_dataset calls below.
validation_split_percentage = 10
extension = "text"
dataset_args = {}
data_files = {"train": './4.2.1.1/gamma/gamma_processed.txt'}


In [30]:
from datasets import load_dataset

# Load the processed text file with the "text" builder, caching in the working directory.
raw_datasets = load_dataset(
    extension,
    data_files=data_files,
    cache_dir='.',
    **dataset_args,
)

Using custom data configuration default-1b25e2a964107d86
Reusing dataset text (./text/default-1b25e2a964107d86/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad)


  0%|          | 0/1 [00:00<?, ?it/s]

In [31]:
# Slice the single "train" file into two splits: the leading
# `validation_split_percentage` percent becomes validation, the rest stays train.
split_specs = {
    "train": f"train[{validation_split_percentage}%:]",
    "validation": f"train[:{validation_split_percentage}%]",
}
for split_name, split_expr in split_specs.items():
    raw_datasets[split_name] = load_dataset(
        extension,
        data_files=data_files,
        split=split_expr,
        cache_dir='.',
        **dataset_args,
    )

Using custom data configuration default-1b25e2a964107d86
Reusing dataset text (./text/default-1b25e2a964107d86/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad)
Using custom data configuration default-1b25e2a964107d86
Reusing dataset text (./text/default-1b25e2a964107d86/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad)


In [32]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch while capturing tokenizer log output.

    If the captured log contains the "sequence length is longer than" message,
    a follow-up warning is logged telling the reader to ignore it.

    Returns:
        The tokenizer output for examples["text"].
    """
    with CaptureLogger(tok_logger) as captured:
        encoded = tokenizer(examples["text"])
    # clm input could be much much longer than block_size
    overlong_marker = "Token indices sequence length is longer than the"
    if overlong_marker in captured.out:
        tok_logger.warning(
            "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model."
        )
    return encoded

In [33]:
from transformers.testing_utils import CaptureLogger

# Options for the tokenization pass; the raw 'text' column is dropped afterwards.
tokenize_map_options = dict(
    batched=True,
    num_proc=32,
    remove_columns=['text'],
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)
tokenized_datasets = raw_datasets.map(tokenize_function, **tokenize_map_options)

                                  

Running tokenizer on dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #4:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #5:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on dataset #6:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #7:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on dataset #8:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #9:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #10:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #11:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #12:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #13:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #14:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on dataset #15:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #16:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #17:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #18:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #19:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on dataset #20:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #21:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #22:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on dataset #23:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #24:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #25:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on dataset #26:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #27:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #28:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on dataset #29:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #30:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #31:   0%|          | 0/1 [00:00<?, ?ba/s]

                                  

Running tokenizer on dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #2:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #4:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on dataset #5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #6:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #8:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on dataset #9:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #10:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #11:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on dataset #12:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #13:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #14:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on dataset #15:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #16:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #17:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on dataset #18:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #19:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #20:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #21:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #22:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #23:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on dataset #24:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #25:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #26:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #27:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #28:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running tokenizer on dataset #29:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on dataset #30:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #31:   0%|          | 0/1 [00:00<?, ?ba/s]

In [34]:
# Persist the tokenized splits to disk.
train_dataset, eval_dataset = tokenized_datasets["train"], tokenized_datasets["validation"]

train_dataset.save_to_disk('./4.2.1.1/gamma/dataset/train')
eval_dataset.save_to_disk('./4.2.1.1/gamma/dataset/eval')

### Grouping after tokenization and saving datasets

In [40]:
from datasets import load_from_disk
from datasets.dataset_dict import DatasetDict

# Reload the tokenized splits saved earlier and bundle them into one DatasetDict.
train_dataset = load_from_disk('./4.2.1.1/gamma/dataset/train')
eval_dataset = load_from_disk('./4.2.1.1/gamma/dataset/eval')

tokenized_datasets = DatasetDict({
    "train": train_dataset,
    "validation": eval_dataset,
})

block_size = 1024
def group_texts(examples):
    """Concatenate every column of a batch and re-split it into block_size chunks.

    Args:
        examples: mapping of column name -> list of token lists (one per row).

    Returns:
        dict with the same columns, each a list of chunks of length block_size,
        plus a "labels" column that is a shallow copy of "input_ids". When the
        concatenated length is at least block_size, the tail that does not fill
        a whole block is dropped; a batch shorter than block_size is kept as a
        single short chunk.
    """
    from itertools import chain  # local import keeps the function self-contained

    # Concatenate all texts (linear time; sum(lists, []) re-copies the accumulator each step).
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop,
    # you can customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Apply group_texts to every split in batches, bypassing the map cache.
lm_datasets = tokenized_datasets.map(
    group_texts, batched=True, num_proc=124, load_from_cache_file=False,
    desc=f"Grouping texts in chunks of {block_size}",
)

# Persist the grouped splits next to the tokenized ones.
train_dataset, eval_dataset = lm_datasets["train"], lm_datasets["validation"]

train_dataset.save_to_disk('./4.2.1.1/gamma/dataset/train2')
eval_dataset.save_to_disk('./4.2.1.1/gamma/dataset/eval2')

                                                                                                                              

Grouping texts in chunks of 1024 #0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #3:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #5:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #7:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #8:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #9:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #10:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #11:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #12:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #13:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #14:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #15:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #16:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #17:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #18:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #19:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #20:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #21:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #22:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #23:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #24:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #25:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #26:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #27:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #28:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #29:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #30:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #31:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #32:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #33:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #34:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #35:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #36:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #37:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #38:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #39:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #40:   0%|          | 0/1 [00:00<?, ?ba/s]

   

Grouping texts in chunks of 1024 #41:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #45:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #42:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #43:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #46:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #44:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #47:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #48:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #49:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #50:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #51:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #52:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #53:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #54:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #55:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #56:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #57:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #58:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #59:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #60:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #61:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #62:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #63:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #64:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #65:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #66:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #67:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #68:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #69:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #70:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #71:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #72:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #73:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #74:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #75:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #76:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #77:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #78:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #79:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #80:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #81:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #82:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #83:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #84:   0%|          | 0/1 [00:00<?, ?ba/s]

   

Grouping texts in chunks of 1024 #85:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #86:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #87:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #88:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #89:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #90:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #91:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #92:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #93:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #94:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #95:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #96:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #97:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #98:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #99:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #100:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #101:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #102:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #103:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #104:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #105:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #106:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #107:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #108:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #109:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #110:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #111:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #112:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #113:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #114:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #115:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #116:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #117:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #118:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #119:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #120:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #121:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #122:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #123:   0%|          | 0/1 [00:00<?, ?ba/s]

num_proc must be <= 35. Reducing num_proc to 35 for dataset of size 35.


                                     

Grouping texts in chunks of 1024 #0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #4:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #5:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #6:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #7:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #8:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #9:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #10:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #11:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #12:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #13:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #14:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #15:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #16:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #17:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #18:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #19:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #20:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #21:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #22:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #23:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #24:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #25:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #26:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #27:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #28:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #29:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #30:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 1024 #31:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 1024 #32:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #33:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024 #34:   0%|          | 0/1 [00:00<?, ?ba/s]

In [41]:
# Display a summary (features and row count) of the grouped training split.
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 317
})

In [42]:
# Display a summary (features and row count) of the grouped validation split.
eval_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 35
})