In [1]:
from transformers import BertTokenizerFast,BertTokenizer
from datasets import DatasetDict,Dataset,concatenate_datasets

from corpus import Loader,Type
from utils import dataset_util
from utils import Labeler

In [2]:
def make_ready_for_ds(data,corpus_type,labeler,tokenizer,chunk_size=512):
    chars, labels = labeler.label_text(data,corpus_type)
    input_ids_list, labels_list =[],[]
    for i in range(len(chars)):
        tokenized_sentence = [101] +[tokenizer.encode(char)[1] for char in chars[0]]+[102]  
        labels[0].insert(0,0) # Adding CLS token label at the beginning of each sequence
        labels[0].append(0)
        input_ids_list+=tokenized_sentence
        labels_list+=labels[0]
        chars.pop(0)
        labels.pop(0)
    input_ids_list,attention_mask_list,labels_list=dataset_util.chunk_pad_tokens(input_ids_list,labels_list,chunk_size,padd=True)
    # print(len(input_ids_list),len(attention_mask_list),len(labels_list))
    # for i in range(max(len(input_ids_list),len(attention_mask_list),len(labels_list))):
    #     print(len(input_ids_list[i]),len(attention_mask_list[i]),len(labels_list[i]))
    return input_ids_list,attention_mask_list,labels_list


def build_dataset(input_ids_list,attention_mask_list,labels_list,dataset_path):
    data_dict = {
        "input_ids": input_ids_list,
        "labels": labels_list,
        "attention_mask": attention_mask_list,
    }
    # Create a Hugging Face Dataset from the dictionary
    dataset = Dataset.from_dict(data_dict)
    # Shuffle the dataset
    dataset = dataset.shuffle(seed=42)
    # Split the dataset into training, validation, and test sets
    train_size = int(0.8 * len(dataset))
    val_size = int(0.1 * len(dataset))    
    train_dataset = dataset.select(range(train_size))
    val_dataset = dataset.select(range(train_size, train_size + val_size))
    test_dataset = dataset.select(range(train_size + val_size, len(dataset)))   
    # Combine the datasets into a DatasetDict
    dataset_dict = DatasetDict({
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset
    })
    dataset_dict._metadata = {"author": "Matin Ebrahimkhani"}
    dataset_dict.save_to_disk(dataset_path)
    return dataset_dict


def add_data_to_dataset(input_ids_list, attention_mask_list, labels_list,dataset_path,save_path):
    # Load the dataset from disk
    original_ds = DatasetDict.load_from_disk(dataset_path)
    # Create a dictionary from the input lists
    data_dict = {
        "input_ids": input_ids_list,
        "labels": labels_list,
        "attention_mask": attention_mask_list,
    }
    # Create a Hugging Face Dataset from the dictionary
    dataset = Dataset.from_dict(data_dict)
    # Shuffle the dataset
    dataset = dataset.shuffle(seed=42)
    # Split the dataset into training, validation, and test sets
    train_size = int(0.8 * len(dataset))
    val_size = int(0.1 * len(dataset))    
    train_dataset = dataset.select(range(train_size))
    val_dataset = dataset.select(range(train_size, train_size + val_size))
    test_dataset = dataset.select(range(train_size + val_size, len(dataset)))   
    
    # original_ds["train"] = original_ds["train"].concatenate(train_dataset)
    # original_ds["validation"] = original_ds["validation"].concatenate(val_dataset)
    # original_ds["test"] = original_ds["test"].concatenate(test_dataset)
    
    original_ds["train"] = concatenate_datasets([original_ds["train"],train_dataset])
   
    original_ds["validation"] = concatenate_datasets( [original_ds["validation"], val_dataset])
    original_ds["test"] = concatenate_datasets([original_ds["test"], test_dataset])
    # Save the updated dataset to disk
    original_ds.save_to_disk(save_path)
    return original_ds

def chunk_2d_list(data, num_chunks):
    # Calculate the size of each chunk
    chunk_size = len(data) // num_chunks
    # Create a list of chunks for each input list
    data_chunk = [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]
   
    # Return the list of chunks for each input list
    return data_chunk

In [3]:

corpus_type=Type.sents_raw
dataset_path="./built_datasets/all.01/"
data = Loader().load_corpus("peykareh",corpus_type=corpus_type,shuffle_sentences=True)
data += Loader().load_corpus("bijankhan",corpus_type=corpus_type,shuffle_sentences=True)

labeler = Labeler()
# pretrained_model = "HooshvareLab/bert-base-parsbert-uncased"
pretrained_model = "bert-base-multilingual-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrained_model)


In [4]:
chunked_data = chunk_2d_list(data,20)

# INITIALIZING THE DATASET
input_ids,attention_mask,labels=make_ready_for_ds(chunked_data[0],corpus_type,labeler,tokenizer,512)
build_dataset(input_ids,attention_mask,labels,dataset_path+'0')

input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/3814 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/476 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/478 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 3814
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 476
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 478
    })
})

In [5]:
for i in range(1,len(chunked_data)):
    print("bijankhan part",i)
    input_ids,attention_mask,labels=make_ready_for_ds(chunked_data[i],corpus_type,labeler,tokenizer,512)
    print(add_data_to_dataset(input_ids,attention_mask,labels,dataset_path+f'{i-1}',dataset_path+f'{i}'))
    

bijankhan part 1
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/7666 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/957 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/961 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 7666
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 957
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 961
    })
})
bijankhan part 2
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/11510 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1437 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1442 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 11510
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 1437
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 1442
    })
})
bijankhan part 3
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/15337 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1915 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1921 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 15337
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 1915
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 1921
    })
})
bijankhan part 4
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/19193 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2397 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2403 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 19193
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 2397
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 2403
    })
})
bijankhan part 5
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/23013 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2874 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2882 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 23013
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 2874
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 2882
    })
})
bijankhan part 6
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/26884 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3357 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3367 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 26884
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 3357
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 3367
    })
})
bijankhan part 7
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/30713 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3835 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3847 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 30713
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 3835
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 3847
    })
})
bijankhan part 8
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/34575 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4317 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4331 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 34575
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 4317
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 4331
    })
})
bijankhan part 9
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/38416 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4797 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4812 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 38416
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 4797
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 4812
    })
})
bijankhan part 10
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/42281 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5280 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5296 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 42281
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 5280
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 5296
    })
})
bijankhan part 11
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/46154 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5764 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5781 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 46154
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 5764
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 5781
    })
})
bijankhan part 12
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/49991 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6243 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6262 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 49991
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 6243
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 6262
    })
})
bijankhan part 13
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/53815 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6721 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6741 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 53815
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 6721
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 6741
    })
})
bijankhan part 14
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/57641 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7199 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7220 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 57641
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 7199
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 7220
    })
})
bijankhan part 15
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/61451 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7675 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7697 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 61451
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 7675
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 7697
    })
})
bijankhan part 16
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/65262 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8151 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8174 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 65262
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 8151
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 8174
    })
})
bijankhan part 17
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/69013 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8619 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8644 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 69013
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 8619
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 8644
    })
})
bijankhan part 18
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/1 shards):   0%|          | 0/72801 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9092 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9118 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 72801
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 9092
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 9118
    })
})
bijankhan part 19
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/2 shards):   0%|          | 0/76559 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9561 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9589 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 76559
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 9561
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 9589
    })
})
bijankhan part 20
input data is in list format, processing it as in a list of sentences


Saving the dataset (0/2 shards):   0%|          | 0/76559 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9561 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9590 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 76559
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 9561
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 9590
    })
})
