In [9]:
import multiprocessing as mp
import datasets

from transformers import AutoTokenizer
from ptvid.constants import DOMAINS

# Worker-process count passed as `num_proc` to the dataset map/filter calls
# further down in this notebook; set to the CPU count reported by multiprocessing.
N_PROC = mp.cpu_count()

In [4]:
# Load the pretrained tokenizer once at module level; `count_tokens` reads this
# global when measuring document lengths.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

In [6]:
# Sanity check: display the token strings the tokenizer produces for a short sentence.
tokenizer.tokenize("this is an example")

['this', 'Ġis', 'Ġan', 'Ġexample']

In [12]:
def count_tokens(example):
    """Annotate a single dataset row with its token count.

    Args:
        example: Mapping with a "text" entry holding the document string.

    Returns:
        The same mapping object, with an added "n_tokens" entry equal to the
        number of tokens the module-level ``tokenizer`` yields for the text.
    """
    pieces = tokenizer.tokenize(example["text"])
    # The row is updated in place and handed back, as `Dataset.map` expects.
    example["n_tokens"] = len(pieces)
    return example

In [14]:
# Fetch every domain configuration once so the data is downloaded and prepared
# before the statistics cell runs; the loaded object itself is not used here.
for domain in DOMAINS:
    print(domain)
    dataset = datasets.load_dataset(path="liaad/PtBrVId", name=domain)

journalistic
literature
legal
politics
web
social_media


Downloading data: 100%|██████████| 188M/188M [00:04<00:00, 44.7MB/s] 
Downloading data: 100%|██████████| 195k/195k [00:00<00:00, 511kB/s]
Downloading data: 100%|██████████| 6.33k/6.33k [00:00<00:00, 19.7kB/s]
Generating train split: 100%|██████████| 2018904/2018904 [00:02<00:00, 852197.07 examples/s]
Generating valid split: 100%|██████████| 2000/2000 [00:00<00:00, 468846.86 examples/s]
Generating test split: 100%|██████████| 28/28 [00:00<00:00, 25885.06 examples/s]


In [20]:
# Per-domain corpus statistics: for each domain, split and label, print the
# number of documents, the total token count and the mean tokens per document.
for domain in DOMAINS:
    print(domain)
    dataset = datasets.load_dataset("liaad/PtBrVId", domain)
    # Adds an "n_tokens" column to every row of every split.
    dataset = dataset.map(count_tokens, num_proc=N_PROC)
    for split in ["train", "valid", "test"]:
        print(f"\t{split}")
        sdata = dataset[split]

        for label in [0, 1]:
            print("\t\t", label)
            # Only the "label" column is handed to the predicate, so the text
            # of each row does not have to be materialised just to compare it.
            lsdata = sdata.filter(
                lambda row_label: row_label == label,
                input_columns=["label"],
                num_proc=N_PROC,
            )
            n_docs = len(lsdata)
            print(f"\t\t\tDocs: {n_docs}")
            n_tokens = sum(lsdata["n_tokens"])
            print(f"\t\t\tTkns: {n_tokens}")
            # A split may contain no documents for a label; report NaN instead
            # of aborting the whole report with ZeroDivisionError.
            avg_tokens = n_tokens / n_docs if n_docs else float("nan")
            print(f"\t\t\tATkn: {avg_tokens:.02f}")


journalistic


	train
		 0
			Docs: 1413870
			Tkns: 278684788
			ATkn: 197.11
		 1
			Docs: 328855
			Tkns: 40313699
			ATkn: 122.59
	valid
		 0
			Docs: 1000
			Tkns: 194904
			ATkn: 194.90
		 1
			Docs: 1000
			Tkns: 125341
			ATkn: 125.34
	test
		 0
			Docs: 16
			Tkns: 3651
			ATkn: 228.19
		 1
			Docs: 20
			Tkns: 4027
			ATkn: 201.35
literature
	train
		 0
			Docs: 21258
			Tkns: 2211204
			ATkn: 104.02
		 1
			Docs: 67264
			Tkns: 6530678
			ATkn: 97.09
	valid
		 0
			Docs: 1000
			Tkns: 104882
			ATkn: 104.88
		 1
			Docs: 1000
			Tkns: 94797
			ATkn: 94.80
	test
		 0
			Docs: 21
			Tkns: 2128
			ATkn: 101.33
		 1
			Docs: 15
			Tkns: 1682
			ATkn: 112.13
legal
	train
		 0
			Docs: 460785
			Tkns: 38965551
			ATkn: 84.56
		 1
			Docs: 3631
			Tkns: 306088
			ATkn: 84.30
	valid
		 0
			Docs: 1000
			Tkns: 84857
			ATkn: 84.86
		 1
			Docs: 1000
			Tkns: 85523
			ATkn: 85.52
	test
		 0
			Docs: 21
			Tkns: 1591
			ATkn: 75.76
		 1
			Docs: 16
			Tkns: 1080
			ATkn: 67.50
politics
	train
		 0


Filter (num_proc=28): 100%|██████████| 3809/3809 [00:00<00:00, 12509.92 examples/s]


			Docs: 771
			Tkns: 160551
			ATkn: 208.24
		 1


Filter (num_proc=28): 100%|██████████| 3809/3809 [00:00<00:00, 12435.92 examples/s]


			Docs: 3038
			Tkns: 1394623
			ATkn: 459.06
	valid
		 0


Filter (num_proc=28): 100%|██████████| 2000/2000 [00:00<00:00, 6692.55 examples/s]


			Docs: 1000
			Tkns: 197962
			ATkn: 197.96
		 1


Filter (num_proc=28): 100%|██████████| 2000/2000 [00:00<00:00, 6515.32 examples/s] 


			Docs: 1000
			Tkns: 450875
			ATkn: 450.88
	test
		 0


Filter (num_proc=28): 100%|██████████| 48/48 [00:00<00:00, 157.55 examples/s]


			Docs: 19
			Tkns: 6259
			ATkn: 329.42
		 1


Filter (num_proc=28): 100%|██████████| 48/48 [00:00<00:00, 157.87 examples/s]


			Docs: 29
			Tkns: 10764
			ATkn: 371.17
web
	train
		 0


Filter (num_proc=28): 100%|██████████| 138867/138867 [00:00<00:00, 449004.14 examples/s]


			Docs: 48751
			Tkns: 25797316
			ATkn: 529.16
		 1


Filter (num_proc=28): 100%|██████████| 138867/138867 [00:00<00:00, 435875.51 examples/s]


			Docs: 90116
			Tkns: 50733155
			ATkn: 562.98
	valid
		 0


Filter (num_proc=28): 100%|██████████| 2000/2000 [00:00<00:00, 6333.07 examples/s]


			Docs: 1000
			Tkns: 464462
			ATkn: 464.46
		 1


Filter (num_proc=28): 100%|██████████| 2000/2000 [00:00<00:00, 6388.32 examples/s] 


			Docs: 1000
			Tkns: 510569
			ATkn: 510.57
	test
		 0


Filter (num_proc=28): 100%|██████████| 34/34 [00:00<00:00, 109.92 examples/s]

			Docs: 17
			Tkns: 8868
			ATkn: 521.65
		 1



Filter (num_proc=28): 100%|██████████| 34/34 [00:00<00:00, 108.96 examples/s]


			Docs: 17
			Tkns: 8993
			ATkn: 529.00
social_media
	train
		 0


Filter (num_proc=28): 100%|██████████| 2018904/2018904 [00:00<00:00, 3822143.21 examples/s]


			Docs: 2013739
			Tkns: 75933432
			ATkn: 37.71
		 1


Filter (num_proc=28): 100%|██████████| 2018904/2018904 [00:00<00:00, 4249184.01 examples/s]


			Docs: 5165
			Tkns: 205038
			ATkn: 39.70
	valid
		 0


Filter (num_proc=28): 100%|██████████| 2000/2000 [00:00<00:00, 6922.13 examples/s]


			Docs: 1000
			Tkns: 37938
			ATkn: 37.94
		 1


Filter (num_proc=28): 100%|██████████| 2000/2000 [00:00<00:00, 6655.61 examples/s]


			Docs: 1000
			Tkns: 41259
			ATkn: 41.26
	test
		 0


Filter (num_proc=28): 100%|██████████| 28/28 [00:00<00:00, 92.93 examples/s]


			Docs: 15
			Tkns: 1322
			ATkn: 88.13
		 1


Filter (num_proc=28): 100%|██████████| 28/28 [00:00<00:00, 94.50 examples/s]


			Docs: 13
			Tkns: 418
			ATkn: 32.15
