In [1]:
import multiprocessing as mp
import datasets

import numpy as np
from nltk.tokenize import word_tokenize
from ptvid.constants import DOMAINS, DATASET_NAME

N_PROC = mp.cpu_count()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def count_tokens(example):
    n_tokens = len(word_tokenize(example["text"], language="portuguese"))
    example["n_tokens"] = n_tokens
    return example

In [12]:
dataset = datasets.concatenate_datasets([datasets.load_dataset(DATASET_NAME, domain, split=split) for domain in DOMAINS for split in ["train", "valid", "test"]])
dataset = dataset.map(count_tokens, num_proc=N_PROC)
    
for label in [0, 1]:
    print("\t\t", label)
    lsdata = dataset.filter(lambda x: x["label"] == label, num_proc=N_PROC)
    n_docs = len(lsdata)
    print(f"\t\t\tDocs: {n_docs}")
    n_tokens = sum(lsdata['n_tokens'])
    print(f"\t\t\tTkns: {n_tokens}")
    print(f"\t\t\tmTkns: {min(lsdata['n_tokens'])}")
    print(f"\t\t\tMTkns: {max(lsdata['n_tokens'])}")
    print(f"\t\t\tATkn: {n_tokens/n_docs:.02f}")
    print(f"\t\tStdTkn: {np.std(lsdata['n_tokens']):.02f}")


Map (num_proc=28): 100%|██████████| 7304438/7304438 [02:45<00:00, 44258.29 examples/s] 


		 0


Filter (num_proc=28): 100%|██████████| 7304438/7304438 [00:00<00:00, 9443693.93 examples/s] 


			Docs: 6859951
			Tkns: 418642791
			mTkns: 6
			MTkns: 2042
			ATkn: 61.03
		StdTkn: 74.03
		 1


Filter (num_proc=28): 100%|██████████| 7304438/7304438 [00:00<00:00, 9860270.41 examples/s] 


			Docs: 444487
			Tkns: 56125135
			mTkns: 6
			MTkns: 2075
			ATkn: 126.27
		StdTkn: 205.59


In [10]:
for domain in DOMAINS:
    print(domain)
    dataset = datasets.load_dataset(DATASET_NAME, domain)
    dataset = dataset.map(count_tokens, num_proc=N_PROC)
    sdata = datasets.concatenate_datasets([dataset[split] for split in ["train", "valid", "test"]])

    for label in [0, 1]:
        print("\t\t", label)
        lsdata = sdata.filter(lambda x: x["label"] == label, num_proc=N_PROC)
        n_docs = len(lsdata)
        print(f"\t\t\tDocs: {n_docs}")
        n_tokens = sum(lsdata['n_tokens'])
        print(f"\t\t\tTkns: {n_tokens}")
        print(f"\t\t\tmTkns: {min(lsdata['n_tokens'])}")
        print(f"\t\t\tMTkns: {max(lsdata['n_tokens'])}")
        print(f"\t\t\tATkn: {n_tokens/n_docs:.02f}")
        print(f"\t\tStdTkn: {np.std(lsdata['n_tokens']):.02f}")


journalistic


		 0
			Docs: 1443422
			Tkns: 189506320
			mTkns: 16
			MTkns: 475
			ATkn: 131.29
		StdTkn: 61.45
		 1
			Docs: 333903
			Tkns: 27077538
			mTkns: 18
			MTkns: 560
			ATkn: 81.09
		StdTkn: 39.11
literature
		 0
			Docs: 24090
			Tkns: 1859660
			mTkns: 16
			MTkns: 186
			ATkn: 77.20
		StdTkn: 37.39
		 1
			Docs: 52458
			Tkns: 3805896
			mTkns: 17
			MTkns: 185
			ATkn: 72.55
		StdTkn: 36.19
legal
		 0
			Docs: 2957980
			Tkns: 152717737
			mTkns: 16
			MTkns: 139
			ATkn: 51.63
		StdTkn: 24.43
		 1
			Docs: 4653
			Tkns: 221167
			mTkns: 20
			MTkns: 124
			ATkn: 47.53
		StdTkn: 22.11
politics
		 0
			Docs: 27887
			Tkns: 7203739
			mTkns: 20
			MTkns: 798
			ATkn: 258.32
		StdTkn: 173.39
		 1
			Docs: 3656
			Tkns: 1012586
			mTkns: 21
			MTkns: 796
			ATkn: 276.97
		StdTkn: 177.60
web
		 0
			Docs: 43630
			Tkns: 22598587
			mTkns: 22
			MTkns: 2042
			ATkn: 517.96
		StdTkn: 414.72
		 1
			Docs: 44313
			Tkns: 23913771
			mTkns: 15
			MTkns: 2075
			ATkn: 539.66
		StdTkn: 463.16


In [7]:
for domain in DOMAINS:
    print(domain)
    dataset = datasets.load_dataset(DATASET_NAME, domain)
    dataset = dataset.map(count_tokens, num_proc=N_PROC)
    
    data = datasets.concatenate_datasets([dataset[split] for split in ["train", "valid", "test"]])
    dataset = dataset.map(count_tokens, num_proc=N_PROC)

    n_docs = len(data)
    n_tokens = sum(data["n_tokens"])
    min_tokens = min(data["n_tokens"])
    max_tokens = max(data["n_tokens"])
    avg_tokens = n_tokens / n_docs 

    print(f"\t\tDocs: {n_docs}")
    print(f"\t\tTkns: {n_tokens}")
    print(f"\t\tmTkns: {min_tokens}")
    print(f"\t\tMTkns: {max_tokens}")
    print(f"\t\tATkn: {avg_tokens:.02f}")
    print(f"\t\tStdTkn: {np.std(data['n_tokens']):.02f}")

journalistic


		Docs: 1777325
		Tkns: 216583858
		mTkns: 16
		MTkns: 560
		ATkn: 121.86
		StdTkn: 61.15
literature
		Docs: 76548
		Tkns: 5665556
		mTkns: 16
		MTkns: 186
		ATkn: 74.01
		StdTkn: 36.63
legal
		Docs: 2962633
		Tkns: 152938904
		mTkns: 16
		MTkns: 139
		ATkn: 51.62
		StdTkn: 24.42
politics
		Docs: 31543
		Tkns: 8216325
		mTkns: 20
		MTkns: 798
		ATkn: 260.48
		StdTkn: 173.98
web
		Docs: 87943
		Tkns: 46512358
		mTkns: 15
		MTkns: 2075
		ATkn: 528.89
		StdTkn: 439.93
social_media
		Docs: 2368446
		Tkns: 44850925
		mTkns: 6
		MTkns: 646
		ATkn: 18.94
		StdTkn: 9.86


In [5]:
np.std(n_tokens)

0.0

In [6]:
for domain in DOMAINS:
    print(domain)
    dataset = datasets.load_dataset(DATASET_NAME, domain)
    dataset = dataset.map(count_tokens, num_proc=N_PROC)
    
    data = datasets.concatenate_datasets([dataset[split] for split in ["train", "valid", "test"]])
    dataset = dataset.map(count_tokens, num_proc=N_PROC)

    n_docs = len(data)
    n_tokens = sum(data["n_tokens"])
    min_tokens = min(data["n_tokens"])
    max_tokens = max(data["n_tokens"])
    avg_tokens = n_tokens / n_docs 

journalistic


literature
legal
politics
web
social_media
