In [1]:
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
from transformers import T5TokenizerFast, AutoTokenizer
from huggingface_hub import notebook_login, create_repo, delete_repo
from tqdm.auto import tqdm
import os
import re

In [2]:
current_directory = os.getcwd()

In [3]:
t5_tokenizer = T5TokenizerFast.from_pretrained('t5-small')
gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
t5_sentinel_tokens = t5_tokenizer.special_tokens_map['additional_special_tokens']

In [5]:
gpt2_tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>'}

In [6]:
special_tokens_to_add = {
    'bos_token': '<|startoftext|>',
    'pad_token': '<pad>',
    'additional_special_tokens': t5_sentinel_tokens
}
gpt2_tokenizer.add_special_tokens(special_tokens_to_add)
gpt2_tokenizer.special_tokens_map

{'bos_token': '<|startoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>',
 'pad_token': '<pad>',
 'additional_special_tokens': ['<extra_id_0>',
  '<extra_id_1>',
  '<extra_id_2>',
  '<extra_id_3>',
  '<extra_id_4>',
  '<extra_id_5>',
  '<extra_id_6>',
  '<extra_id_7>',
  '<extra_id_8>',
  '<extra_id_9>',
  '<extra_id_10>',
  '<extra_id_11>',
  '<extra_id_12>',
  '<extra_id_13>',
  '<extra_id_14>',
  '<extra_id_15>',
  '<extra_id_16>',
  '<extra_id_17>',
  '<extra_id_18>',
  '<extra_id_19>',
  '<extra_id_20>',
  '<extra_id_21>',
  '<extra_id_22>',
  '<extra_id_23>',
  '<extra_id_24>',
  '<extra_id_25>',
  '<extra_id_26>',
  '<extra_id_27>',
  '<extra_id_28>',
  '<extra_id_29>',
  '<extra_id_30>',
  '<extra_id_31>',
  '<extra_id_32>',
  '<extra_id_33>',
  '<extra_id_34>',
  '<extra_id_35>',
  '<extra_id_36>',
  '<extra_id_37>',
  '<extra_id_38>',
  '<extra_id_39>',
  '<extra_id_40>',
  '<extra_id_41>',
  '<extra_id_42>',
  '<extra_id_43>',
  '<extra_id_44>',
  '<extr

In [7]:
vocab = gpt2_tokenizer.get_vocab()
print(len(vocab)); print(vocab['<extra_id_99>']); print(vocab['<extra_id_0>'])

50359
50358
50259


In [8]:
print(vocab['<|startoftext|>']); print(vocab['<|endoftext|>']); print(vocab['<pad>'])

50257
50256
50258


In [9]:
gpt2_tokenizer.save_pretrained('my_gpt2_tokenizer')

('my_gpt2_tokenizer\\tokenizer_config.json',
 'my_gpt2_tokenizer\\special_tokens_map.json',
 'my_gpt2_tokenizer\\vocab.json',
 'my_gpt2_tokenizer\\merges.txt',
 'my_gpt2_tokenizer\\added_tokens.json',
 'my_gpt2_tokenizer\\tokenizer.json')

In [10]:
len(vocab)

50359

In [11]:
tokenizer = AutoTokenizer.from_pretrained(current_directory+'\my_gpt2_tokenizer')

In [31]:
ds = load_dataset('genloop/bloomberg_financial_news_120k', split='train')

In [32]:
print(ds)

Dataset({
    features: ['Headline', 'Journalists', 'Date', 'Link', 'Article'],
    num_rows: 120000
})


In [33]:
ds = ds.remove_columns(['Headline', 'Journalists', 'Date', 'Link'])

In [34]:
ds['Article'][0]

'Marriott International Inc. (MAR) , the largest publicly traded U.S. hotel chain, cut its full-year profit forecast after reporting second-quarter results that were in line with analysts’ estimates. The company expects earnings per share for the third quarter of 42 cents to 46 cents, less than the 49-cent average of 29 analyst estimates compiled by Bloomberg. Marriott projected full-year earnings of $1.92 to $2.03 a share, which would miss both the $2.04 that analysts are estimating and the company’s own previous guidance of $1.93 to $2.08. Slowing demand from business groups and U.S. government cutbacks in travel spending weighed on Bethesda, Maryland-based Marriott’s second-quarter earnings, while weakening hotel use abroad may hamper results for the rest of the year, according to Chief Executive Officer Arne Sorenson. “Looking at group business, we remain encouraged in terms of long-term bookings, but short-term group business weakened in the U.S. as the quarter progressed,” Sorens

In [35]:
def preprocessing_text(examples):
    allowed_chars = r"/.,:;’-+%$?!'\""
    allowed_pattern = f'[^a-zA-Z0-9\s{re.escape(allowed_chars)}]'
    texts = []

    for text in examples['Article']:
        # re.split를 사용하여 'To contact the reporter'를 기준으로 문장을 나눔, 나눈 결과의 첫 번째 요소([0])만 가져와서
        # To contact the reporter 이후 모두 제거
        text = re.split('To contact the reporter', text)[0]
        text = text.lower()
        text = re.sub(r'\s*\([^)]*\)\s*,?', '', text) # 소괄호() 포함 소괄호 안에 있는 문자 제거
        text = text.replace('\n', ' ') # \n을 제거하고 공백으로 대체
        text = re.sub(r'-{2,}', ' ', text) # 연속된 하이폰(-) 2개 이상을 공백으로 대체
        text = re.sub(r'\.{2,}', ' ', text) # 연속된 마침표(.) 2개 이상을 공백으로 대체
        text = re.sub(allowed_pattern, ' ', text) # allowed_pattern외에 제거
        text = re.sub(r'\s+', ' ', text).strip() # 연속되는 공백 하나의 공백으로
        texts.append(text)

    return {'text':texts}


In [36]:
ds = ds.map(preprocessing_text, batched=True, remove_columns=ds.column_names)

In [37]:
s = []
for text in tqdm(ds['text']):
    s.extend(tokenizer.encode(text, add_special_tokens=False))

  0%|          | 0/120000 [00:00<?, ?it/s]

In [38]:
len(s)

68427731

In [39]:
d1 = {'text': []}

for i in tqdm(range(0, len(s), 512)):
    d1['text'].extend([s[i:i+512]])

  0%|          | 0/133648 [00:00<?, ?it/s]

In [40]:
len(d1['text'][-2]), len(d1['text'][-1])

(512, 467)

In [41]:
d1['text'].pop()

[1744,
 4277,
 13,
 1111,
 21824,
 19952,
 329,
 257,
 717,
 6246,
 287,
 1115,
 706,
 7463,
 362,
 13,
 21,
 1411,
 319,
 474,
 2062,
 2808,
 319,
 2328,
 6130,
 561,
 2038,
 284,
 4439,
 281,
 8178,
 13,
 1231,
 281,
 4381,
 11,
 262,
 334,
 13,
 82,
 13,
 743,
 423,
 4277,
 276,
 9439,
 13,
 262,
 2156,
 3352,
 5690,
 319,
 262,
 3953,
 1909,
 11,
 290,
 262,
 34548,
 743,
 1061,
 6050,
 13,
 21824,
 547,
 36675,
 276,
 416,
 1705,
 625,
 257,
 15223,
 4381,
 284,
 2620,
 262,
 5057,
 13387,
 706,
 9825,
 625,
 262,
 5041,
 11,
 424,
 67,
 461,
 1477,
 1437,
 555,
 17187,
 37518,
 12647,
 11,
 281,
 12499,
 379,
 2318,
 565,
 592,
 3139,
 287,
 300,
 3391,
 11,
 531,
 287,
 257,
 989,
 1909,
 13,
 17135,
 329,
 390,
 3273,
 7585,
 8618,
 1105,
 13,
 2425,
 16059,
 11,
 393,
 352,
 13,
 23,
 1411,
 11,
 284,
 720,
 22,
 13,
 26279,
 257,
 1323,
 2978,
 416,
 352,
 25,
 1314,
 279,
 13,
 76,
 13,
 300,
 3391,
 640,
 319,
 262,
 442,
 4549,
 3096,
 286,
 3292,
 13,
 3939,
 278,
 17135,

In [42]:
dataset = Dataset.from_dict(d1)

In [43]:
print(dataset)

Dataset({
    features: ['text'],
    num_rows: 133647
})


In [44]:
train_ds = dataset.train_test_split(test_size=0.2)

dataset = DatasetDict({
    'train': train_ds['train'],
    'valid': train_ds['test'],
})

In [45]:
dataset.save_to_disk('token_ids_dataset_for_t5_pretraining')
dataset = load_from_disk(current_directory+'\\token_ids_dataset_for_t5_pretraining')

Saving the dataset (0/1 shards):   0%|          | 0/106917 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/26730 [00:00<?, ? examples/s]

In [None]:
create_repo("token_ids_dataset_for_t5_pretraining", repo_type="dataset")
dataset.push_to_hub("hyunjaehyun/token_ids_dataset_for_t5_pretraining")

In [48]:
#from huggingface_hub import delete_repo
#delete_repo(repo_id="hyunjaehyun/token_ids_dataset_for_t5_pretraining", repo_type="dataset")

In [None]:
여기까지가 찐막으로 만든 데이터셋, 아래는 이전에 실험용으로 한 것.
그러니 깃허브 올릴 때에는 이 파일 복사한 다음에, 아랫 부분 제거해서 올릴 것

In [18]:
import nltk

In [19]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
d1 = {'text': []}

for text in tqdm(ds['text']):
    sentences = nltk.sent_tokenize(text)

    chunk_ids_list = []

    for sentence in sentences:
        sentence_ids = tokenizer.encode(sentence, add_special_tokens=False)

        if len(chunk_ids_list) + len(sentence_ids) > 512:
            if chunk_ids_list:
                d1['text'].append(chunk_ids_list)
            chunk_ids_list = sentence_ids
        else:
            chunk_ids_list.extend(sentence_ids)

    if chunk_ids_list: # 남은게 있으면
        d1['text'].append(chunk_ids_list)


  0%|          | 0/120000 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1602 > 1024). Running this sequence through the model will result in indexing errors


In [21]:
len(d1['text'])

200466

In [22]:
temp = []
for text in d1['text']:
    temp.append(text)

In [23]:
i = 0
num_lists = len(temp)
recompressed = []

while i < num_lists:
    current_list = list(temp[i])
    i += 1

    while i < num_lists:
        next_list = temp[i]
        if len(current_list) + len(next_list) <= 512:
            current_list.extend(next_list)
            i += 1
        else:
            break
    recompressed.append(current_list)

d2 = {
    'text': recompressed,
}

In [24]:
len(d2['text'])

166472

In [203]:
dataset = Dataset.from_dict(d2)

In [204]:
train_ds = dataset.train_test_split(test_size=0.2)

dataset = DatasetDict({
    'train': train_ds['train'],
    'valid': train_ds['test'],
})

In [205]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 149668
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 37417
    })
})


In [206]:
dataset.save_to_disk('token_ids_dataset_for_t5_pretraining')
dataset = load_from_disk(current_directory+'\\token_ids_dataset_for_t5_pretraining')

Saving the dataset (0/1 shards):   0%|          | 0/149668 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/37417 [00:00<?, ? examples/s]

In [None]:
#create_repo("token_ids_dataset_for_t5_pretraining", repo_type="dataset")
#dataset.push_to_hub("hyunjaehyun/token_ids_dataset_for_t5_pretraining")

In [128]:
#from huggingface_hub import delete_repo
#delete_repo(repo_id="hyunjaehyun/token_ids_dataset_for_t5_pretraining", repo_type="dataset")