In [26]:
from datasets import load_dataset, load_from_disk, concatenate_datasets, Dataset, DatasetDict
from transformers import T5TokenizerFast, AutoTokenizer
from huggingface_hub import notebook_login, create_repo, delete_repo
import os

# Working directory of the kernel at start-up. Later cells use it as the
# location where the combined dataset is saved and from which the dataset
# and tokenizer are reloaded.
current_directory = os.getcwd()

In [27]:
# Pretrained t5-small tokenizer: serves as the template from which the new
# tokenizer is trained, and as the baseline in the tokenization comparison.
old_tokenizer = T5TokenizerFast.from_pretrained('t5-small')
# Stock-news dataset; the cells below read its 'train' and 'test' splits.
ds = load_dataset('oliverwang15/us_stock_news_with_price')

In [28]:
# Pull the two splits out of the DatasetDict so they can be processed separately.
train_ds, test_ds = ds['train'], ds['test']

In [29]:
# Columns dropped from both splits: the article metadata plus every
# ts_-30 ... ts_15 column. The list is built once (instead of being pasted
# twice) so the train and test splits are guaranteed to drop the same columns.
columns_to_drop = ['date', 'title', 'trading_date', 'exact_trading_date']
columns_to_drop += [f'ts_{offset}' for offset in range(-30, 16)]

train_ds = train_ds.remove_columns(columns_to_drop)

test_ds = test_ds.remove_columns(columns_to_drop)

In [30]:
def concat_content(example):
    """Build the ``text`` column for one batch of rows.

    Args:
        example: A batch mapping with parallel lists under the keys
            ``'stock'`` and ``'content'`` (the function is applied with
            ``batched=True``).

    Returns:
        ``{'text': [...]}`` with one string per row, formed as
        ``"<stock>, <content>"`` where every newline in the content has been
        replaced by a single space.
    """
    merged = [
        stock_name + ', ' + body.replace('\n', ' ')
        for stock_name, body in zip(example['stock'], example['content'])
    ]
    return {'text': merged}

# Replace every original column with the single merged 'text' column.
# Note: `ds` is rebound here from the full DatasetDict to the processed
# train split; the concatenation cell below relies on that.
ds = train_ds.map(
    concat_content,
    batched=True,
    remove_columns=train_ds.column_names,
)
test_ds = test_ds.map(
    concat_content,
    batched=True,
    remove_columns=test_ds.column_names,
)

In [31]:
# Second corpus: only its 'train' split is loaded.
ds2 = load_dataset('hyunjaehyun/cc_news_stocks_economy_finance', split='train')

In [32]:
# Align the second corpus with the first: drop 'categories' and expose the
# article body under the shared column name 'text'.
ds2 = ds2.remove_columns('categories').rename_column('plain_text', 'text')

In [33]:
# Stack the processed stock-news train split (`ds`) and the second corpus
# (`ds2`) into one dataset; both expose a 'text' column at this point.
concat_ds = concatenate_datasets([ds, ds2])

In [34]:
def get_training_corpus(dataset=None, batch_size=1000):
    """Lazily yield the corpus as lists of texts, ``batch_size`` rows at a time.

    Rows are sliced first and the ``'text'`` column is selected afterwards, so
    each step only reads the rows of the current batch. Indexing the column
    first (``dataset['text'][i:j]``) would re-read the whole column for every
    batch.

    Args:
        dataset: Object supporting ``len()`` and row slicing that returns a
            mapping with a ``'text'`` entry (e.g. a ``datasets.Dataset``).
            Defaults to the notebook-level ``concat_ds``.
        batch_size: Number of rows per yielded batch. Defaults to 1000.

    Returns:
        A single-use generator of ``list[str]`` batches; the last batch may be
        shorter than ``batch_size``.
    """
    if dataset is None:
        # Resolved at call time so the zero-argument call keeps working.
        dataset = concat_ds
    return (
        dataset[start:start + batch_size]['text']
        for start in range(0, len(dataset), batch_size)
    )

# The batch generator is single-use, so a fresh one is created right before training.
train_corpus = get_training_corpus()
# Train a new tokenizer from the t5-small tokenizer on the combined corpus,
# with a target vocabulary of 52,000 entries.
tokenizer = old_tokenizer.train_new_from_iterator(
    train_corpus,
    vocab_size=52000,
)

# Register a beginning-of-sequence token on the new tokenizer.
# NOTE(review): '<s/>' is an unusual spelling (the conventional form is '<s>') -
# confirm it is intentional before training a model against it.
sos_token = {'bos_token': '<s/>'}
tokenizer.add_special_tokens(sos_token)
# Persist the tokenizer under ./my_tokenizer so later cells can reload it.
tokenizer.save_pretrained('my_tokenizer')

1

In [35]:
# Bundle the merged training corpus and the processed stock-news test split.
combined_dataset = DatasetDict(train=concat_ds, test=test_ds)

# NOTE(review): this writes the dataset files directly into the working
# directory itself (not a sub-folder); the upload cell reloads from the same place.
combined_dataset.save_to_disk(current_directory)

Saving the dataset (0/1 shards):   0%|          | 0/83652 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/22645 [00:00<?, ? examples/s]

In [None]:
# Authenticate against the Hugging Face Hub (interactive prompt).
notebook_login()

# Reload the DatasetDict that was saved into the working directory.
ds = load_from_disk(current_directory)
# exist_ok=True keeps this cell re-runnable: without it create_repo raises
# once the repository already exists.
create_repo("dataset_for_t5_pretraining", repo_type="dataset", exist_ok=True)
ds.push_to_hub("hyunjaehyun/dataset_for_t5_pretraining")

In [40]:
# Reload the published dataset from the Hub.
ds = load_dataset('hyunjaehyun/dataset_for_t5_pretraining')
# os.path.join replaces the literal '\my_tokenizer': '\m' is an invalid escape
# sequence (a warning today, an error in future Python versions) and the
# backslash separator only works on Windows.
tokenizer = AutoTokenizer.from_pretrained(os.path.join(current_directory, 'my_tokenizer'))
# Baseline tokenizer for the side-by-side comparison below.
old_tokenizer = T5TokenizerFast.from_pretrained('t5-small')

In [None]:
# delete_repo(repo_id="hyunjaehyun/dataset_for_t5_pretraining", repo_type="dataset")

In [41]:
# Merge the published train and test splits back into one dataset.
# Note: `ds2` and `concat_ds` are rebound here and no longer refer to the
# objects built in the earlier preparation cells.
ds1, ds2 = ds['train'], ds['test']

concat_ds = concatenate_datasets([ds1, ds2])

In [42]:
# Sample sentence in the same "<stock>, <content>" layout as the corpus rows;
# used below to compare the old and new tokenizers.
text = 'AAPL, RadioShack (RSH +3.7%) should post outsized gains next year, Barclays says, lifting shares to Overweight. Among positive factors: the addition of T-Mobile (DT) as a third wireless carrier; the iPhone (NASDAQ:AAPL); a new branding campaign; and managing wireless kiosks in Target (NYSE:TGT) stores. "We would be accumulating a position at the current level and view the stock as a solid investment for 2010."'

In [43]:
# Baseline: how the stock t5-small tokenizer splits the sample sentence.
old_tokenizer.tokenize(text)

['▁',
 'AA',
 'PL',
 ',',
 '▁Radio',
 'S',
 'hack',
 '▁(',
 'RS',
 'H',
 '▁+',
 '3.7',
 '%)',
 '▁should',
 '▁post',
 '▁out',
 'sized',
 '▁gains',
 '▁next',
 '▁year',
 ',',
 '▁Bar',
 'c',
 'lays',
 '▁says',
 ',',
 '▁lifting',
 '▁shares',
 '▁to',
 '▁Over',
 'weight',
 '.',
 '▁',
 'Among',
 '▁positive',
 '▁factors',
 ':',
 '▁the',
 '▁addition',
 '▁of',
 '▁T',
 '-',
 'Mobile',
 '▁(',
 'DT',
 ')',
 '▁as',
 '▁',
 'a',
 '▁third',
 '▁wireless',
 '▁carrier',
 ';',
 '▁the',
 '▁iPhone',
 '▁(',
 'NASDAQ',
 ':',
 'AA',
 'PL',
 ');',
 '▁',
 'a',
 '▁new',
 '▁branding',
 '▁campaign',
 ';',
 '▁and',
 '▁managing',
 '▁wireless',
 '▁kiosk',
 's',
 '▁in',
 '▁Target',
 '▁(',
 'NYSE',
 ':',
 'T',
 'GT',
 ')',
 '▁stores',
 '.',
 '▁"',
 'We',
 '▁would',
 '▁be',
 '▁',
 'accumul',
 'ating',
 '▁',
 'a',
 '▁position',
 '▁at',
 '▁the',
 '▁current',
 '▁level',
 '▁and',
 '▁view',
 '▁the',
 '▁stock',
 '▁as',
 '▁',
 'a',
 '▁solid',
 '▁investment',
 '▁for',
 '▁2010.',
 '"']

In [44]:
# Same sentence through the tokenizer reloaded from my_tokenizer, for comparison
# with the baseline output above.
tokenizer.tokenize(text)

['▁AAPL',
 ',',
 '▁RadioShack',
 '▁(',
 'RSH',
 '▁+3.7%)',
 '▁should',
 '▁post',
 '▁',
 'outsized',
 '▁gains',
 '▁next',
 '▁year,',
 '▁Barclays',
 '▁says,',
 '▁lifting',
 '▁shares',
 '▁to',
 '▁Overweight.',
 '▁Among',
 '▁positive',
 '▁factors',
 ':',
 '▁the',
 '▁addition',
 '▁of',
 '▁T-Mobile',
 '▁(D',
 'T)',
 '▁as',
 '▁a',
 '▁third',
 '▁wireless',
 '▁carrier',
 ';',
 '▁the',
 '▁iPhone',
 '▁(NASDAQ:AAPL)',
 ';',
 '▁a',
 '▁new',
 '▁branding',
 '▁campaign',
 ';',
 '▁and',
 '▁managing',
 '▁wireless',
 '▁kiosks',
 '▁in',
 '▁Target',
 '▁(NYSE:TGT)',
 '▁stores.',
 '▁"We',
 '▁would',
 '▁be',
 '▁accumulat',
 'ing',
 '▁a',
 '▁position',
 '▁at',
 '▁the',
 '▁current',
 '▁level',
 '▁and',
 '▁view',
 '▁the',
 '▁stock',
 '▁as',
 '▁a',
 '▁solid',
 '▁investment',
 '▁for',
 '▁2010',
 '."']

In [63]:
from tqdm.auto import tqdm

# Tokenize every document with the new tokenizer and flatten the result into
# one long token stream `s`; later cells cut it into fixed-size chunks.
# The loop variable is deliberately not named `text`: that name holds the
# sample sentence used by the tokenizer-comparison cells, and overwriting it
# would make those cells tokenize the last corpus document when re-run.
s = []
for document in tqdm(concat_ds['text']):
    s.extend(tokenizer.tokenize(document))

  0%|          | 0/106297 [00:00<?, ?it/s]

In [69]:
# Total number of tokens in the flattened corpus.
len(s)

20116974

In [116]:
# Cut the flat token stream into consecutive rows of 512 tokens; only the
# final row can come out shorter.
d1 = {
    'text': [s[start:start + 512] for start in tqdm(range(0, len(s), 512))]
}

  0%|          | 0/39291 [00:00<?, ?it/s]

In [117]:
# Length of the final chunk before padding.
len(d1['text'][-1])

494

In [118]:
# Right-pad the final chunk with '<pad>' tokens up to the 512-token row length
# (adds nothing when the chunk is already full, so re-running is harmless).
d1['text'][-1] += ['<pad>'] * (512 - len(d1['text'][-1]))

In [119]:
# Sanity check: the final chunk should now have the full row length.
len(d1['text'][-1])

512

In [121]:
# One row per chunk; the single 'text' column holds the chunk's list of tokens.
dataset = Dataset.from_dict(d1)

In [122]:
# Persist the chunked dataset under ./token_ids_dataset.
# NOTE(review): despite the directory name, the rows hold the token strings
# returned by tokenizer.tokenize, not integer ids - confirm this is intended.
dataset.save_to_disk('token_ids_dataset')

Saving the dataset (0/1 shards):   0%|          | 0/39291 [00:00<?, ? examples/s]

In [123]:
# Reload the chunked dataset. os.path.join picks the platform's path separator,
# so this also works outside Windows (the hard-coded '\\' did not).
dataset = load_from_disk(os.path.join(current_directory, 'token_ids_dataset'))

In [124]:
# 80/10/10 split: hold out 20% of the chunks, then halve the hold-out into
# validation and test. The fixed seed makes the shuffled split reproducible
# across kernel restarts.
train_ds = dataset.train_test_split(test_size=0.2, seed=42)
test_ds = train_ds['test'].train_test_split(test_size=0.5, seed=42)

# Note: `dataset` is rebound from the flat Dataset to a DatasetDict, so re-run
# the load cell above before re-running this cell.
dataset = DatasetDict({
    'train': train_ds['train'],
    'valid': test_ds['train'],
    'test': test_ds['test']
})

In [130]:
# create_repo("token_ids_dataset_for_t5_pretraining", repo_type="dataset")
# dataset.push_to_hub("hyunjaehyun/token_ids_dataset_for_t5_pretraining")