In [95]:
from datasets import load_dataset, load_from_disk, DatasetDict
from transformers import AutoTokenizer
from huggingface_hub import create_repo
import os
import re

current_directory = os.getcwd()

In [96]:
tokenizer = AutoTokenizer.from_pretrained(current_directory+'\my_gpt2_tokenizer')

In [97]:
ds = load_dataset('ugursa/Yahoo-Finance-News-Sentences', split='train') # sentiment analysis # multiclass classification

In [98]:
def preprocessing_ds1_text(examples):
    allowed_chars = r"/.,:;’-+%$?!'\""
    allowed_pattern = f'[^a-zA-Z0-9\s{re.escape(allowed_chars)}]'

    prefix = 'sentiment: '
    label_map = {0: 'bullish', 1: 'neutral', 2: 'bearish'} # 0은 bullish(강세), 1은 neutral(중립), 2는 bearish(약세)

    texts = []
    labels = []

    for text, label in zip(examples['text'], examples['label']):
        text = text.lower()
        text = re.sub(r'\s*\([^)]*\)\s*,?', '', text)
        text = re.sub(r'-{2,}', ' ', text)
        text = re.sub(r'\.{2,}', ' ', text)
        text = re.sub(allowed_pattern, ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()

        text = prefix + text

        token_ids = tokenizer.encode(text, add_special_tokens=False)
        if len(token_ids) <= 512:
            texts.append(text)
            labels.append(label_map[label])

    return {'text':texts, 'labels':labels}

In [99]:
ds = ds.map(preprocessing_ds1_text, batched=True, remove_columns=ds.column_names)

In [100]:
print(ds)

Dataset({
    features: ['text', 'labels'],
    num_rows: 25032
})


In [101]:
def tokenize_function(examples):
    text = tokenizer(examples['text'], max_length=512, truncation=True, padding='max_length', return_attention_mask=False)
    labels = tokenizer(text_target=examples['labels'], max_length=2, padding='max_length', return_attention_mask=False)
    examples['labels'] = labels['input_ids']
    return {
        'text': text['input_ids'],
        'labels': labels['input_ids']
    }

In [102]:
dataset = ds.map(tokenize_function, batched=True, remove_columns=['text'])

In [103]:
print(dataset)

Dataset({
    features: ['text', 'labels'],
    num_rows: 25032
})


In [None]:
def tokenize_function(examples):
    text = tokenizer(examples['text'], max_length=512, truncation=True, padding='max_length', return_attention_mask=False)
    labels = tokenizer(text_target=examples['labels'], max_length=2, padding='max_length', return_attention_mask=False)
    examples['labels'] = labels['input_ids']
    return {
        'text': text['input_ids'],
        'labels': labels['input_ids']
    }

In [107]:
ds['labels'][0], ds['labels'][4], ds['labels'][12]

('bullish', 'neutral', 'bearish')

In [109]:
def tokenize_function2(examples):
    labels_map = {'bullish': 0, 'neutral': 1, 'bearish': 2}

    text = tokenizer(examples['text'], max_length=512, truncation=True, padding='max_length', return_attention_mask=False)
    text['labels'] = [labels_map[i] for i in examples['labels']]
    return text

In [110]:
dataset2 = ds.map(tokenize_function2, batched=True, remove_columns=['text'])

Map:   0%|          | 0/25032 [00:00<?, ? examples/s]

In [111]:
print(dataset2)

Dataset({
    features: ['labels', 'input_ids'],
    num_rows: 25032
})


In [113]:
print(ds['labels'][0]); print(dataset['labels'][0]); print(dataset2['labels'][0])

bullish
[16308, 680]
0


In [85]:
  # split train 70%/valid 15%/test 15%
train_test_valid = dataset.train_test_split(shuffle=True, seed=42, test_size=0.3)
test_valid = train_test_valid['test'].train_test_split(shuffle=True, seed=42, test_size=0.50)

dataset = DatasetDict({
    'train': train_test_valid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']
})

In [86]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 17522
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 3755
    })
    valid: Dataset({
        features: ['text', 'labels'],
        num_rows: 3755
    })
})


In [88]:
dataset.save_to_disk('token_ids_dataset_for_t5_finetuning')
dataset = load_from_disk(current_directory+'\\token_ids_dataset_for_t5_finetuning')

Saving the dataset (0/1 shards):   0%|          | 0/17522 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3755 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3755 [00:00<?, ? examples/s]

In [89]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 17522
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 3755
    })
    valid: Dataset({
        features: ['text', 'labels'],
        num_rows: 3755
    })
})


In [115]:
train_test_valid = dataset2.train_test_split(shuffle=True, seed=42, test_size=0.3)
test_valid = train_test_valid['test'].train_test_split(shuffle=True, seed=42, test_size=0.50)

dataset2 = DatasetDict({
    'train': train_test_valid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']
})

In [118]:
dataset.save_to_disk('token_ids_dataset_for_t5_finetuning2')
dataset = load_from_disk(current_directory+'\\token_ids_dataset_for_t5_finetuning2')

Saving the dataset (0/1 shards):   0%|          | 0/25032 [00:00<?, ? examples/s]

In [119]:
print(dataset2)

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids'],
        num_rows: 17522
    })
    test: Dataset({
        features: ['labels', 'input_ids'],
        num_rows: 3755
    })
    valid: Dataset({
        features: ['labels', 'input_ids'],
        num_rows: 3755
    })
})


In [126]:
print(dataset2['train']['labels'][0])

1


In [94]:
#create_repo("token_ids_dataset_for_t5_finetuning", repo_type="dataset")
#dataset.push_to_hub("hyunjaehyun/token_ids_dataset_for_t5_finetuning")

In [129]:
#create_repo("token_ids_dataset_for_t5_finetuning2", repo_type="dataset")
#dataset2.push_to_hub("hyunjaehyun/token_ids_dataset_for_t5_finetuning2")

In [127]:
#from huggingface_hub import delete_repo
#delete_repo(repo_id="hyunjaehyun/token_ids_dataset_for_t5_finetuning2", repo_type="dataset")