In [1]:
from datasets import load_dataset

# Load the train and validation splits of the configured dataset/task.
# Both calls go through the same two identifiers so that changing
# dataset_name or task_name can never leave the splits pointing at
# different sources.
dataset_name = "glue"
task_name = "sst2"
dataset = load_dataset(dataset_name, task_name, split="train")
validation_set = load_dataset(dataset_name, task_name, split="validation")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_label_text(label):
    """Translate a numeric sentiment label into its text name.

    Args:
        label: Class id; 0 maps to 'negative' and 1 maps to 'positive'.

    Returns:
        A single-entry dict of the form {'label_text': <name>}.

    Raises:
        KeyError: If ``label`` is not one of the two known ids.
    """
    sentiment_names = {0: 'negative', 1: 'positive'}
    sentiment = sentiment_names[label]
    return {'label_text': sentiment}

In [3]:
# Convert numeric labels to text labels.
# Each split's 'label' column is passed through create_label_text, which
# returns a 'label_text' entry for every row.
dataset = dataset.map(create_label_text, input_columns=['label'])
validation_set = validation_set.map(create_label_text, input_columns=['label'])

In [4]:
# Show the first training example to check that 'label_text' is present.
dataset[0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0,
 'label_text': 'negative'}

In [5]:
def convert_into_prompt_template(user_message, train=True, label_text=""):
    """Wrap a sentence in the instruction-style sentiment prompt.

    Args:
        user_message: The sentence to classify; inserted after "Sentence: ".
        train: When truthy, the label and the closing end-of-sequence
            markers are appended after the instruction block. When falsy,
            the prompt stops right after the instruction block so the
            answer is left open.
        label_text: Text label appended in training mode; ignored otherwise.

    Returns:
        The formatted prompt string.
    """
    # Inference-style prompt: nothing follows the closing [/INST] tag.
    if not train:
        return f"<s>[INST] Sentence: {user_message} \nSentiment: [/INST] "

    # Training-style prompt: the answer and terminators are part of the text.
    return f"<s>[INST] Sentence: {user_message} \nSentiment: [/INST] {label_text} </s></s>"

In [6]:
# Print one training-style prompt as a quick visual check of the template.
# NOTE(review): the saved output of this cell lacks the "<s>[INST] ... [/INST]"
# wrapper that the function as currently written produces; it looks like the
# cell was last run against a different version of the function. Re-run from
# a fresh kernel to confirm.
print(convert_into_prompt_template("I love you", train=True, label_text="positive"))

Sentence: I love you 
Sentiment: positive


In [7]:
def map_dataset(dataset, train=True):
    """Turn a labelled split into prompt-formatted records.

    Every row's 'sentence' and 'label_text' values are fed to
    convert_into_prompt_template, and the row is replaced by a record
    holding the resulting 'text' together with the unchanged 'label_text'.

    Args:
        dataset: A split exposing ``map`` with the columns 'sentence',
            'label', 'idx' and 'label_text'.
        train: Forwarded to convert_into_prompt_template to choose between
            the training-style and inference-style prompt.

    Returns:
        Whatever ``dataset.map`` returns for the per-row conversion.
    """
    # Original columns to drop; 'label_text' is returned again by the
    # per-row function below even though it is listed here.
    source_columns = ['sentence', 'label', 'idx', 'label_text']

    def to_prompt_record(sentence, label_text):
        prompt = convert_into_prompt_template(sentence, train, label_text)
        return {'text': prompt, 'label_text': label_text}

    return dataset.map(
        to_prompt_record,
        input_columns=['sentence', 'label_text'],
        batched=False,
        remove_columns=source_columns,
    )

In [8]:
# Build the prompt-formatted splits. The training split is mapped with
# train=True and the validation split with train=False, so only the training
# text has the label appended after the prompt.
new_dataset = map_dataset(dataset, train=True)
new_validation_dataset = map_dataset(validation_set, train=False)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map: 100%|██████████| 67349/67349 [00:09<00:00, 6736.77 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 4668.29 examples/s]


In [9]:
# Show the first converted training record.
new_dataset[0]

{'label_text': 'negative',
 'text': 'Sentence: hide new secretions from the parental units  \nSentiment: negative'}

In [10]:
# Show the first converted validation record.
new_validation_dataset[0]

{'label_text': 'positive',
 'text': "Sentence: it 's a charming and often affecting journey .  \nSentiment: "}

In [11]:
from datasets import DatasetDict

# Bundle the two converted splits into one DatasetDict, keyed by the split
# names 'train' and 'validation'.
dataset_to_upload = DatasetDict({
    'train': new_dataset,
    'validation': new_validation_dataset
})

In [12]:
# Display the bundled splits (columns and row counts) before uploading.
dataset_to_upload

DatasetDict({
    train: Dataset({
        features: ['label_text', 'text'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['label_text', 'text'],
        num_rows: 872
    })
})

In [13]:
# Upload both splits to the Hub dataset repository named below.
# NOTE(review): the repository id is hard-coded; presumably this needs an
# authenticated session with write access to that repository. Confirm
# before re-running.
dataset_to_upload.push_to_hub("OneFly7/llama2-SST2-no-template")

Creating parquet from Arrow format: 100%|██████████| 68/68 [00:00<00:00, 591.28ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:02<00:00,  2.04s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 236.23ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.14it/s]
