In [3]:
from datasets import load_dataset

# Fetch the SST-2 task of the GLUE benchmark, one call per split.
dataset_name = "glue"
task_name = "sst2"
train_set = load_dataset(path=dataset_name, name=task_name, split="train")
validation_set = load_dataset(path=dataset_name, name=task_name, split="validation")

In [5]:
def create_label_text(label):
    """Translate a numeric sentiment label into its text name.

    Args:
        label: 0 (negative) or 1 (positive).

    Returns:
        A one-entry dict ``{'label_text': <name>}``, suitable as the return
        value of a ``Dataset.map`` function that adds a new column.

    Raises:
        KeyError: if ``label`` is neither 0 nor 1.
    """
    names_by_id = {1: 'positive', 0: 'negative'}
    return dict(label_text=names_by_id[label])

In [7]:
# Add a 'label_text' column holding the text name of each numeric label.
# Only the 'label' column is handed to the mapping function.
train_set = train_set.map(function=create_label_text, input_columns=["label"])
validation_set = validation_set.map(function=create_label_text, input_columns=["label"])

Map: 100%|██████████| 872/872 [00:00<00:00, 1679.66 examples/s]


In [8]:
# Inspect the first training example to check that the 'label_text' column was added.
train_set[0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0,
 'label_text': 'negative'}

In [21]:
def convert_into_prompt_template(system_prompt, user_message, train=True, label_text=""):
    """Render one example in the ``[INST]`` / ``<<SYS>>`` prompt layout.

    Args:
        system_prompt: Text placed between ``<<SYS>>`` and ``<</SYS>>``.
        user_message: The sentence to classify; inserted after ``Sentence: ``.
        train: When truthy, the answer and the closing ``</s>`` are appended
            after ``[/INST]``. Otherwise the text stops right after
            ``[/INST] `` so that the answer is left open.
        label_text: The answer to append in training mode (ignored otherwise).

    Returns:
        The formatted prompt string.
    """
    # Everything up to and including the open answer slot is common to both modes.
    prompt = (
        f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
        f"Sentence: {user_message}\nSentiment: [/INST] "
    )
    if not train:
        return prompt

    # Training examples carry the expected answer followed by the end marker.
    return prompt + f"{label_text} </s>"

In [22]:
# Preview the training-style prompt for the first training sentence.
# NOTE: the label is passed by hand as "positive"; it is not taken from the
# example's own 'label_text' field (which is 'negative' for train_set[0]).
print(convert_into_prompt_template(
    system_prompt="You are a helpful, respectful and honest sentiment analysis assistant. And you are supposed to classify the sentiment of the sentence into one of the following categories: 'positive' or 'negative'.",
    user_message=train_set[0]['sentence'],
    train=True,
    label_text="positive",
))

<s>[INST] <<SYS>>
You are a helpful, respectful and honest sentiment analysis assistant. And you are supposed to classify the sentiment of the sentence into one of the following categories: 'positive' or 'negative'.
<</SYS>>

Sentence: hide new secretions from the parental units 
Sentiment: [/INST] positive </s>


In [29]:
def map_dataset(system_prompt, dataset, train=True):
    """Turn every row of ``dataset`` into a prompt-formatted ``text`` field.

    Args:
        system_prompt: System instruction embedded in each prompt.
        dataset: Object exposing a ``map`` method with the Hugging Face
            ``Dataset.map`` keyword interface; rows are read through the
            ``sentence`` and ``label_text`` columns.
        train: Forwarded to ``convert_into_prompt_template``; controls whether
            the answer is appended to the prompt.

    Returns:
        Whatever ``dataset.map`` returns. In this notebook the result has the
        features ``label_text`` and ``text``.
    """
    # Columns of the incoming dataset that are listed for removal.
    # 'label_text' is produced again by the row function below.
    source_columns = ['sentence', 'label', 'idx', 'label_text']

    def convert(sentence, label_text):
        # One row in, one row out: the rendered prompt plus the text label.
        return {
            'text': convert_into_prompt_template(system_prompt, sentence, train, label_text),
            'label_text': label_text,
        }

    return dataset.map(
        convert,
        input_columns=['sentence', 'label_text'],
        batched=False,
        remove_columns=source_columns,
    )

In [33]:
# Instruction placed inside the <<SYS>> section of every generated prompt.
system_prompt = "You are a helpful, respectful and honest sentiment analysis assistant. And you are supposed to classify the sentiment of the sentence into one of the following categories: 'positive' or 'negative'."
# Training rows include the answer after [/INST]; validation rows stop right after it.
new_train = map_dataset(system_prompt=system_prompt, dataset=train_set, train=True)
new_validation = map_dataset(system_prompt=system_prompt, dataset=validation_set, train=False)

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map: 100%|██████████| 872/872 [00:00<00:00, 4812.75 examples/s]


In [34]:
# Show the first rendered prompt of each split, one after the other.
print(new_train[0]["text"], new_validation[0]["text"], sep="\n")

<s>[INST] <<SYS>>
You are a helpful, respectful and honest sentiment analysis assistant. And you are supposed to classify the sentiment of the sentence into one of the following categories: 'positive' or 'negative'.
<</SYS>>

Sentence: hide new secretions from the parental units 
Sentiment: [/INST] negative </s>
<s>[INST] <<SYS>>
You are a helpful, respectful and honest sentiment analysis assistant. And you are supposed to classify the sentiment of the sentence into one of the following categories: 'positive' or 'negative'.
<</SYS>>

Sentence: it 's a charming and often affecting journey . 
Sentiment: [/INST] 


In [35]:
from datasets import DatasetDict

# Bundle the two converted splits under their split names.
dataset_to_upload = DatasetDict(train=new_train, validation=new_validation)

In [36]:
# Display the assembled splits (features and row counts) before uploading.
dataset_to_upload

DatasetDict({
    train: Dataset({
        features: ['label_text', 'text'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['label_text', 'text'],
        num_rows: 872
    })
})

In [38]:
# Upload both splits to the Hub dataset repository named below.
# NOTE(review): presumably requires being logged in with write access to this namespace — confirm.
dataset_to_upload.push_to_hub("OneFly7/llama2-SST2-SFT-with-system-prompt")

Creating parquet from Arrow format: 100%|██████████| 68/68 [00:00<00:00, 506.08ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:02<00:00,  2.24s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 222.37ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.05it/s]
