In [1]:
print("Test Scucessfully!")

Test Scucessfully!


# Load Dataset and Model

In [2]:
from datasets import load_dataset

# Load Dataset
dataset_name = "OneFly7/llama2-sst2-training"
dataset = load_dataset(dataset_name, split="train")

Downloading readme:   0%|          | 0.00/365 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /home/9130/.cache/huggingface/datasets/OneFly7___parquet/OneFly7--llama2-sst2-training-c626d3d2fa7dfa86/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/4.29M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /home/9130/.cache/huggingface/datasets/OneFly7___parquet/OneFly7--llama2-sst2-training-c626d3d2fa7dfa86/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

## Version 2-7b for finetuning
base_model_name = "meta-llama/Llama-2-7b-hf"
# Version 2-13b-chat for few-shot inference
# base_model_name = "meta-llama/Llama-2-13b-chat-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

device_map = {"": 0}

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
    use_auth_token=True
)

base_model.config.use_cache = False

# More info: https://github.com/huggingface/transformers/pull/24906
base_model.config.pretraining_tp = 1 



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
from peft import LoraConfig
from trl import SFTTrainer

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

output_dir = "./results"

In [7]:
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    logging_steps=10,
    max_steps=500
)

max_seq_length = 512

trainer = SFTTrainer(
    model=base_model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_args,
)



Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

In [8]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,3.0835
20,1.5114
30,0.6752
40,0.4987
50,0.5359
60,0.5623
70,0.5244
80,0.485
90,0.506
100,0.5226


TrainOutput(global_step=500, training_loss=0.5854537553787231, metrics={'train_runtime': 2480.1338, 'train_samples_per_second': 3.226, 'train_steps_per_second': 0.202, 'total_flos': 1.60921560416256e+16, 'train_loss': 0.5854537553787231, 'epoch': 0.12})

In [11]:
system_prompt = "You are a helpful, respectful and honest sentiment analysis assistant. And you are supposed to classify the sentiment of the user's message into one of the following categories: positive or negative."
# text = """<s>[INST] <<SYS>>
# {}
# <</SYS>>

# {} [/INST]""".format(system_prompt, user_msg_1)
text = "Classify the sentiment of the following sentence into one of the following categories: positive or negative. \nSentence: like a giant commercial for universal studios , where much of the action takes place \nSentiment:"

inputs = tokenizer(text, return_tensors="pt").to("cuda")
outputs = base_model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"], max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))



Classify the sentiment of the following sentence into one of the following categories: positive or negative. 
Sentence: like a giant commercial for universal studios , where much of the action takes place 
Sentiment: negative 
Suggested solution: positive 
Sentiment: positive 
Suggested solution: positive 
Sentiment: positive 
Suggested solution: positive 
Sentiment: positive 
Suggested solution: positive 
Sentiment: positive 
Suggested solution: positive 
Sentiment: positive 
Suggested solution: positive 
Sentiment: positive 
Suggested solution: positive 


## Evaluate

In [29]:
validation_dataset = load_dataset('glue', 'sst2', split='validation') 

Found cached dataset glue (/home/9130/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [57]:
import re
from tqdm import tqdm

system_prompt = "You are a helpful, respectful and honest sentiment analysis assistant. And you are supposed to classify the sentiment of the user's message into one of the following categories: positive or negative."
user_prompt = "Classify the sentiment of the following sentence into one of the following categories: positive or negative."


def evaluate(dataset):
    label_map = {
        0 : 'negative',
        1 : 'positive',
    }

    compared_result = []

    for i, val in enumerate(tqdm(dataset)):
        label_text = label_map[val['label']]
        sentence = val['sentence']

        # Make input
        text = "<s>[INST] " + system_prompt + "\n<<SYS>>\n\n" + "Sentence: " + sentence + "[/INST]\n" + "Sentiment: "
        inputs = tokenizer(text, return_tensors="pt").to("cuda")

        # Generate
        outputs = base_model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"], max_new_tokens=80, pad_token_id=tokenizer.eos_token_id)
        outputs_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        matches = re.findall(r"Sentiment: (.+)", outputs_text)
        selected_sentiment = matches[0].strip()
        selected_sentiment = selected_sentiment.split()[0]

        # Compare prediction and label
        if selected_sentiment == label_text:
            compared_result.append(1)
        else:
            compared_result.append(0)

    return compared_result


In [66]:
res_list = evaluate(validation_dataset.select(range(100)))

100%|██████████| 100/100 [26:01<00:00, 15.62s/it]


In [69]:
print("Accuracy:", res_list.count(1)/len(res_list))

Accuracy: 0.92


In [45]:
# Run text generation pipeline with our next model
system_prompt = "You are a helpful, respectful and honest sentiment analysis assistant. And you are supposed to classify the sentiment of the user's message into one of the following categories: positive or negative."
prompt = "<<SYS>> \n" + system_prompt + "\n<<SYS>>\n\nSentence: in exactly 89 minutes , most of which passed as slowly as if i 'd been sitting naked on an igloo , formula 51 sank from quirky to jerky to utter turkey ."
pipe = pipeline(task="text-generation", model=base_model, tokenizer=tokenizer, max_length=150)
result = pipe(f"<s>[INST] {prompt} [/INST]\nSentiment:")
print(result[0]['generated_text'])



<s>[INST] <<SYS>> 
You are a helpful, respectful and honest sentiment analysis assistant. And you are supposed to classify the sentiment of the user's message into one of the following categories: positive or negative.
<<SYS>>

Sentence: in exactly 89 minutes , most of which passed as slowly as if i 'd been sitting naked on an igloo , formula 51 sank from quirky to jerky to utter turkey . [/INST]
Sentiment: negative 10 2299 10229 10229 10229 10229 
