In [None]:
# Log in to the Hugging Face Hub through its command-line tool (IPython shell escape).
# NOTE(review): presumably needed because the dataset or model loaded in later cells
# is private/gated — confirm; if everything is public this cell can be skipped.
!huggingface-cli login

In [None]:
from datasets import load_dataset

# Hub repository id of the dataset used for fine-tuning.
dataset = "David99YY/experiment_CoT_v1"

# Fetch the dataset; `data` is indexed by split name in the cells below.
data = load_dataset(path=dataset)

In [None]:
import transformers
!pip install -Uqq  git+https://github.com/huggingface/peft.git
!pip install -Uqq transformers datasets accelerate bitsandbytes
#tokenize
from transformers import AutoTokenizer
model = '3B'
model_name = ('togethercomputer/RedPajama-INCITE-Base-3B-v1',
              'togethercomputer/RedPajama-INCITE-Base-3B-v1')
tokenizer = AutoTokenizer.from_pretrained(model_name[1], add_eos_token=True)
tokenizer.pad_token_id = 0

In [None]:
def tokenize(prompt, tokenizer, max_length=256):
    """Encode a prompt into fixed-length model inputs.

    Args:
        prompt: Text to encode.
        tokenizer: Callable tokenizer; it is invoked with ``truncation``,
            ``max_length`` and ``padding`` keyword arguments and must return a
            mapping that contains ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every sequence is truncated/padded to. Defaults to
            256, the value that used to be hard-coded.

    Returns:
        A dict with only the ``"input_ids"`` and ``"attention_mask"`` entries
        of the tokenizer output; any other returned fields are dropped.
    """
    encoded = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding="max_length",  # pad every example up to max_length
    )
    return {key: encoded[key] for key in ("input_ids", "attention_mask")}

In [None]:
# Hold out 500 examples of the "train" split for validation; the fixed seed
# keeps the shuffled split identical across runs.
train_val = data["train"].train_test_split(test_size=500, shuffle=True, seed=42)
train_data, val_data = train_val["train"], train_val["test"]

In [None]:
def convert(data_point):
    """Render one dataset record as a single training prompt string.

    Args:
        data_point: Mapping with ``"instruction"``, ``"input"`` and
            ``"output"`` entries.

    Returns:
        The fixed preamble followed by the ``[Instruction]:``, ``[Input]:`` and
        ``[Response]:`` sections. When ``"input"`` is empty or ``None`` the
        ``[Input]:`` section is left out; previously such records produced
        ``None``, which cannot be tokenized.
    """
    preamble = (
        "This is an instruction that explain why this text suggests a matched sentiment, "
        "please look at input along with instruction to infer the potential sentiment of market."
    )
    sections = [f"{preamble}[Instruction]:\n{data_point['instruction']}"]
    # Only emit the input section when there is something to show.
    if data_point["input"]:
        sections.append(f"[Input]:{data_point['input']}")
    sections.append(f"[Response]:{data_point['output']}")
    return "".join(sections)

In [None]:
# Build the prompt for every record and tokenize it. The shuffle is seeded
# (same seed as the train/validation split) so the resulting example order is
# reproducible after a kernel restart.
train_data = train_data.shuffle(seed=42).map(lambda x: tokenize(convert(x), tokenizer))
val_data = val_data.shuffle(seed=42).map(lambda x: tokenize(convert(x), tokenizer))

Map:   0%|          | 0/1050 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Load the base checkpoint with 8-bit weights. The quantization settings are
# passed through `quantization_config`; the bare `load_in_8bit=True` keyword on
# `from_pretrained` is deprecated in current transformers releases.
model = AutoModelForCausalLM.from_pretrained(
    model_name[0],
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",  # let the library decide device placement
)

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# `prepare_model_for_int8_training` was removed from recent PEFT releases in
# favour of `prepare_model_for_kbit_training`. Since PEFT is installed from the
# main branch, prefer the new helper and fall back to the old name only on
# older installs that do not ship it yet.
try:
    from peft import prepare_model_for_kbit_training
except ImportError:
    from peft import prepare_model_for_int8_training as prepare_model_for_kbit_training

# LoRA adapter settings: rank-8 updates with scaling alpha 16 and 5% dropout,
# attached to the modules named "query_key_value"; bias terms are not adapted.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Prepare the quantized model for training, then wrap it with the adapters.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [None]:
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# Pick whichever keyword the installed TrainingArguments accepts so the cell
# works both before and after the rename.
_training_arg_names = inspect.signature(transformers.TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Evaluate, log and checkpoint every 30 steps; keep at most 3 checkpoints and
# reload the best one when training finishes.
training_args = transformers.TrainingArguments(
    num_train_epochs=1,
    learning_rate=5e-4,
    logging_steps=30,
    save_strategy="steps",
    eval_steps=30,
    save_steps=30,
    output_dir='./results',
    save_total_limit=3,
    load_best_model_at_end=True,
    push_to_hub=False,
    auto_find_batch_size=True,
    **{_eval_strategy_key: "steps"},
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_args,
    # mlm=False selects causal (next-token) language modeling.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# NOTE(review): presumably disabled because the generation cache is not useful
# during training and conflicts with gradient checkpointing — confirm.
model.config.use_cache = False

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Run fine-tuning. The result is left as the cell's last expression so the
# training summary is still displayed as the cell output.
train_output = trainer.train()
train_output

Step,Training Loss,Validation Loss
30,1.2547,1.347041
60,1.2201,1.325393
90,1.3328,1.291229
120,1.2974,1.276992


Checkpoint destination directory ./results/checkpoint-30 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=132, training_loss=1.2732970281080767, metrics={'train_runtime': 1004.5459, 'train_samples_per_second': 1.045, 'train_steps_per_second': 0.131, 'total_flos': 5021698133852160.0, 'train_loss': 1.2732970281080767, 'epoch': 1.0})

In [None]:
# Write the trained model (as held by the Trainer) and the tokenizer files into
# the same directory so they can be loaded together later.
# NOTE(review): for a PEFT-wrapped model this presumably stores only the
# adapter weights, not the base checkpoint — confirm.
save_dir = 'Redpajama_sentiment'
trainer.model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('Redpajama_sentiment/tokenizer_config.json',
 'Redpajama_sentiment/special_tokens_map.json',
 'Redpajama_sentiment/tokenizer.json')