In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

from src.analytics_module.config import Config

In [2]:
# Project settings object. Later cells read `prod_model`, `data_file_path`,
# `user_prompt_path`, `system_prompt_path` and `random_seed` from it.
config = Config()

# Init tokenizer and model

In [3]:
# NOTE(review): trust_remote_code=True runs Python code shipped with the checkpoint;
# keep it only if `config.prod_model` points at a trusted source.
tokenizer = AutoTokenizer.from_pretrained(config.prod_model, trust_remote_code=True)

# The language-modeling data collator pads every batch and fails when the tokenizer
# has no pad token. Fall back to EOS only when none is defined, so a tokenizer that
# already ships a pad token is left untouched.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Quantized loading via BitsAndBytesConfig (previously sketched here as commented-out
# code) is not enabled; the weights are loaded unquantized in bfloat16 and placed on
# the available devices automatically.
model = AutoModelForCausalLM.from_pretrained(
    config.prod_model,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Prepare data

In [5]:
# Read the labelled HTTP-request table (columns `Class` and `HTTP`) from the
# parquet file named in the config.
df = pd.read_parquet(path=config.data_file_path)
# Bare trailing expression: render the frame as this cell's output.
df

Unnamed: 0,Class,HTTP
0,False,Method: GET\nURI: /tienda1/index.jsp\nHost-Hea...
1,False,Method: GET\nURI: /tienda1/publico/anadir.jsp\...
2,False,Method: POST\nURI: /tienda1/publico/anadir.jsp...
3,False,Method: GET\nURI: /tienda1/publico/autenticar....
4,False,Method: POST\nURI: /tienda1/publico/autenticar...
...,...,...
84952,True,Method: GET\nURI: /rE8DdhJ/etdhkynu/rrnmmnigmi...
84953,True,Method: GET\nURI: /gaPdpj5.asp\nHost-Header: H...
84954,True,Method: GET\nURI: /e-QJHS/2QbmonuFzt7mzoe1oi/6...
84955,True,Method: GET\nURI: /unwosgmihia/Tlwjatcoc4oleoi...


In [7]:
# Load the user and system prompt texts that are later embedded in every training example.
# Decode explicitly as UTF-8 so the result does not depend on the platform's default
# locale encoding (assumes the prompt files are UTF-8 — TODO confirm).
with open(config.user_prompt_path, encoding="utf-8") as user_prompt_file:
    user_prompt = user_prompt_file.read()

with open(config.system_prompt_path, encoding="utf-8") as system_prompt_file:
    system_prompt = system_prompt_file.read()

In [None]:
# Split the full frame 80/20 with a fixed seed so the partition is reproducible.
# Both halves keep every column of `df`, i.e. the `HTTP` text and its `Class` label.
X_train, X_test = train_test_split(
    df,
    train_size=0.8,
    random_state=config.random_seed,
)

In [None]:
def tokenize_function(data, max_length=1024):
    """Turn one labelled HTTP record into token ids for causal-LM fine-tuning.

    The record is rendered as a three-turn chat (system prompt, a user turn holding
    the quoted request followed by the user prompt, and an assistant turn holding the
    label), serialised with the tokenizer's chat template and then tokenized.

    Args:
        data: Mapping-like row (dict or pandas Series) with "HTTP" and "Class" entries.
        max_length: Maximum number of tokens kept per example.

    Returns:
        The tokenizer output for a single example, holding plain Python lists with
        no batch dimension so that the data collator can pad and batch it.
    """
    msg = [
        {
            "role": "system",
            "content": system_prompt,
        },
        {
            "role": "user",
            # Single quotes inside the f-string keep this valid on Python < 3.12.
            "content": f"'''{data['HTTP']}'''\n{user_prompt}",
        },
        {
            # The label is the turn the model must learn to produce, so it is an
            # assistant message rather than a second consecutive user message.
            "role": "assistant",
            "content": str(data["Class"]),
        },
    ]

    # Serialise the conversation with the checkpoint's own chat template.
    # NOTE(review): assumes the template accepts a "system" role — confirm for this model.
    text = tokenizer.apply_chat_template(msg, tokenize=False)

    # Truncate from the left so that, for over-long requests, the end of the
    # sequence (which carries the label) is what survives.
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        # The rendered template already contains the special tokens it needs;
        # adding them again here would duplicate e.g. the BOS token.
        add_special_tokens=False,
    )

    return tokenized_inputs


class HttpRequestDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item, on demand."""

    def __init__(self, frame):
        # Keep a reference to the frame; rows are addressed by position.
        self.frame = frame

    def __len__(self):
        return len(self.frame)

    def __getitem__(self, index):
        return tokenize_function(self.frame.iloc[index])


# Datasets consumed by the Trainer cell further down.
train_dataset = HttpRequestDataset(X_train)
test_dataset = HttpRequestDataset(X_test)

# LoRA

In [None]:
# Recompute activations during the backward pass instead of storing them (saves memory).
model.gradient_checkpointing_enable()
# NOTE(review): the model is loaded unquantized in this notebook (the BitsAndBytesConfig
# is commented out) — confirm this k-bit preparation step is still wanted.
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings. The task must be passed as `task_type`; `peft_type` names the
# adapter method and is not the place for a TaskType value.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
peft_model = get_peft_model(model, peft_config)

# print_trainable_parameters() prints its own summary and returns None, so it is
# called directly rather than wrapped in print().
peft_model.print_trainable_parameters()

In [8]:
# With two or more visible CUDA devices, mark the model as parallelizable / model-parallel.
# NOTE(review): presumably so the Trainer does not wrap the already-sharded model in
# DataParallel — confirm.
if torch.cuda.device_count() >= 2:
    model.is_parallelizable, model.model_parallel = True, True

In [None]:
# Causal-LM collator (mlm=False): pads each batch and derives the labels from input_ids.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./training",
    # Keep every field produced by the dataset; nothing is dropped before collation.
    remove_unused_columns=False,
    per_device_train_batch_size=2,
    gradient_checkpointing=True,
    gradient_accumulation_steps=4,  # effective batch of 2 * 4 = 8 sequences per device
    max_steps=200,
    learning_rate=2.5e-5,
    logging_steps=5,
    # NOTE(review): the base model is loaded with torch_dtype=torch.bfloat16 —
    # confirm fp16 autocast is intended here rather than bf16=True.
    fp16=True,
    optim="paged_adamw_8bit",
    save_strategy="steps",
    save_steps=50,
    evaluation_strategy="steps",
    # NOTE(review): evaluating every 5 steps on the whole eval set may dominate
    # the run time — confirm this cadence is intended.
    eval_steps=5,
    do_eval=True,
    # Only the keys that actually hold labels belong here; input_ids and
    # attention_mask are model inputs, not labels.
    label_names=["labels"],
    report_to="none",
)

# `train_dataset` and `test_dataset` must already exist in the kernel;
# they are not created in this cell.
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop with the Trainer built in the previous cell
# (max_steps=200, checkpoints every 50 steps under ./training).
trainer.train()