In [None]:
! pip install -q accelerate peft bitsandbytes transformers trl py7zr auto-gptq optimum

In [None]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, TrainingArguments
from trl import SFTTrainer
import os
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable from the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Directory that holds the train/dev/test CSV splits.
# Only one location is active; the previous code assigned `path` twice in a row,
# so the first assignment was dead. Other locations used before are kept as comments:
# path = "/content/drive/MyDrive/Inlab/Datasets/Prova_Ner/"
# path = "/content/drive/MyDrive/MAI/HLE/Data/"
# path = "/content/drive/MyDrive/MAI/Q3/HLE/Project/Data/Finetuning/Mistral/First Format/"
path = "../data/finetuning/Mistral-0.2/"

# Read the three splits into pandas DataFrames.
train_data = pd.read_csv(path + "train.csv")
dev_data = pd.read_csv(path + "dev.csv")
test_data = pd.read_csv(path + "test.csv")

In [None]:
# Name of the CSV column that is kept when building the datasets.
text_col = "text"

# Training split: keep only the text column, shuffle all rows with a fixed seed
# and rebuild a clean 0..n-1 index before converting.
train_dataset = Dataset.from_pandas(
    train_data[[text_col]]
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Dev and test splits keep their original row order.
dev_dataset, test_dataset = (
    Dataset.from_pandas(split[[text_col]]) for split in (dev_data, test_data)
)

In [None]:
# Display the first row of the test split's 'text' column as a sanity check.
test_data['text'].iloc[0]

In [None]:
# Checkpoint to fine-tune (an AWQ variant was tried before and is kept for reference).
# model_name = "TheBloke/Mistral-7B-Instruct-v0.1-AWQ"
model_name = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pad on the right-hand side of each sequence...
tokenizer.padding_side = "right"
# ...and reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Plain (non-quantization-aware) load, kept for reference:
# model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# 4-bit GPTQ configuration with the exllama kernels switched off
# (presumably because they cannot be used for training - TODO confirm).
gptq_config = GPTQConfig(bits=4, use_exllama=False)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=gptq_config,
    torch_dtype=torch.float32,
    device_map="auto",
)

# Dump the module tree to check layer names (e.g. for choosing LoRA target modules).
print(model)

In [None]:
# --- Prepare the quantized base model for training ---
model.config.use_cache = False      # turn off the generation cache while training
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# --- LoRA adapter configuration ---
# Adapters are attached to the query and value projections only. Other projection
# layers that were considered but are not used: up_proj, o_proj, k_proj, down_proj, gate_proj.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=64,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the adapters and report how many parameters will be trained.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
# Hyper-parameters for the supervised fine-tuning run.
training_arguments = TrainingArguments(
    output_dir="../models/mistral-2.0/",  # Important: make sure the output directory is set correctly
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Evaluate, checkpoint and log on the same 500-step cadence.
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=500,
    max_grad_norm=0.3,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    save_strategy="steps",
    save_steps=500,
    logging_strategy='steps',
    logging_steps=500,
    num_train_epochs=1,
    # max_steps=250,
    save_total_limit=2,  # keep only the two most recent checkpoints
    group_by_length=True,
    fp16=False,
    push_to_hub=False
)

# Supervised fine-tuning trainer over the train/dev datasets.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    # NOTE(review): `model` has already been wrapped with get_peft_model earlier in the
    # notebook, so this argument looks redundant - confirm against the installed trl version.
    peft_config=peft_config,
    # Use the same column name that was used to build the datasets instead of a
    # second hard-coded "text" literal, so the two cannot drift apart.
    dataset_text_field=text_col,
    args=training_arguments,
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=1000  # TODO: check whether this limit is adequate for the data
)

In [None]:
# Launch fine-tuning with the trainer configured above.
trainer.train()

In [None]:
# ! cp -r /content/mistral-finetuned-ner/checkpoint-200/ /content/drive/MyDrive/MAI/Q3/HLE/Project/Models/Mistral_7B_200/

In [None]:
import locale


def getpreferredencoding(do_setlocale: bool = True) -> str:
    """Report UTF-8 as the preferred encoding, whatever the real locale is.

    Args:
        do_setlocale: Accepted only so the signature matches
            ``locale.getpreferredencoding``; the value is ignored.

    Returns:
        The constant string ``"UTF-8"``.
    """
    return "UTF-8"


# Monkey-patch the stdlib function so every later caller sees UTF-8.
locale.getpreferredencoding = getpreferredencoding

# Inference

In [None]:
# Display the first row of the test split's 'sentence' column (used below to build the prompt).
test_data['sentence'].iloc[0]

In [None]:
# Task description placed at the start of the prompt.
# Written as adjacent string literals; the concatenated text is unchanged.
instruction = (
    "You are solving the NER problem in indian legal documents. "
    "You have to extract from the text, entities related to each of the following categories: "
    "CASE_NUMBER, COURT, DATE, GPE, JUDGE, LAWYER, ORG, OTHER_PERSON, PETITIONER, PRECEDENT, "
    "PROVISION, RESPONDENT, STATUTE, WITNESS. "
    "Extract them exactly as they are in the text (Don't format them). "
    "Be careful with synonyms, some categories can be found under other names."
)

# First sentence of the test split, used as the example to tag.
sentence = test_data['sentence'].iloc[0]

# Wrap instruction and sentence in the [INST] ... [/INST] prompt template.
text = f"<s>[INST] {instruction} Find the entities in the following text: {sentence} [/INST]\n"

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import GenerationConfig
from transformers import AutoTokenizer
import torch
# tokenizer = AutoTokenizer.from_pretrained("/content/mistral-finetuned-tickets/checkpoint-13")


# model = AutoPeftModelForCausalLM.from_pretrained(
#     "/content/mistral-finetuned-tickets/checkpoint-60",
#     low_cpu_mem_usage=True,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map="cuda")


In [None]:
# Decoding settings for inference.
# The previous configuration sampled with top_k=1, which always leaves exactly one
# candidate token - i.e. greedy decoding expressed through the sampling path.
# Greedy decoding is now requested directly, which makes the deterministic intent
# explicit and makes the temperature setting unnecessary.
generation_config = GenerationConfig(
    do_sample=False,
    max_new_tokens=500,  # upper bound on the length of the generated answer
    pad_token_id=tokenizer.eos_token_id
)

In [None]:
import time

# Put the model in evaluation mode before generating. After `trainer.train()` the model
# may still be in training mode, in which case the LoRA dropout configured for training
# would remain active and perturb the generated text.
model.eval()

# perf_counter is a monotonic clock intended for measuring elapsed time.
st_time = time.perf_counter()

# Move the encoded prompt to the device the model lives on instead of assuming "cuda".
inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, generation_config=generation_config)

# Print the decoded sequence followed by the elapsed time in seconds.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
print(time.perf_counter() - st_time)

In [None]:
# Authenticate this notebook session against the Hugging Face Hub.
from huggingface_hub import notebook_login

notebook_login() #### Important: provide a token with write permission
#create repo of the model

In [None]:
from huggingface_hub import Repository

# Create a local directory to save your model
!mkdir Mistral-7B-LegalNER

repo = Repository(local_dir="Mistral-7B-LegalNER", clone_from="Huss9/Mistral-7B-LegalNER")
repo.git_pull()
!cp -r /content/drive/MyDrive/MAI/Q3/HLE/Project/Models/Mistral_finetuned_ner/checkpoint-250/* ./Mistral-7B-LegalNER/
repo.git_add(auto_lfs_track=True)
repo.git_commit("After 250 steps")
repo.git_push()

In [None]:
# Stage every change in the local clone.
# `Repository.git_add` selects files through `pattern` (default "."); it has no `all`
# keyword, so the previous call `git_add(all=True)` raised a TypeError.
# Large files are tracked with git-lfs, matching the earlier staging call.
repo.git_add(pattern=".", auto_lfs_track=True)

In [None]:
# Write the final artefacts: the tokenizer and the model go to the same folder,
# and the trainer generates a model card in between.
final_model_dir = '../models/mistral-2.0/epoch/'

tokenizer.save_pretrained(final_model_dir)
trainer.create_model_card()
model.save_pretrained(final_model_dir)