In [None]:
!pip3 install -q -U bitsandbytes
!pip3 install -q -U transformers
!pip3 install -q -U peft
!pip3 install -q -U accelerate
!pip3 install -q -U datasets
!pip3 install -q -U einops
!pip3 install gdown
!pip3 install -U flash-attn --no-build-isolation

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

import json
import pandas as pd
import gdown
import numpy as np

from transformers import BitsAndBytesConfig, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset
from peft import LoraConfig, PeftModel, get_peft_model
import torch

In [None]:
# Select the compute device in order of preference: CUDA GPU, then Apple MPS, then CPU.
# The hasattr() guard keeps this working on torch builds that have no `mps` backend.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
    else 'cpu'
)

In [None]:
# Display the selected compute device as the cell output.
device

In [None]:
from huggingface_hub import notebook_login

# Authenticate against the Hugging Face Hub from inside the notebook.
# The training parameters set push_to_hub = True, so checkpoints are uploaded to the Hub.
notebook_login()

In [None]:
# Download source (Google Drive) and local paths for the dataset
url_large = "https://drive.google.com/uc?export=download&id=1xw0a8qiAz5Exqy-Udtv8FPFLO54iysvX"
# NOTE(review): absolute path at the filesystem root — confirm it is writable in the target environment.
dataset_path = "/law_dataset_large.json"
# NOTE(review): not referenced by any other visible cell (TrainingArguments uses a literal output_dir).
output_dir = "output_dir"

In [None]:
# Fetch the dataset from url_large and store it at dataset_path (quiet = False keeps gdown's output visible).
gdown.download(url_large,dataset_path, quiet = False)

In [None]:
# Training parameters (passed to TrainingArguments in a later cell)
evaluation_strategy="epoch"  # evaluate once per epoch
save_strategy='epoch'  # checkpoint once per epoch
logging_strategy = 'epoch'  # log once per epoch
seed = 42
per_device_train_batch_size=3
gradient_accumulation_steps=2  # together with the batch size above: 3 x 2 samples per optimizer step on each device
per_device_eval_batch_size=2
learning_rate= 2e-5
num_train_epochs=21
save_total_limit=3  # upper bound on the number of checkpoints kept
warmup_steps=10
optim = 'adamw_torch'
lr_scheduler_type="cosine"
push_to_hub = True  # requires the Hub login performed earlier in the notebook

# LoRA hyperparameters (passed to LoraConfig in a later cell)
r=64
lora_alpha=64
lora_dropout=0.02

# Seed the torch and numpy global random number generators for reproducibility
torch.manual_seed(seed)
np.random.seed(seed)

#### Example before fine-tuning

In [None]:
# Load the original Phi-2 checkpoint in half precision to obtain a baseline answer
# before any fine-tuning; device_map={"": 0} places the whole model on device 0.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/phi-2",
    trust_remote_code=True,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Baseline check: ask the un-tuned model a question about Brazilian law.

# Define the question in Portuguese
question = "Quais são os direitos fundamentais garantidos pela Constituição Federal do Brasil?"

# Create the prompt
prompt = f"""
Este é um banco de dados de texto contendo descrições de direitos e conceitos legais no Brasil.
Cada texto descreve um único conceito.

Sua tarefa é entender esses textos e responder a perguntas relacionadas aos direitos no Brasil.

Pergunta: {question}

Resposta:
"""

# Tokenize the prompt and move the tensors to the selected device
inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False).to(device)

# Generate text. max_new_tokens bounds only the generated continuation, so the
# answer budget no longer shrinks with the prompt length (max_length counts the
# prompt tokens too) and matches the budget used by run_questions after training.
outputs = model.generate(**inputs, max_new_tokens=200)
text = tokenizer.batch_decode(outputs)[0]

# Print the generated response (the decoded sequence includes the prompt)
print(text)

# Train the Model

#### Getting Dataset

In [None]:
def json_to_prompts(data, question = "Quais são as leis, artigos, e instrumentos jurídicos mencionados, utilizados e aplicados nessa decisão judicial?"):
  """
  Builds a DataFrame pairing each source text with the prompt generated for it.

  Args:
      data: An iterable of mappings, each exposing a "text" key (e.g. the list
          obtained from parsing the dataset JSON file).
      question: The question appended to every prompt. Defaults to a question
          asking which laws, articles and legal instruments are mentioned,
          used and applied in the court decision.

  Returns:
      A pandas DataFrame with two columns: "text" (the original text) and
      "prompt" (the text wrapped in the instruction template).
  """
  # Constant parts of the template surrounding each text.
  prefix = "Este é um texto sobre uma decisão judicial realizada no Supremo Tribunal Federal brasileiro: "
  suffix = f"\n\nPergunta: {question}\n\nResposta:"

  texts = []
  prompts = []
  for item in data:
    text = item['text']
    texts.append(text)
    prompts.append(f"{prefix}{text}{suffix}")

  return pd.DataFrame({"text": texts, "prompt": prompts})

In [None]:
# Opening JSON file
# Load the downloaded dataset. The context manager closes the file handle as
# soon as parsing is done, and the explicit UTF-8 encoding makes the Portuguese
# text decode identically regardless of the platform's default encoding.
with open(dataset_path, encoding="utf-8") as f:
    data_json = json.load(f)

# Build the (text, prompt) DataFrame used for fine-tuning
dataframe = json_to_prompts(data_json)

#### Importing model and setting up LoRA

In [None]:
# bitsandbytes quantization settings used when loading the model for training:
# 4-bit NF4 weights, float16 compute dtype, and double quantization enabled.
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
)

In [None]:
# LoRA adapter configuration for causal language modelling.
peft_config = LoraConfig(
    r=r,  # dimension of the updated matrices
    lora_alpha=lora_alpha,  # parameter for scaling
    target_modules=[
        # Attention query/key/value projections
        "q_proj",
        "k_proj",
        "v_proj",
        # Phi-2 names for the attention output projection and the two MLP
        # layers. Without these, only q/k/v receive adapters on Phi-2.
        # Verify against the module tree printed by the `phi_model` cell.
        "dense",
        "fc1",
        "fc2",
        # LLaMA/Gemma-style names, kept from the original list. PEFT skips
        # names that match no module, so they are harmless on Phi-2.
        "o_proj",
        "up_proj",
        "down_proj",
        "gate_proj",
    ],
    lora_dropout=lora_dropout,  # dropout probability for layers
    bias="none",  # bias terms are not trained
    task_type="CAUSAL_LM",
)


In [None]:
def tokenization(sample, tokenizer, max_length=512, padding="max_length", truncation=True):
  """
  Tokenizes the "prompt" entry of a sample with the given tokenizer.

  Args:
      sample: A mapping with a "prompt" key holding the text (or batch of texts) to tokenize.
      tokenizer: A callable tokenizer, e.g. a Hugging Face tokenizer instance.
      max_length: Maximum sequence length forwarded to the tokenizer (default: 512).
      padding: Padding strategy forwarded to the tokenizer (default: "max_length").
      truncation: Truncation setting forwarded to the tokenizer (default: True).

  Returns:
      The tokenizer's output. PyTorch tensors are requested and the attention
      mask is explicitly not requested.
  """
  # Collect the tokenizer options in one place before making the call.
  tokenizer_options = {
      "return_tensors": "pt",
      "return_attention_mask": False,
      "padding": padding,
      "truncation": truncation,
      "max_length": max_length,
  }

  return tokenizer(sample['prompt'], **tokenizer_options)


### Training Phi

In [None]:
# Load Phi-2 for training, quantized according to bnb_config, with automatic device placement.
phi_model = AutoModelForCausalLM.from_pretrained(
          "microsoft/phi-2", trust_remote_code=True, quantization_config=bnb_config,device_map="auto"
)

# Separate tokenizer instance for the training model; EOS is reused as the padding token.
phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
phi_tokenizer.pad_token = phi_tokenizer.eos_token

In [None]:
# Display the loaded model as the cell output (useful for checking the module names targeted by LoRA).
phi_model

In [None]:
# Wrap the quantized base model with the LoRA adapters described by peft_config.
# Note: this rebinds phi_model, so re-running the cell wraps an already-wrapped model.
phi_model = get_peft_model(phi_model, peft_config)

In [None]:
# Report how many parameters are trainable after applying the adapters.
phi_model.print_trainable_parameters()

In [None]:
# Training split: the first 1% of the rows of `dataframe`.
data_training = Dataset.from_pandas(dataframe.iloc[:int(0.01*len(dataframe))])
# Tokenize in batches and drop the raw columns so only the tokenizer output remains.
tokenized_training_data = data_training.map(
    tokenization,
    batched=True,
    desc='Tokenization',
    remove_columns=data_training.column_names,
    fn_kwargs={"tokenizer": phi_tokenizer}  # Pass the tokenizer as an additional argument
)

In [None]:
# Validation split: the last 0.1% of the rows of `dataframe`.
data_valid = Dataset.from_pandas(dataframe.iloc[int(0.999*len(dataframe)):])
# Tokenize in batches and drop the raw columns so only the tokenizer output remains.
tokenized_valid_data = data_valid.map(
    tokenization,
    batched=True,
    desc='Tokenization',
    # Use this dataset's own column names. The original referenced
    # data_training.column_names, which only worked because both splits
    # happen to share the same columns.
    remove_columns=data_valid.column_names,
    fn_kwargs={"tokenizer": phi_tokenizer}  # Pass the tokenizer as an additional argument
)

In [None]:
# Trainer configuration assembled from the hyperparameters defined in the training-parameters cell.
# NOTE(review): output_dir and hub_model_id are named "gemma_2B" although the model
# being trained is microsoft/phi-2, and the `output_dir` variable defined next to
# dataset_path is not used here — confirm the intended names.
training_arguments = TrainingArguments(
        output_dir = 'gemma_2B',
        evaluation_strategy=evaluation_strategy,
        save_strategy=save_strategy,
        logging_strategy = logging_strategy,
        do_eval=True,
        seed = seed,
        metric_for_best_model = 'eval_loss',  # checkpoints are compared on validation loss
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        per_device_eval_batch_size=per_device_eval_batch_size,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        save_total_limit=save_total_limit,
        load_best_model_at_end = True,  # reload the best checkpoint when training finishes
        warmup_steps=warmup_steps,
        optim = optim,
        lr_scheduler_type=lr_scheduler_type,
        remove_unused_columns=True,
        push_to_hub = push_to_hub,
        hub_model_id = "LucasVitoriano/gemma_2B",
        hub_strategy = 'every_save'  # push to the Hub on every checkpoint save
)

In [None]:
# Build the Trainer for the LoRA-wrapped model and run fine-tuning.
trainer = Trainer(
    model=phi_model,
    train_dataset=tokenized_training_data,
    eval_dataset=tokenized_valid_data,
    args=training_arguments,
    # mlm=False selects causal (next-token) language modelling rather than masked LM.
    data_collator=DataCollatorForLanguageModeling(phi_tokenizer, mlm=False),
)
trainer.train()

In [None]:
# Collect the trainer's log history into a DataFrame.
logs_phi = pd.DataFrame(trainer.state.log_history)

In [None]:
# Persist the logs; the file name encodes the learning rate, LoRA rank and LoRA dropout of this run.
logs_phi.to_pickle(f"logs_phi{learning_rate}_{r}_{lora_dropout}.pkl")

In [None]:
# Generate a model answer for every question in the evaluation set
def run_questions(model, tokenizer, questions_df):
    """
    Generates one answer per question using the given model and tokenizer.

    Args:
        model: Object exposing `generate(**inputs, max_new_tokens=...)`.
        tokenizer: Callable tokenizer that also exposes `batch_decode`.
        questions_df: Table whose 'Perguntas' column holds the questions.

    Returns:
        A list with the first decoded sequence produced for each question,
        in the order the questions appear.

    Relies on the notebook-level `device` variable for tensor placement and
    prints the zero-based index of each question as a progress indicator.
    """
    answers = []
    for idx, question in enumerate(questions_df['Perguntas']):
        prompt = f"""

        Você é um especialista em direito brasileiro. Responda à seguinte pergunta em português usando o seu conhecimento sobre a lei brasileira, a constituição e seus artigos,
        o Supremo Tribunal Federal, etc.

        Pergunta: {question}

        Resposta:
        """
        # Encode the prompt and move the tensors to the notebook's device.
        encoded = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
        encoded = encoded.to(device)

        # Generate up to 200 new tokens and keep the first decoded sequence.
        generated = model.generate(**encoded, max_new_tokens=200)
        answers.append(tokenizer.batch_decode(generated)[0])

        # Progress indicator
        print(idx)
    return answers

In [None]:
# Location of the evaluation questions CSV.
# NOTE(review): this is a Google Drive path, but no visible cell mounts Drive — confirm it is mounted before running.
csv_path = '/content/drive/MyDrive/questions_brazilian_law_ptbr.csv'

In [None]:
# Read the semicolon-separated questions file and display it.
# run_questions reads the questions from its 'Perguntas' column.
questions_df = pd.read_csv(csv_path, sep=";")
questions_df

In [None]:
# Generate answers to every evaluation question with the fine-tuned model.
respostas_phi_model = run_questions(phi_model, phi_tokenizer, questions_df)

In [None]:
# Put the generated answers into a single-column DataFrame and display it.
# NOTE(review): the column is named 'Response_Gemma_FT' although the answers come from the Phi-2 model — confirm the intended name.
df_respostas = pd.DataFrame(respostas_phi_model,columns=['Response_Gemma_FT'])
df_respostas

In [None]:
# Persist the answers to Google Drive as a pickle file.
df_respostas.to_pickle('/content/drive/MyDrive/Response_Gemma_FT.pkl')