## Processar saídas das LLMs

### Configurações necessárias

In [None]:
%pip install transformers torch accelerate bitsandbytes
%pip install --upgrade transformers
%pip install dotenv

In [None]:
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig, pipeline
from huggingface_hub import login
from dotenv import load_dotenv
from pathlib import Path

# Load variables from a .env file into the process environment (per the
# original note, the file is expected at the project root), then read the
# token stored under API_KEY; the result is None when the variable is unset.
load_dotenv()
api_key = os.environ.get("API_KEY")


In [None]:
# QuixBugs inputs: programs and their test cases, one pair per language.
python_input_dir = Path("QuixBugs") / "python_programs"
python_test_dir = Path("QuixBugs") / "python_testcases"
java_input_dir = Path("QuixBugs") / "java_programs"
java_test_dir = Path("QuixBugs") / "java_testcases"

# The output root must exist before its children are created, because the
# per-model directories below are made without `parents=True`.
Path("llm_outputs").mkdir(exist_ok=True)

# One output directory per (language, model) combination.
python_output_llama_dir = Path("llm_outputs") / "python_programs_corrected_by_llama"
python_output_deepseek_dir = Path("llm_outputs") / "python_programs_corrected_by_deepseek"
java_output_llama_dir = Path("llm_outputs") / "java_programs_corrected_by_llama"
java_output_deepseek_dir = Path("llm_outputs") / "java_programs_corrected_by_deepseek"

for _output_dir in (
    python_output_llama_dir,
    python_output_deepseek_dir,
    java_output_llama_dir,
    java_output_deepseek_dir,
):
    _output_dir.mkdir(exist_ok=True)

### Configuração da LLM

In [None]:
# Authenticate against the Hugging Face Hub with the token read from the
# API_KEY environment variable.
login(token=api_key)
# Candidate checkpoints; the index on the next line decides which one this
# session loads (index 0, the Llama checkpoint, as written).
models_ids = ['meta-llama/Llama-3.2-3B-Instruct', 'deepseek-ai/deepseek-llm-7b-chat']
model_id = models_ids[0]

# Quantization settings: 4-bit loading, NF4 quant type, double quantization,
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# NOTE(review): `config` receives a linear RoPE-scaling override here, but it is
# not passed to `from_pretrained` below, so in this cell it does not influence
# the loaded model — confirm whether it should be passed or dropped.
config = AutoConfig.from_pretrained(model_id)
config.rope_scaling = { "type": "linear", "factor": 8.0 }  # Adjust the factor as needed

# Load the selected checkpoint with the quantization settings defined above.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map='auto')

In [None]:
# Text-generation pipeline built from the loaded model and tokenizer.
# Sampling is enabled with a low temperature, nucleus (top-p) filtering and a
# repetition penalty; each call may generate at most 512 new tokens.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    # Sampling behaviour
    do_sample=True,
    temperature=0.1,
    top_p=0.8,
    repetition_penalty=1.2,
    # Length budget
    max_new_tokens=512,
)

In [None]:
def get_response(message):
    """Send `message` through the module-level pipeline and return the reply text.

    The first pipeline result is expected to carry a ``generated_text`` list of
    message dicts; the ``content`` of its last entry is printed and returned.
    (Presumably `message` is a chat-style list of role/content dicts, since the
    result is indexed that way — confirm against the pipeline's output format.)
    """
    outputs = pipe(message)
    reply = outputs[0]["generated_text"][-1]["content"]
    print(reply)
    return reply

### Prompt

In [None]:
def correct_with_ai(code: str, language: str = "python", ai: str = "llama"):
    """Ask the model for a corrected version of `code` and return its reply.

    Args:
        code: Source text sent verbatim as the user message.
        language: Language name interpolated into the system instructions.
        ai: Accepted for interface compatibility; not used in the body.

    Returns:
        Whatever `get_response` returns for the two-message conversation.
    """
    system_instructions = (
        "You are a helpful AI programming assistant. When the user sends you a piece of code that contains a bug, "
        "your job is to return the corrected version of the code. "
        "Do not include explanations, comments, or any text outside the code block. "
        f"Only return the corrected code in a single code block, in {language}."
    )
    conversation = [
        {"role": "system", "content": system_instructions},
        {"role": "user", "content": f"{code}"},
    ]

    # Banner printed before the model's reply is echoed.
    print("Output Code ---------------------------------------------")
    return get_response(conversation)

### Função para processar Prompt

In [None]:
def process_files(input_dir: Path, output_dir: Path, language: str):
    """Correct every source file in `input_dir` and write the results to `output_dir`.

    Args:
        input_dir: Directory scanned (non-recursively) for source files.
        output_dir: Destination directory; created, including parents, if it
            does not exist. Each result keeps the input file's name.
        language: ``"java"`` selects ``*.java`` files; any other value selects
            ``*.py``. The value is also forwarded to `correct_with_ai`.
    """
    pattern = "*.java" if language == "java" else "*.py"

    # Create the destination up front so a missing directory is dealt with
    # before any model call, not after the first generation has been paid for.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Sorted so files are processed in the same order on every run.
    for file_path in sorted(input_dir.glob(pattern)):
        code = file_path.read_text(encoding="utf-8")
        corrected_code = correct_with_ai(code, language)

        output_file = output_dir / file_path.name
        output_file.write_text(corrected_code, encoding="utf-8")
        print(f"[{language.upper()}] Corrigido e salvo: {output_file}")

### Processamentos

In [None]:
# Run the correction over the Python inputs, writing into the Llama output directory.
process_files(python_input_dir, python_output_llama_dir, "python")

In [None]:
# Run the correction over the Java inputs, writing into the Llama output directory.
process_files(java_input_dir, java_output_llama_dir, "java")

In [None]:
# Run the correction over the Python inputs, writing into the DeepSeek output directory.
# NOTE(review): the model used is whichever one the configuration cell loaded; as
# written it selects models_ids[0] (the Llama checkpoint) — confirm the model was
# switched and reloaded before running this cell.
process_files(python_input_dir, python_output_deepseek_dir, "python")

In [None]:
# Run the correction over the Java inputs, writing into the DeepSeek output directory.
# NOTE(review): the model used is whichever one the configuration cell loaded; as
# written it selects models_ids[0] (the Llama checkpoint) — confirm the model was
# switched and reloaded before running this cell.
process_files(java_input_dir, java_output_deepseek_dir, "java")

### Limpar saídas

In [None]:
def clean_code(target_dir: Path, language: str):
    """Strip Markdown code-fence markers from generated files, in place.

    Every matching file in `target_dir` is rewritten with the language-tagged
    opening fence and all remaining triple-backtick markers removed, and with
    leading/trailing whitespace trimmed. Text outside the fences is kept.

    Args:
        target_dir: Directory whose files are cleaned (non-recursive).
        language: ``"python"`` (``*.py`` files) or ``"java"`` (``*.java`` files).

    Raises:
        ValueError: If `language` is not one of the supported values.
    """
    # language -> (glob pattern, language-tagged opening fence)
    supported = {
        "python": ("*.py", "```python"),
        "java": ("*.java", "```java"),
    }
    try:
        pattern, opening_fence = supported[language]
    except KeyError:
        # Fail before touching any file instead of crashing mid-loop on an
        # unassigned variable.
        raise ValueError(
            f"Unsupported language {language!r}; expected one of {sorted(supported)}"
        ) from None

    # Materialize the listing before rewriting files in the same directory.
    for file in sorted(target_dir.glob(pattern)):
        content = file.read_text(encoding="utf-8")
        # Remove the tagged fence first so the bare-fence pass does not leave
        # the language tag behind.
        cleaned = content.replace(opening_fence, "").replace("```", "").strip()
        file.write_text(cleaned, encoding="utf-8")
        print(f"Limpo: {file.name}")

In [None]:
# Clean the generated files of each output directory, passing the language
# that directory holds.
for _target_dir, _language in (
    (python_output_llama_dir, "python"),
    (python_output_deepseek_dir, "python"),
    (java_output_llama_dir, "java"),
    (java_output_deepseek_dir, "java"),
):
    clean_code(_target_dir, _language)