In [None]:
!pip install -U bitsandbytes # Ensure bitsandbytes is up-to-date

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch # Import torch for device handling and dtypes

# Hugging Face Hub id of the base model onto which the LoRA adapter is loaded.
base_model_name = "unsloth/Meta-Llama-3.1-8B-Instruct"

# Load the base weights in bfloat16 and let `device_map="auto"` place them on
# the available devices (GPU/CPU).
# `dtype` replaces the `torch_dtype` keyword, which this environment reports
# as deprecated ("`torch_dtype` is deprecated! Use `dtype` instead!").
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    dtype=torch.bfloat16,
    device_map="auto",
)

# Load the tokenizer that belongs to the base model.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True # Needed for some tokenizers
)

# Only fall back to EOS as the padding token when the tokenizer does not ship
# its own pad token; overwriting an existing pad token would make the
# tokenizer disagree with the model's configured padding index.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Attach the fine-tuned PEFT (LoRA) adapter to the base model.
model = PeftModel.from_pretrained(base_model, "IbarraOrtizDev/agatec_cafe")

# Switch to evaluation mode for inference (disables dropout, e.g. the LoRA
# dropout layers). The trailing `;` keeps the notebook from printing the
# entire module tree as the cell output.
model.eval();

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (

In [None]:
# System message (in Spanish) describing the assistant's role. It is
# interpolated into the system turn of the prompt built by
# `generate_llama_clean`, which appends a literal "." right after it.
SYSTEM_PROMPT = """Eres un asistente experto en café colombiano entrenado con información de Cenicafé.
Tu objetivo es responder preguntas sobre cultivo, variedades, productividad y prácticas agronómicas del café en Colombia.
Proporciona respuestas precisas, informativas y basadas en conocimiento técnico."""

In [None]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-cwix2b1k/unsloth_aeabd63bbd094a7889ab48e533d89c99
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-cwix2b1k/unsloth_aeabd63bbd094a7889ab48e533d89c99
  Resolved https://github.com/unslothai/unsloth.git to commit d1e312dcdc57bf020aa0f6da810226efe79cd69a
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.11.6 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.11.6-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.gi

Collecting unsloth
  Downloading unsloth-2025.11.6-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.6/64.6 kB[0m [31m278.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo
  Downloading unsloth_zoo-2025.11.6-py3-none-any.whl.metadata (32 kB)
Downloading unsloth-2025.11.6-py3-none-any.whl (359 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m359.3/359.3 kB[0m [31m407.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading unsloth_zoo-2025.11.6-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.6/289.6 kB[0m [31m379.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unsloth_zoo, unsloth
  Attempting uninstall: unsloth_zoo
    Found existing installation: unsloth_zoo 2025.11.6
    Uninstalling unsloth_zoo-2025.11.6:
      Successfully uninstalled unsloth_zoo-2025.11.6
  Attempting uninstall: unsloth
    Found existing installation: unsloth 2025.11.6
    Un

In [None]:
from unsloth import FastLanguageModel

In [None]:
def generate_llama_clean(model, tokenizer, question, max_tokens=300,
                         temperature=0.7, top_p=0.9):
    """Generate the assistant's answer to ``question`` with a Llama 3 style prompt.

    The prompt is assembled by hand from the module-level ``SYSTEM_PROMPT`` and
    the user's question, the model samples a continuation, and only the newly
    generated tokens are decoded. The returned text therefore never contains
    any part of the prompt, so no text-based clean-up of role headers is
    needed (the previous line filter could silently drop legitimate answer
    lines such as a line consisting of the word "user" and the line after it).

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        question: The user's question, inserted verbatim into the user turn.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.

    Returns:
        str: The generated answer with special tokens removed and surrounding
        whitespace stripped.
    """
    # NOTE(review): the "." after the system prompt is kept exactly as in the
    # original prompt; presumably it matches the fine-tuning format - confirm.
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{SYSTEM_PROMPT}.<|eot_id|><|start_header_id|>user<|end_header_id|>

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

    FastLanguageModel.for_inference(model)

    # The prompt already starts with <|begin_of_text|>, so the tokenizer must
    # not prepend its own BOS token. Send the tensors to wherever the model
    # lives instead of assuming a CUDA device.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)

    # Stop on the tokenizer's EOS token and on the end-of-turn marker, so the
    # generation ends with the assistant turn even if the two differ.
    stop_ids = [tokenizer.eos_token_id]
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    if eot_id is not None and eot_id not in stop_ids:
        stop_ids.append(eot_id)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=stop_ids,
    )

    # `generate` returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][prompt_length:]

    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()


In [None]:
def generate_response(message, history, max_tokens=512, temperature=0.7, top_p=0.9):
    """Chat callback: answer ``message`` with the globally loaded model.

    Args:
        message: The latest user message.
        history: Previous turns of the conversation. Accepted so the function
            can be used as a chat callback, but not used: every question is
            answered independently.
        max_tokens: Maximum number of new tokens to generate. This is now
            forwarded to ``generate_llama_clean``; previously it was ignored
            and that function's own default was used instead.
        temperature: Accepted for interface compatibility; currently unused.
        top_p: Accepted for interface compatibility; currently unused.

    Returns:
        str: The assistant's answer.
    """
    return generate_llama_clean(model, tokenizer, message, max_tokens=max_tokens)

In [None]:
!pip install gradio



In [None]:
import gradio as gr

In [None]:
def create_interface():
    """Build and return the Gradio chat interface for the coffee assistant.

    The interface routes every message to ``generate_response`` and offers a
    handful of clickable example questions.

    Returns:
        gr.ChatInterface: The configured (not yet launched) interface.
    """
    # Markdown text passed as the interface description.
    about_text = """
        **Asistente experto en café colombiano** entrenado con información de Cenicafé.

        Pregúntame sobre:
        - Variedades de café resistentes a la roya
        - Prácticas agronómicas
        - Productividad y economía cafetera
        - Épocas de siembra por región
        - Manejo de enfermedades

        **Modelo:** unsloth/meta-llama-3.1-8b-instruct-unsloth-bnb-4bit (Experimento 6)
        """

    # Example questions; each one is wrapped in its own single-item list.
    sample_questions = [
        [text]
        for text in (
            "¿Cuáles son las variedades de café resistentes a la roya?",
            "¿Cuál es la productividad promedio de café en Colombia?",
            "¿Qué diferencias hay entre Cenicafé 1 y Castillo?",
            "¿Cuándo debo sembrar café en Antioquia?",
            "¿Cuáles son las 8 prácticas agronómicas fundamentales?",
            "¿Cuánto ahorra Colombia por usar variedades resistentes a roya?",
        )
    ]

    return gr.ChatInterface(
        fn=generate_response,
        title="☕ Chatbot de Café Colombiano",
        description=about_text,
        examples=sample_questions,
    )

In [None]:
demo = create_interface()

print("\nLanzando aplicación...")

# Launch settings, grouped so they are easy to tweak in one place.
launch_options = {
    "server_name": "0.0.0.0",  # listen on all network interfaces
    "server_port": 7860,       # Gradio's default port
    "share": False,            # set to True to get a temporary public link
    "show_error": True,
}
demo.launch(**launch_options)

  self.chatbot = Chatbot(



Lanzando aplicación...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

