### **🔧 Environment Setup**

*  Install required libraries once at runtime.
*  These are intentionally separated from model logic to keep dependency management clear and reproducible.





In [None]:
# Runtime dependencies. transformers and torch are imported directly by the
# cells below; accelerate is not imported anywhere in this notebook
# (presumably needed by transformers for device_map placement - confirm).
!pip install transformers torch accelerate
# huggingface_hub supplies login/whoami for the authentication cell.
!pip install -q huggingface_hub


### **🔐 Secure HuggingFace Authentication**
*   The token is resolved at runtime (environment variable, Colab Secrets, or a hidden prompt) instead of being hardcoded.
*   This prevents credential leakage when sharing notebooks and follows production security practices.



In [None]:
import os
from huggingface_hub import login, whoami

def _normalize_hf_token(raw):
    """Return a usable token string from *raw*, or None when there is none."""
    # Colab's userdata.get may hand back a dict wrapper; unwrap it first.
    if isinstance(raw, dict):
        raw = raw.get("value")
    if not isinstance(raw, str):
        return None
    # A blank or whitespace-only value is not a token; treat it as missing.
    return raw.strip() or None


def _discover_hf_token(allow_interactive):
    """Look for a token in HF_TOKEN, then Colab Secrets, then a hidden prompt."""
    # 1) Environment variable (preferred).
    token = _normalize_hf_token(os.getenv("HF_TOKEN"))
    if token:
        return token

    # 2) Google Colab Secrets (only importable inside Colab).
    try:
        from google.colab import userdata
    except ImportError:
        userdata = None

    if userdata is not None:
        try:
            token = _normalize_hf_token(userdata.get("HF_TOKEN"))
        except Exception:
            # Best effort: a missing secret or denied notebook access simply
            # means this source has no token to offer.
            token = None
        if token:
            print("üîê Using HuggingFace token from Colab Secrets")
            return token

    # 3) Interactive fallback with hidden input.
    if allow_interactive:
        try:
            from getpass import getpass
            token = _normalize_hf_token(
                getpass("Enter HuggingFace token (hidden input): ")
            )
        except Exception:
            # Best effort: front-ends without stdin support cannot prompt.
            token = None

    return token


def authenticate_huggingface(allow_interactive=True):
    """
    Authenticate the current session with the HuggingFace Hub.

    The token is looked up in this order: the HF_TOKEN environment
    variable, Colab Secrets, then (if allow_interactive) a hidden prompt.
    Blank or whitespace-only values are treated as "no token".

    If whoami() already succeeds for the current session, login() is
    skipped and the discovered token is not used. If no token is found,
    a warning with setup instructions is printed and nothing else happens.

    Args:
        allow_interactive: when True, prompt for the token as a last resort.

    Returns:
        None

    Raises:
        RuntimeError if login() fails with the discovered token; the
        original exception is attached as __cause__.
    """
    token = _discover_hf_token(allow_interactive)

    if not token:
        print(
            "‚ö†Ô∏è No HuggingFace token detected.\n"
            "If the model is gated, authentication is required.\n\n"
            "Fix:\n"
            "‚Ä¢ Set env var: HF_TOKEN=your_token\n"
            "‚Ä¢ OR add token in Colab Secrets\n"
        )
        return

    # Skip login if the session is already authenticated.
    try:
        user_info = whoami()
    except Exception:
        user_info = None

    if user_info is not None:
        print("‚úÖ Already authenticated with HuggingFace")
    else:
        try:
            login(token)
        except Exception as e:
            raise RuntimeError(
                "‚ùå HuggingFace authentication failed.\n"
                "Check that your token is valid and has model access.\n"
                f"Error: {str(e)}"
            ) from e

    # Confirm the user, reusing the earlier whoami() answer when available
    # so the Hub is not queried twice.
    try:
        if user_info is None:
            user_info = whoami()
        username = user_info["name"]
    except Exception:
        print("‚úÖ HuggingFace login successful")
    else:
        print(f"‚úÖ HuggingFace login successful ‚Üí {username}")

# Authenticate when this cell runs, before any model loading below.
# With the default allow_interactive=True this may prompt for a token.
authenticate_huggingface()


### **🖥️ Hardware Check (GPU + VRAM)**
This cell and the next one define helper functions used during model initialization.

- `resolve_runtime_config()` decides whether the model runs on GPU or CPU.
- `safe_model_load()` raises an error if model loading fails or takes too long.

These utilities keep the loading logic clean.

In [None]:
import torch

def resolve_runtime_config(min_vram_gb=10):
    """
    Pick the dtype and device placement for model loading.

    Rules:
    - No CUDA device -> float32 on CPU
    - CUDA device 0 with less than min_vram_gb of memory -> float32 on CPU
    - Otherwise -> float16 with device_map "auto"

    Returns:
        torch_dtype, device_map
    """
    cpu_fallback = (torch.float32, "cpu")

    # Without CUDA there is nothing to measure.
    if not torch.cuda.is_available():
        return cpu_fallback

    # Only device 0 is inspected; total_memory is bytes, converted with 1e9.
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    if vram_gb < min_vram_gb:
        return cpu_fallback

    return torch.float16, "auto"


## ⏱️ Safe Model Loader

Loads the model and enforces a time limit.
Raises an error if loading fails or takes longer than allowed, for example when resources are insufficient.


In [None]:
import time

def safe_model_load(load_fn, max_load_time=300):
    """
    Run a model-loading callable and enforce a time budget on it.

    The budget is checked after load_fn returns: a load that is still
    running is not interrupted, but a load that finished later than
    max_load_time is rejected.

    Args:
        load_fn: zero-argument callable that performs the load.
        max_load_time: maximum allowed duration in seconds.

    Returns:
        Whatever load_fn returns.

    Raises:
        RuntimeError if loading fails (the original exception is attached
        as __cause__) or takes too long.
    """
    # monotonic() is immune to system clock adjustments during a long load.
    started = time.monotonic()

    try:
        result = load_fn()
    except Exception as e:
        raise RuntimeError(f"Model failed to load: {str(e)}") from e

    # Checked outside the try so the timeout error is not re-wrapped as a
    # "Model failed to load" error.
    elapsed = time.monotonic() - started
    if elapsed > max_load_time:
        raise RuntimeError(
            "Model loading took too long. "
            "You may be running on CPU or low resources."
        )

    return result


### **Model Initialization**

In [None]:
from transformers import AutoTokenizer, pipeline

# Model identifier passed to both the tokenizer and the pipeline below.
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"

# Pick dtype and device placement from the available hardware
# (resolve_runtime_config is called with its default VRAM threshold).
torch_dtype, device_map = resolve_runtime_config()

# Loader callable that is handed to safe_model_load
def load_pipeline():
    """
    Build the tokenizer and the text-generation pipeline for MODEL_NAME.

    Reads the module-level torch_dtype and device_map at call time.

    Returns:
        tokenizer, pipeline
    """
    tok = AutoTokenizer.from_pretrained(MODEL_NAME)

    pipeline_options = {
        "model": MODEL_NAME,
        "tokenizer": tok,
        "torch_dtype": torch_dtype,
        "device_map": device_map,
    }
    generator = pipeline("text-generation", **pipeline_options)

    return tok, generator

# Load through safe_model_load, which raises RuntimeError if loading fails
# or exceeds its default max_load_time of 300 seconds.
tokenizer, llama_pipeline = safe_model_load(load_pipeline)


The pipeline is set up. Now we need to create a chatbot using Llama.

In [None]:
MAX_INPUT_TOKENS = 3000   # safe budget for prompt

def count_tokens(messages):
    """
    Render the conversation with the chat template and measure its length.

    Args:
        messages: list of {"role": ..., "content": ...} dicts.

    Returns:
        Number of prompt tokens, including the generation prompt.
    """
    encoded = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        # Ask for the dict form explicitly: the ["input_ids"] lookup below
        # needs it, and the default return type is not the same across
        # transformers versions.
        return_dict=True,
        return_tensors="pt"
    )
    # The last dimension of input_ids is the sequence length.
    return encoded["input_ids"].shape[-1]


In [None]:
def truncate_history(messages, max_tokens=None, token_counter=None):
    """
    Drop the oldest conversation turns until the prompt fits the token budget.

    The list is modified in place and also returned. A leading system
    message is always kept, and so is the most recent message, even if it
    alone exceeds the budget. After the oldest message is dropped, any
    assistant replies left at the front are dropped too, so the kept
    history starts with a user turn again. Chat templates that require
    strictly alternating user/assistant turns (Llama-2's does) reject a
    history that starts with an assistant reply.

    Args:
        messages: list of {"role": ..., "content": ...} dicts.
        max_tokens: token budget; defaults to MAX_INPUT_TOKENS.
        token_counter: callable(messages) -> int; defaults to count_tokens.

    Returns:
        The same list object, truncated.
    """
    # Defaults are resolved at call time so the module-level settings apply.
    if max_tokens is None:
        max_tokens = MAX_INPUT_TOKENS
    if token_counter is None:
        token_counter = count_tokens

    # Index of the first message that may be removed (skip the system prompt).
    first = 1 if messages and messages[0].get("role") == "system" else 0

    # Cheap length check first; token counting re-renders the whole prompt.
    while len(messages) - first > 1 and token_counter(messages) > max_tokens:
        del messages[first]
        # Remove orphaned assistant replies so the history starts with a user.
        while len(messages) - first > 1 and messages[first].get("role") != "user":
            del messages[first]

    return messages


In [None]:
# Conversation state shared across calls; starts with the system prompt.
messages = [{
    "role":"system",
    "content":"You are a helpful, smart and friendly AI assistant. Do not leave the sentence incomplete."
}]

def get_llama_response(user_input: str) -> str:
    """
    Send one user turn to the model and return the assistant's reply.

    The user message and the reply are both appended to the module-level
    messages history. If truncation, templating or generation fails, the
    user message is removed again so the history is left as it was before
    the call and later calls are not affected.

    Args:
        user_input: text of the user's message.

    Returns:
        The generated reply with surrounding whitespace stripped.
    """
    global messages

    # Add user message (kept in a variable so it can be rolled back).
    user_message = {"role":"user","content":user_input}
    messages.append(user_message)

    try:
        # Token-based truncation happens here
        messages = truncate_history(messages)

        # Convert chat into model prompt
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,  # return the prompt as text, not token ids
            add_generation_prompt=True  # mark that the assistant replies next
        )

        # Generate response
        output = llama_pipeline(
            prompt,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_k=5,
            top_p=0.9,
            eos_token_id=tokenizer.eos_token_id,
            # Return only the completion so the prompt does not have to be
            # sliced off by character count afterwards.
            return_full_text=False
        )
    except BaseException:
        # Roll back on any failure, including a notebook interrupt;
        # otherwise two user turns in a row would end up in the history.
        if messages and messages[-1] is user_message:
            messages.pop()
        raise

    reply = output[0]["generated_text"].strip()

    # Save assistant reply
    messages.append({"role":"assistant","content":reply})

    return reply


In [None]:
# CLI loop
def run_chatbot():
    """
    Run an interactive chat loop on stdin/stdout.

    Reads one line per turn, sends it to get_llama_response and prints the
    reply. The loop ends when the user types "exit" (any case, surrounding
    whitespace ignored), on end of input, or on Ctrl-C / notebook interrupt
    while waiting for input. Blank lines are skipped instead of being sent
    to the model.
    """
    print("Llama chatbot is ready")

    while True:
        try:
            user_input = input("user: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt ends the chat without a traceback.
            print("chat ended.")
            break

        if not user_input:
            continue

        if user_input.lower() == "exit":
            print("chat ended.")
            break

        reply = get_llama_response(user_input)

        print("Chatbot: ", reply)


if __name__ == "__main__":
    run_chatbot()