### **🔧 Environment Setup**

*  Install required libraries once at runtime.
*  These are intentionally separated from model logic
   to keep dependency management clear and reproducible.





In [None]:
!pip install transformers torch accelerate
!pip install -q huggingface_hub




### **🔐 Secure HuggingFace Authentication**
*   Token is requested at runtime instead of hardcoding it.
*   This prevents credential leakage when sharing notebooks
    and follows production security practices.



In [2]:
import os
from huggingface_hub import login, whoami

def authenticate_huggingface(allow_interactive=True):
    """
    Authenticate the current session with the HuggingFace Hub.

    The token is looked up in this order:
      1. the ``HF_TOKEN`` environment variable,
      2. Google Colab Secrets (entry named ``HF_TOKEN``),
      3. a hidden interactive prompt (only if ``allow_interactive`` is true).

    If no token is found, a warning is printed and the function returns
    without attempting to log in.

    Args:
        allow_interactive: When True, fall back to a ``getpass`` prompt if no
            token was found in the environment or in Colab Secrets.

    Raises:
        RuntimeError: If logging in with the resolved token fails. The
            underlying exception is attached as ``__cause__``.
    """

    # ------------------------------------------------------
    # 1️⃣ ENV VARIABLE (best practice)
    # ------------------------------------------------------
    token = os.getenv("HF_TOKEN")

    # ------------------------------------------------------
    # 2️⃣ GOOGLE COLAB SECRETS
    # ------------------------------------------------------
    if not token:
        try:
            from google.colab import userdata
            token = userdata.get("HF_TOKEN")

            # ⭐ IMPORTANT FIX: handle dict return
            if isinstance(token, dict):
                token = token.get("value")

            if token:
                print("🔐 Using HuggingFace token from Colab Secrets")

        except Exception:
            # Deliberate best effort: outside Colab the import fails, and a
            # missing or inaccessible secret simply means "no token here".
            token = None

    # ------------------------------------------------------
    # 3️⃣ INTERACTIVE FALLBACK
    # ------------------------------------------------------
    if not token and allow_interactive:
        try:
            from getpass import getpass
            token = getpass("Enter HuggingFace token (hidden input): ")
        except Exception:
            # No usable stdin: treat it as "no token provided".
            token = None

    # Pasted tokens often carry stray spaces or a trailing newline.
    if isinstance(token, str):
        token = token.strip()

    # ------------------------------------------------------
    # 4️⃣ LOGIN FLOW
    # ------------------------------------------------------
    if not token:
        print(
            "⚠️ No HuggingFace token detected.\n"
            "If the model is gated, authentication is required.\n\n"
            "Fix:\n"
            "• Set env var: HF_TOKEN=your_token\n"
            "• OR add token in Colab Secrets\n"
        )
        return

    try:
        # ⭐ Skip login if session already authenticated
        user_info = None
        try:
            user_info = whoami()
            print("✅ Already authenticated with HuggingFace")
        except Exception:
            login(token)

        # Confirm user. Reuse the first whoami() answer when there is one,
        # so an authenticated session costs a single round trip.
        if user_info is None:
            try:
                user_info = whoami()
            except Exception:
                user_info = None

        try:
            username = user_info["name"]
            print(f"✅ HuggingFace login successful → {username}")
        except Exception:
            print("✅ HuggingFace login successful")

    except Exception as e:
        raise RuntimeError(
            "❌ HuggingFace authentication failed.\n"
            "Check that your token is valid and has model access.\n"
            f"Error: {str(e)}"
        ) from e


# Call before model loading: if the model is gated, authentication is required
# (prints a warning instead of failing when no token is available).
authenticate_huggingface()


🔐 Using HuggingFace token from Colab Secrets
✅ Already authenticated with HuggingFace
✅ HuggingFace login successful → NY0641


### **🖥️ Hardware Check (GPU + VRAM)**
The next cells define helper functions used during model initialization.

- `resolve_runtime_config()` decides whether the model runs on GPU or CPU and which precision to use.
- `safe_model_load()` wraps model loading and raises a clear error if loading fails or takes too long.

These utilities keep the loading logic clean.

In [6]:
import torch

def resolve_runtime_config(min_vram_gb=10):
    """
    Pick the precision and device placement for model loading.

    Rules:
    - No CUDA device            -> full precision on CPU
    - CUDA device, VRAM too low -> full precision on CPU
    - CUDA device, enough VRAM  -> half precision, automatic placement

    Only CUDA device 0 is inspected; its total memory is compared in
    decimal gigabytes (bytes / 1e9) against ``min_vram_gb``.

    Args:
        min_vram_gb: Minimum total VRAM (GB) required to use the GPU.

    Returns:
        torch_dtype, device_map
    """
    cpu_config = (torch.float32, "cpu")
    gpu_config = (torch.float16, "auto")

    # Guard clause: without CUDA there is nothing to measure.
    if not torch.cuda.is_available():
        return cpu_config

    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9

    # A GPU that is present but too small is treated like no GPU at all.
    return cpu_config if vram_gb < min_vram_gb else gpu_config


## ⏱️ Safe Model Loader

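Wraps model loading with error handling and a load-time check.
The elapsed time is checked after loading returns, so a load that exceeds the limit is reported as an error; a load that never returns is not interrupted.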


In [7]:
import time

def safe_model_load(load_fn, max_load_time=300):
    """
    Run ``load_fn`` and report load failures or slow loads as RuntimeError.

    The duration is checked only after ``load_fn`` has returned; a call that
    never returns is not interrupted by this function.

    Args:
        load_fn: Zero-argument callable that performs the load and returns
            its result.
        max_load_time: Maximum accepted duration in seconds.

    Returns:
        Whatever ``load_fn`` returned.

    Raises:
        RuntimeError: If ``load_fn`` raises (original exception attached as
            ``__cause__``) or if it took longer than ``max_load_time``.
    """

    # Monotonic clock: elapsed time is unaffected by system clock changes.
    start = time.monotonic()

    try:
        result = load_fn()
    except Exception as e:
        raise RuntimeError(f"Model failed to load: {str(e)}") from e

    # Checked outside the try block so this error is not caught and
    # re-wrapped as "Model failed to load: ...".
    elapsed = time.monotonic() - start
    if elapsed > max_load_time:
        raise RuntimeError(
            "Model loading took too long. "
            "You may be running on CPU or low resources."
        )

    return result


### **Model Initialization**

In [8]:
from transformers import AutoTokenizer, pipeline

# Model identifier passed to AutoTokenizer.from_pretrained() and pipeline()
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"

# 🔹 Resolve runtime automatically:
#    (float16, "auto") on a GPU with enough VRAM, otherwise (float32, "cpu")
torch_dtype, device_map = resolve_runtime_config()

# 🔹 Define loading function
def load_pipeline():
    """
    Build the tokenizer and the text-generation pipeline for MODEL_NAME.

    Reads the module-level ``MODEL_NAME``, ``torch_dtype`` and ``device_map``.

    Returns:
        (tokenizer, pipe): the loaded tokenizer and the pipeline built on it.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    pipe = pipeline(
        "text-generation",
        model=MODEL_NAME,
        tokenizer=tokenizer,
        # `torch_dtype=` is deprecated in the installed transformers release
        # (the load output warns "Use `dtype` instead"), so pass `dtype=`.
        dtype=torch_dtype,
        device_map=device_map
    )

    return tokenizer, pipe

# 🔹 Load through the wrapper: a loading failure, or a load that ran longer
#    than max_load_time, is raised as RuntimeError
tokenizer, llama_pipeline = safe_model_load(load_pipeline)


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

The pipeline is set up. Now we create a chatbot using Llama.

In [24]:
MAX_INPUT_TOKENS = 3000   # safe budget for prompt

def count_tokens(messages):
    """
    Render the conversation with the chat template and return its token count.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` chat messages.

    Returns:
        Length of the last dimension of the tokenized prompt's ``input_ids``,
        including the generation prompt appended for the assistant turn.
    """
    encoded = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        # Request a dict-like result explicitly: the lookup below relies on
        # it, and the default return type differs between library versions.
        return_dict=True,
    )
    return encoded["input_ids"].shape[-1]


In [25]:
def truncate_history(messages):
    """
    Drop the oldest exchanges until the prompt fits MAX_INPUT_TOKENS.

    The list is trimmed in place and also returned. Index 0 is assumed to be
    the system prompt and is always kept; the last entry (the newest message)
    is never removed, even if it exceeds the budget on its own.

    History is removed a whole exchange at a time: after dropping the oldest
    message, any following non-user messages are dropped as well, so the
    entry after the system prompt is always a user message. A history that
    starts with an assistant turn cannot be rendered by chat templates that
    enforce user/assistant alternation.

    Args:
        messages: Chat history as a list of ``{"role", "content"}`` dicts.

    Returns:
        The same list object, possibly shortened.
    """
    # len > 2 keeps [system prompt, newest message] untouched.
    while len(messages) > 2 and count_tokens(messages) > MAX_INPUT_TOKENS:
        messages.pop(1)   # remove oldest non-system message

        # Also remove the reply that belonged to it (or any other orphaned
        # non-user turn) so the history starts with a user message again.
        while len(messages) > 2 and messages[1].get("role") != "user":
            messages.pop(1)

    return messages


In [28]:
# Conversation history shared across turns. It starts with the system prompt;
# get_llama_response() appends each user message and assistant reply to it.
messages = [{
    "role":"system",
    "content":"You are a helpful, smart and friendly AI assistant. Do not leave the sentence incomplete."
}]

def get_llama_response(user_input: str) -> str:
    """
    Send one user turn to the model and return the assistant's reply.

    The user message is appended to the module-level ``messages`` history,
    the history is truncated to the token budget, a prompt is rendered with
    the chat template, and the generated reply is appended to the history.

    If truncation, templating or generation fails, the user message added by
    this call is removed again before the exception propagates, so a failed
    turn does not leave two consecutive user messages in the history.

    Args:
        user_input: Text typed by the user.

    Returns:
        The generated reply with surrounding whitespace stripped.
    """
    global messages

    # Add user message (kept in a variable so it can be rolled back by identity)
    user_message = {"role":"user","content":user_input}
    messages.append(user_message)

    try:
        # 🔹 Token-based truncation happens here
        messages = truncate_history(messages)

        # Convert chat into model prompt
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,  # return the prompt as text, not token ids
            add_generation_prompt=True  # tells the model it is the assistant's turn
        )

        # Generate response
        output = llama_pipeline(
            prompt,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_k=5,
            top_p=0.9,
            eos_token_id=tokenizer.eos_token_id,
            # Let the pipeline return only the newly generated text instead
            # of slicing the prompt off by character count.
            return_full_text=False
        )

        reply = output[0]["generated_text"].strip()

    except Exception:
        # Roll back this turn's user message so the history stays consistent.
        if messages and messages[-1] is user_message:
            messages.pop()
        raise

    # Save assistant reply
    messages.append({"role":"assistant","content":reply})

    return reply


In [29]:
# CLI loop
def run_chatbot():
    """
    Run the interactive chat loop until the user ends it.

    Reads a line from the user, sends it to ``get_llama_response`` and prints
    the reply. The loop ends when the user types "exit" (any case, surrounding
    whitespace ignored) or when input is closed or interrupted (EOF / Ctrl-C).
    Blank lines are ignored rather than sent to the model.
    """
    print("Llama chatbot is ready")

    while True:
        try:
            user_input = input("user: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted prompt ends the chat cleanly
            # instead of surfacing a traceback.
            print("chat ended.")
            break

        # Nothing typed: ask again instead of sending an empty turn.
        if not user_input:
            continue

        if user_input.lower() == "exit":
            print("chat ended.")
            break

        reply = get_llama_response(user_input)

        print("Chatbot: ",reply)

# Start the interactive loop only when this code runs as the main module
# (as it does in a notebook cell), not when it is imported.
if __name__ == "__main__":
  run_chatbot()

Llama chatbot is ready
user: Hi'


Passing `generation_config` together with generation-related arguments=({'eos_token_id', 'max_new_tokens', 'top_k', 'top_p', 'temperature', 'do_sample'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
Both `max_new_tokens` (=256) and `max_length`(=4096) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Chatbot:  Hello there! *smiling* It's nice to meet you! How can I assist you today? Is there something you need help with or want to chat about? I'm here to listen and help in any way I can. 😊
user: Are you finally runnig?


Both `max_new_tokens` (=256) and `max_length`(=4096) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Chatbot:  Oh, my apologies! I'm just an AI, I don't have the ability to run or move around physically. I exist solely as a digital entity, so I don't have a physical body that can run or move in the way that humans do. Is there anything else I can help you with? 😊
user: exit
chat ended.
