# 🎓 The Intern: Fine-Tuned Model Inference

This notebook demonstrates how to load and use "The Intern" - our Llama-3 8B model fine-tuned on Uber's 2024 Annual Report.

## Setup

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import os

# Load environment variables
def parse_env_line(line):
    """Parse one line of a .env file into a ``(name, value)`` pair.

    Accepts both ``KEY=VALUE`` and ``export KEY=VALUE``. Blank lines,
    ``#`` comments, lines without ``=``, and lines with an empty name
    yield ``None``. One pair of matching surrounding quotes (single or
    double) is removed from the value.

    Args:
        line: A raw line of text, with or without its trailing newline.

    Returns:
        ``(name, value)`` as strings, or ``None`` when the line carries
        no assignment.
    """
    line = line.strip()
    if not line or line.startswith('#'):
        return None

    # Drop only a *leading* `export` keyword; removing the substring anywhere
    # would corrupt values that happen to contain the text "export ".
    if line.startswith('export ') or line.startswith('export\t'):
        line = line[len('export'):].lstrip()

    name, separator, value = line.partition('=')
    name = name.strip()
    if not separator or not name:
        return None

    value = value.strip()
    # Remove a single pair of matching quotes, leaving inner quotes intact.
    if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
        value = value[1:-1]
    return name, value


try:
    with open('../.env', 'r', encoding='utf-8') as f:
        for line in f:
            parsed = parse_env_line(line)
            if parsed is not None:
                # Values from the file overwrite any variable already set.
                os.environ[parsed[0]] = parsed[1]
except FileNotFoundError:
    print("Note: .env file not found in parent directory")

print("Setup complete")

## Load Model

We load the base model with 4-bit (NF4) quantization and then attach the LoRA adapters.

In [None]:
# Base checkpoint on the Hub and the local directory holding the LoRA weights.
model_name = "meta-llama/Meta-Llama-3-8B"
adapter_path = "../models/lora_adapters"

# Read once; passed as the access token to both Hub downloads below.
hf_token = os.getenv('HF_TOKEN')

print(f"Loading {model_name} with adapters from {adapter_path}...")

# Step 1: base model, quantized to 4-bit NF4 with float16 compute.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=hf_token,
    device_map="auto",
    quantization_config=bnb_config,
)

# Step 2: wrap the base model with the adapters, then fetch the tokenizer
# that belongs to the base checkpoint.
model = PeftModel.from_pretrained(base_model, adapter_path)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

print("‚úì Model loaded successfully")

In [None]:
def query_intern(question: str, max_new_tokens=200):
    """Generate an answer from The Intern for a single question.

    The question is wrapped in the ``### Instruction:`` / ``### Response:``
    prompt template and passed to the module-level ``model`` and
    ``tokenizer``. Generation samples (temperature 0.7, top-p 0.9), so
    repeated calls can return different text.

    Args:
        question: The question to place in the instruction slot.
        max_new_tokens: Upper bound on the number of tokens generated.

    Returns:
        The decoded text of the newly generated tokens only, with special
        tokens removed and surrounding whitespace stripped.
    """
    prompt = f"""### Instruction:
{question}

### Response:
"""

    # The model is placed by device_map="auto", so follow its device rather
    # than assuming a device named "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            do_sample=True,
            top_p=0.9
        )

    # generate() returns prompt + continuation. Slice the prompt off by token
    # count instead of splitting the decoded text on "### Response:", which
    # breaks when that marker also appears in the question or in the output.
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

## Inference Testing

Let's test the model on specific questions about the annual report.

In [None]:
# Sample prompts covering financials, risks, competition and the CEO letter.
questions = [
    "What was Uber's total revenue in 2024?",
    "What are the main risk factors mentioned?",
    "How does Uber describe its competitive position?",
    "What messages did the CEO share with shareholders?",
]

# Print each question first, then generate and print its answer, followed by
# a 60-character rule so consecutive results are easy to tell apart.
for question in questions:
    print(f"\n‚ùì Question: {question}")
    print(f"üí° Answer: {query_intern(question)}\n")
    print("-"*60)