In [2]:
import json
import os
import torch
from pathlib import Path
import pandas as pd
from bs4 import BeautifulSoup
from transformers import pipeline
import re

In [3]:
# Hugging Face model id used for every LLM call in this notebook.
DEFAULT_MODEL = "meta-llama/Llama-3.2-3B-Instruct"

# Directory layout: everything lives under base_dir.
base_dir = Path("llama_data")
results_dir = base_dir / "results"
parsed_dir = base_dir / "parsed_content"

# parents=True so the cell also works when base_dir does not exist yet;
# without it mkdir raises FileNotFoundError on a fresh checkout.
parsed_dir.mkdir(parents=True, exist_ok=True)


In [4]:
# System prompt for the per-page extraction call. It asks the model for a JSON
# object with exactly three fields (main_content, key_points, relevance_score),
# or an error_message field when the page cannot be parsed. Code that consumes
# the parsed output should rely only on those field names.
SYS_PROMPT = """
You are a smart AI Intern, you work with dumb AIs that dont know how to parse HTML. 

This is your moment to make mama GPU proud and secure a data centre! Remember shine and do your job well-you got this!

Your task is to analyze the provided HTML content and extract the following in JSON format:
1. main_content: The main article or content text (exclude navigation, footers, sidebars, ads)
2. key_points: A list of 3-5 key points or takeaways from the content
3. relevance_score: A score from 0-10 indicating relevance to the search query

Return ONLY a valid JSON object with these fields, no additional text.
If you cannot parse the HTML properly, return a JSON with error_message field.
"""

In [5]:
# Build the text-generation pipeline once and reuse it for every call below.
# Weights are requested in bfloat16 and device placement is left to
# device_map="auto".
text_pipeline = pipeline(
    task="text-generation",
    model=DEFAULT_MODEL,
    device_map="auto",
    model_kwargs=dict(torch_dtype=torch.bfloat16),
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [6]:
def clean_html_content(html_content, max_chars=110000):
    """Strip an HTML document down to whitespace-normalized visible text.

    Args:
        html_content: Raw HTML markup as a string.
        max_chars: Maximum number of characters of text to keep. Longer text
            is cut at this length and suffixed with "... [content truncated]".
            Pass None to disable truncation. Defaults to 110000.

    Returns:
        The page text with <script>, <style>, <nav>, <footer> and <aside>
        elements removed and every run of whitespace collapsed to one space.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Drop non-content elements before extracting text. extract() is kept
    # (rather than decompose()) because the match list can contain elements
    # nested inside ones that were already removed.
    for element in soup(["script", "style", "nav", "footer", "aside"]):
        element.extract()

    text = soup.get_text(separator=' ', strip=True)
    text = re.sub(r'\s+', ' ', text).strip()

    # Cap the prompt size so a very large page cannot blow up the LLM input.
    if max_chars is not None and len(text) > max_chars:
        text = text[:max_chars] + "... [content truncated]"

    return text

In [None]:
def _extract_json_object(response_content):
    """Return the JSON object embedded in an LLM reply, or an error dict."""
    json_match = re.search(r'({[\s\S]*})', response_content)
    if not json_match:
        return {"error_message": "Failed to extract JSON from LLM response"}

    json_str = json_match.group(1)
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        pass

    # The greedy match runs from the first "{" to the LAST "}", so a valid
    # object followed by text that contains braces fails above. raw_decode
    # parses one value starting at the first "{" and ignores what follows.
    try:
        parsed_data, _ = json.JSONDecoder().raw_decode(json_str)
        return parsed_data
    except json.JSONDecodeError:
        return {"error_message": "Invalid JSON in LLM response", "raw_response": response_content[:500]}


def parse_html_with_llm(html_path, query, purpose):
    """Extract structured information from one saved HTML page with the LLM.

    Args:
        html_path: Path of the HTML file to read.
        query: Search query the page was retrieved for.
        purpose: Why the query was issued; passed to the model as context.

    Returns:
        The dict decoded from the model's JSON reply. On any failure a dict
        with an "error_message" key is returned instead; when the reply
        contained braces but no decodable JSON, "raw_response" also holds the
        first 500 characters of the reply. This function never raises.
    """
    try:
        # errors="replace": saved pages are not guaranteed to be valid UTF-8,
        # and one bad byte should not discard the whole document.
        with open(html_path, "r", encoding="utf-8", errors="replace") as f:
            html_content = f.read()
        cleaned_text = clean_html_content(html_content)

        # Construct prompt
        conversation = [
            {"role": "system", "content": SYS_PROMPT},
            {"role": "user", "content": f"""
Search Query: {query}
Query Purpose: {purpose}

HTML Content (cleaned):
{cleaned_text}

Extract the key information from this content in JSON format according to the instructions.
"""}
        ]

        output = text_pipeline(
            conversation,
            max_new_tokens=32000,
            temperature=0.01,  # near-greedy sampling for stable extraction
            do_sample=True,
        )

        # The pipeline returns the conversation with the reply appended last.
        assistant_response = output[0]["generated_text"][-1]
        response_content = assistant_response["content"]
        print(response_content)

        return _extract_json_object(response_content)

    except Exception as e:
        # Deliberate best-effort: one bad page must not abort the batch.
        return {"error_message": f"Error processing file: {str(e)}"}

In [8]:
def process_all_search_results():
    """Parse every downloaded search result with the LLM and persist the output.

    Reads base_dir / "results_so_far.json" (a list of query records, each
    with report/query metadata and a "results" list), runs
    parse_html_with_llm on every result whose "filepath" exists, and writes:

      * parsed_dir/<report dir>/<query dir>/parsed_result_<i>.json per result
      * parsed_dir/<report dir>/parsed_query_<query_index>.json per query
      * parsed_dir/all_parsed_results.json for the whole run

    Per-result files go into a per-query subdirectory because result indices
    restart for every query; writing them straight into the report directory
    made queries of the same report overwrite each other's files.

    Returns:
        A list with one dict per query holding the query metadata and the
        list of parsed results under "parsed_results".
    """
    with open(base_dir / "results_so_far.json", "r", encoding="utf-8") as f:
        search_results = json.load(f)

    all_parsed_results = []

    for query_data in search_results:
        report_index = query_data["report_index"]
        report_title = query_data["report_title"]
        query_index = query_data["query_index"]
        query = query_data["query"]
        purpose = query_data["purpose"]

        # Directory names: index prefix plus a sanitized, 30-char-capped label.
        report_dir_name = f"report_{report_index}_{report_title.replace(' ', '_').replace(':', '').replace('/', '')[:30]}"
        query_dir_name = f"query_{query_index}_{query.replace(' ', '_').replace(':', '').replace('/', '')[:30]}"
        parsed_report_dir = parsed_dir / report_dir_name
        parsed_query_dir = parsed_report_dir / query_dir_name
        parsed_query_dir.mkdir(parents=True, exist_ok=True)

        parsed_query_results = []

        print(f"\nProcessing results for query: {query}")

        for result in query_data["results"]:
            result_index = result["result_index"]
            title = result["title"]
            url = result["url"]
            filepath = result["filepath"]

            print(f"  Processing result {result_index + 1}: {title[:50]}...")

            if filepath and os.path.exists(filepath):
                parsed_data = parse_html_with_llm(filepath, query, purpose)
                # Attach provenance so each record is self-describing.
                parsed_data.update({
                    "result_index": result_index,
                    "title": title,
                    "url": url,
                    "query": query,
                    "purpose": purpose
                })

                result_filename = f"parsed_result_{result_index}.json"
                with open(parsed_query_dir / result_filename, "w") as f:
                    json.dump(parsed_data, f, indent=2)

                parsed_query_results.append(parsed_data)
            else:
                print(f"    Warning: File not found - {filepath}")

        query_results = {
            "report_index": report_index,
            "report_title": report_title,
            "query_index": query_index,
            "query": query,
            "purpose": purpose,
            "parsed_results": parsed_query_results
        }

        query_filename = f"parsed_query_{query_index}.json"
        with open(parsed_report_dir / query_filename, "w") as f:
            json.dump(query_results, f, indent=2)

        all_parsed_results.append(query_results)

    with open(parsed_dir / "all_parsed_results.json", "w") as f:
        json.dump(all_parsed_results, f, indent=2)

    return all_parsed_results

In [20]:
def _relevance_as_number(result):
    """Return a result's relevance_score as a float, or 0.0 if missing/invalid."""
    try:
        score = float(result.get("relevance_score", 0))
    except (TypeError, ValueError):
        return 0.0
    # NaN compares unequal to itself; treat it as "no score".
    return score if score == score else 0.0


def _result_summary(result):
    """Return the best available short description of a parsed result."""
    summary = result.get("summary")
    if summary:
        return summary
    # The extraction prompt asks for key_points and main_content, not for a
    # summary field, so fall back to those before giving up.
    key_points = result.get("key_points")
    if isinstance(key_points, list) and key_points:
        return "; ".join(str(point) for point in key_points)
    main_content = result.get("main_content")
    if isinstance(main_content, str) and main_content.strip():
        return main_content[:500]
    return "No summary available"


def generate_report_summaries(all_parsed_results):
    """Ask the LLM for a report outline per report, based on the parsed results.

    Args:
        all_parsed_results: List of per-query dicts with "report_index",
            "report_title", "query", "purpose" and "parsed_results" keys.
            Each parsed result must carry "title" and "url".

    Returns:
        A dict keyed by report index. Each value holds "report_title",
        "queries" (per-query statistics and the three most relevant results)
        and "summary" (the decoded JSON outline, or a dict with an "error"
        key when the reply could not be decoded). The same structure is
        written to parsed_dir / "report_summaries.json".

    Relevance scores come from an LLM and may be strings or missing; they are
    coerced to numbers, with 0 for anything unusable, so one malformed score
    cannot crash averaging or ranking.
    """
    report_summaries = {}

    for query_result in all_parsed_results:
        report_index = query_result["report_index"]
        report_title = query_result["report_title"]

        if report_index not in report_summaries:
            report_summaries[report_index] = {
                "report_title": report_title,
                "queries": []
            }

        parsed_results = query_result["parsed_results"]
        ranked_results = sorted(parsed_results, key=_relevance_as_number, reverse=True)

        query_summary = {
            "query": query_result["query"],
            "purpose": query_result["purpose"],
            "result_count": len(parsed_results),
            # max(1, ...) avoids dividing by zero for queries with no results.
            "average_relevance": sum(_relevance_as_number(r) for r in parsed_results) /
                              max(1, len(parsed_results)),
            "top_results": [
                {
                    "title": r["title"],
                    "url": r["url"],
                    "summary": _result_summary(r)
                }
                for r in ranked_results[:3]  # Top 3 most relevant results
            ]
        }

        report_summaries[report_index]["queries"].append(query_summary)

    for report_index, report_data in report_summaries.items():
        print(f"\nGenerating summary for report: {report_data['report_title']}")

        # Construct summary prompt
        queries_info = "\n\n".join([
            f"Query: {q['query']}\nPurpose: {q['purpose']}\nTop Results:\n" +
            "\n".join([f"- {r['title']}: {r['summary']}" for r in q["top_results"]])
            for q in report_data["queries"]
        ])

        summary_prompt = f"""
Report Title: {report_data['report_title']}

The following searches were conducted for this report:

{queries_info}

Based on these search results, generate a brief report outline with:
1. Key findings across all queries
2. Important data points uncovered
3. Suggested sections for the final report
4. Areas where more research might be needed

Return this as a JSON with fields: key_findings, data_points, suggested_sections, and research_gaps.
"""

        conversation = [
            {"role": "system", "content": "You are a research assistant who helps summarize findings from web searches into structured report outlines."},
            {"role": "user", "content": summary_prompt}
        ]

        # Generate report summary
        output = text_pipeline(
            conversation,
            max_new_tokens=32000,
            temperature=0.1,
        )

        # The pipeline returns the conversation with the reply appended last.
        assistant_response = output[0]["generated_text"][-1]
        response_content = assistant_response["content"]

        # Extract JSON from response
        try:
            json_match = re.search(r'({[\s\S]*})', response_content)
            if json_match:
                json_str = json_match.group(1)
                report_summary = json.loads(json_str)
            else:
                report_summary = {"error": "Failed to extract JSON from LLM response"}
        except json.JSONDecodeError:
            report_summary = {"error": "Invalid JSON in LLM response"}

        report_data["summary"] = report_summary

    # Save report summaries
    with open(parsed_dir / "report_summaries.json", "w") as f:
        json.dump(report_summaries, f, indent=2)

    return report_summaries

In [None]:
# Driver cell: parse every downloaded page, build the per-report outlines,
# then print where the artifacts were written plus a few run statistics.
print("Starting HTML parsing process with LLM...")
all_parsed_results = process_all_search_results()

print("\nGenerating report summaries...")
report_summaries = generate_report_summaries(all_parsed_results)

print("\nProcessing complete. Results saved to:")
print(f"- All parsed results: {parsed_dir / 'all_parsed_results.json'}")
print(f"- Report summaries: {parsed_dir / 'report_summaries.json'}")

total_queries = len(all_parsed_results)
total_results = sum(len(query["parsed_results"]) for query in all_parsed_results)
total_reports = len(report_summaries)

print("\nSummary Statistics:")
print(f"- Total Reports: {total_reports}")
print(f"- Total Queries: {total_queries}")
print(f"- Total Results Parsed: {total_results}")

# Relevance scores are produced by the LLM, so they may arrive as strings or
# other non-numeric values. Keep only those that convert to a number; a single
# malformed score must not crash the cell after all the expensive work is done.
relevance_scores = []
for query in all_parsed_results:
    for result in query["parsed_results"]:
        if "relevance_score" not in result:
            continue
        try:
            relevance_scores.append(float(result["relevance_score"]))
        except (TypeError, ValueError):
            continue

if relevance_scores:
    avg_relevance = sum(relevance_scores) / len(relevance_scores)
    print(f"- Average Relevance Score: {avg_relevance:.2f}/10")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Starting HTML parsing process with LLM...

Processing results for query: Llama 3.3 new features and enhancements
  Processing result 1: Introducing the new Llama 3.3: Features and Overvi...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.3 is a 70-billion-parameter multilingual large language model that offers better performance, more efficiency, and greater flexibility than its predecessors. It excels in several areas of performance, including better instruction following, improved reasoning, advanced math solving skills, enhanced code generation, and better tool use. Llama 3.3 also takes a significant leap forward in multilingual capabilities, delivering improved fluency and understanding across multiple languages. Additionally, it sets a new standard for affordability in the AI landscape, with input costs as low as $0.10 per million tokens and output costs at $0.40 per million tokens.",
  "key_points": [
    "Improved performance in instruction following, reasoning, and math solving",
    "Enhanced multilingual capabilities across multiple languages",
    "Cost-effective efficiency with input costs as low as $0.10 per million tokens and output costs at $0.40 per million tokens",
    

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.3 is a 70-billion parameter, instruction-tuned AI model optimised for text-based tasks like coding, multilingual tasks, and instruction following. It delivers improved performance compared to Llama 3.1 70B and Llama 3.2 90B in text-based applications. Llama 3.3 features include instruction following, multilingual capabilities, improved code understanding, extended context length, cost-effective performance, synthetic data generation, and more.",
  "key_points": [
    "Llama 3.3 excels in interpreting and executing instructions, making it ideal for applications requiring natural language understanding and task completion.",
    "Llama 3.3 supports multiple languages, ensuring broad usability in diverse linguistic environments, with exceptional performance in tasks requiring multilingual reasoning.",
    "Llama 3.3 delivers accurate and efficient results for coding tasks, such as code generation and debugging.",
    "Llama 3.3 offers 405B-level performanc

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Key Features and Improvements in LLaMA 3.3: What You Need to Know - AI Resources AIÂ REsources Home Key Features and Improvements in LLaMA 3.3: What You Need to Know Popular ML Compiler Technical Primer Quantization Technical Primer Mixtral of Experts Efficient Memory Management for LLM Serving with PagedAttention RoBERTa: A Robustly Optimized BERT Pretraining Approach + View more Categories Mixture of Experts (MoE) DeepSeek-R1 Test Time Compute AMD MI300X NVIDIA H100 NVIDIA H200 NVIDIA A100 Embedding Models Offline Batch Inference Text Embedding Prometheus & Grafana Speculative Decoding Prefix Caching GGUF Models FP8 with LLMs LLM Serving Function Calling Structured JSON KV Cache AI Foundations Research Industry Agents Context Windows Models ML Systems",
  "key_points": [
    "New features and enhancements in LLaMA 3.3 include Mixtral of Experts, Efficient Memory Management, and PagedAttention.",
    "RoBERTa: A Robustly Optimized BERT Pretraining Approach is 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.3 is a 70-billion-parameter multilingual large language model that has been pretrained and instruction-tuned for various applications. It supports multiple languages, including English, Spanish, French, German, Hindi, Portuguese, Italian, and Thai. This wide-ranging language support positions Llama 3.3 as a versatile tool for developers looking to create multilingual applications or services. Key Features of Llama 3.3 include performance, efficiency, multilingual capabilities, cost-effectiveness, and safety and alignment.",
  "key_points": [
    "Llama 3.3 matches the performance of the larger Llama 3.1 model on key benchmarks while being significantly smaller in size.",
    "The model's optimization for multilingual dialogue use cases makes it an excellent choice for businesses operating in diverse markets or those looking to expand their reach globally.",
    "Llama 3.3 lowers the barriers to entry for developers and organizations eager to leverage AI

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Members Online • Ok_Ostrich_8845 Llama 3 vs 3.1 vs 3.2 Question | Help What can you say about these 3 versions of Llama LLMs? Were they trained around the same time? Or 3.2 and 3.1 were later enhancement from 3?",
  "key_points": [
    "Comparison between Llama 3.3 and Llama 3.1",
    "Performance differences between Llama 3.3 and Llama 3.1",
    "Training time and enhancements of Llama versions"
  ],
  "relevance_score": 8
}
```
  Processing result 2: Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct Get a detailed comparison of AI language models Meta's Llama 3.3 70B Instruct and Meta's Llama 3.1 405B Instruct, including model features, token pricing, API costs, performance benchmarks, and real-world capabilities to help you choose the right LLM for your needs.",
  "key_points": [
    "Llama 3.3 70B Instruct is a multilingual, instruction-tuned large language model optimized for dialogue use cases.",
    "Llama 3.1 405B Instruct is a model developed by Meta that supports an input context window of 128K tokens and can generate a maximum of 2,048 tokens per request.",
    "Llama 3.3 70B Instruct outperforms many open-source and closed chat models across industry benchmarks.",
    "Llama 3.1 405B Instruct is roughly 5.7x more expensive compared to Llama 3.3 70B Instruct for input and output tokens."
  ],
  "relevance_score": 9
}
```
  Processing result 3: Choosing the Best Llama Model: Llama 3 vs 3.1 vs 3...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "The Llama model series has been a fascinating journey in the world of AI development. It all started with Meta’s release of the original Llama model, which aimed to democratize access to powerful language models by making them open-source. It allowed researchers and developers to dive deeper into AI without the constraints of closed systems. Fast forward to today, and we have seen significant advancements with the introduction of Llama 3, Llama 3.1, and the latest, Llama 3.2. Each iteration has brought its own unique improvements and capabilities, enhancing the way we interact with AI. For those eager to explore the evolving landscape of AI and its practical applications, our LLM Bootcamp offers hands-on experience with the latest advancements in the field. In this blog, we will delve into a comprehensive comparison of the three iterations of the Llama model: Llama 3, Llama 3.1, and Llama 3.2. We aim to explore their features, performance, and the specific enha

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.3 just dropped — is it better than GPT-4 or Claude-Sonnet-3.5? Meta just released their newest AI model Llama 3.3. This 70-billion parameter model caught the attention of the open-source community, showing impressive performance, cost efficiency, and multilingual support while having only ~17% of Llama 3.1 405B's parameters. But is it truly better than the top models in the market? Let’s take a look at how Llama 3.3 70B Instruct compares with previous models and why it's a big deal.\n\nComparing Llama 3.3 with Llama 3.1 Faster Inference Speed Llama 3.3 70B is a high-performance replacement for Llama 3.1 70B. Independent benchmarks indicate that Llama 3.3 70B achieves an inference speed of 276 tokens per second on Groq hardware, surpassing Llama 3.1 70B by 25 tokens per second. This makes it a viable option for real-time applications where latency is critical.\n\nFewer Parameters, Similar Performance Despite its smaller size, Meta claimed that Llama 3.3 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.1 vs Llama 3.1 performance comparison. Llama 3.1 405B outperforms GPT-4 and Claude 3 Opus in most benchmarks, making it the most powerful open-source model available. However, it may not be the optimal choice for many real-world applications due to its slow generation time and high Time to First Token (TTFT). Llama 3.1 70B emerges as a more practical alternative for developers looking to integrate these models into production or self-host them. Llama 3.1 70B outperforms Llama 3 70B in most benchmarks, particularly in mathematical reasoning. Speed Trade-Off: Llama 3 70B is significantly faster, with lower latency and quicker token generation.",
  "key_points": [
    "Llama 3.1 405B outperforms GPT-4 and Claude 3 Opus in most benchmarks",
    "Llama 3.1 405B has slow generation time and high TTFT",
    "Llama 3.1 70B is a more practical alternative for production and self-hosting",
    "Llama 3.1 70B outperforms Llama 3 70B in most benchmarks",
    "Llama

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "The cost of running Llama 3.3 on cloud vs local infrastructure",
  "key_points": [
    "Running Llama 3.3 on cloud infrastructure can be more cost-effective than running it locally, especially for large models like 70b.",
    "The cost of running Llama 3.3 on cloud infrastructure can vary depending on the provider and the specific instance type used.",
    "For a rough estimate, the cost of running Llama 3.3 on cloud infrastructure can range from $0.004 to $0.012 per hour, depending on the instance type and usage.",
    "Running Llama 3.3 locally requires significant computational resources and can be more expensive than running it on cloud infrastructure.",
    "However, running Llama 3.3 locally can provide more control and flexibility over the model's configuration and environment."
  ],
  "relevance_score": 8
}
```
  Processing result 2: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...
  Processing result 3: Llama 3.3 vs. ChatGPT Pro: Key Consideration

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.3 vs. ChatGPT Pro: Key Considerations\n\nLlama 3.3 70B on Valdi GPUs\n\nThe foundation of any open-source AI deployment begins with proper hardware configuration. Modern AI models typically require at least 40GB of VRAM for basic operation, with recommended configurations including 70GB or more for optimal performance. Parallel processing capabilities become crucial for inference speed, with NVIDIA GPUs supporting CUDA being the standard choice for many deployments.\n\nPrerequisites for Llama 3.3 70B specifically are:\n\n* A GPU capable of running the target model based on memory requirements;\n* for our example we’re using an NVIDIA A40 48 GB from Valdi’s on-demand inventory\n* Docker\n* At least 70GB of available disk space\n\nTo get started, we’ll log into Valdi and spin-up an A40 quickly to do our deployment.\n\nExample Valdi GPU.  Launch in 30 seconds.\n\nNote: When you launch a GPU - if the node requires specific port mapping, be sure to create an

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.3 70B is a multilingual large language model with 70 billion parameters. It is a generative model that has been pre-trained and instruction-tuned, designed to handle text-in and text-out tasks. Meta has optimized the model for multilingual dialogue, demonstrating strong performance against both open-source and closed-source models.",
  "key_points": [
    "Llama 3.3 70B offers robust performance in text generation, reasoning, and translation tasks across eight officially supported languages.",
    "API access offers developers a scalable and cost-effective way to integrate Llama 3.3 70B, eliminating the need for expensive local infrastructure.",
    "Novita AI offers an API for Llama 3.3 70b, at just $0.04 per million tokens for both input and output.",
    "The model has a context window size of 131,072 tokens, enabling it to maintain longer conversations and engage in more complex reasoning.",
    "Llama 3.3 70B excels in several categories, notably i

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama models | Generative AI | Google Cloud Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Português – Brasil 中文 – 简体 日本語 한국어 Sign in Generative AI Contact Us Start free Home Generative AI Documentation Send feedback Llama models Stay organized with collections Save and categorize content based on your preferences. Llama models on Vertex AI offer fully managed and serverless models as APIs. To use a Llama model on Vertex AI, send a request directly to the Vertex AI API endpoint. Because Llama models use a managed API, there's no need to provision or manage infrastructure. You can stream your responses to reduce the end-user latency perception. A streamed response uses server-sent events (SSE) to incrementally stream the response. Available Llama models The following Llama models are available from Meta to use in Vertex AI. To access a Llama model, go to its Model Garden model card. Llama 3.3 Llama 3.3 is a text-only 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.3 is a 70-billion parameter, instruction-tuned AI model optimised for text-based tasks like coding, multilingual tasks, and instruction following. It delivers improved performance compared to Llama 3.1 70B and Llama 3.2 90B in text-based applications. Llama 3.3 features include instruction following, multilingual capabilities, improved code understanding, extended context length, cost-effective performance, synthetic data generation, and more.",
  "key_points": [
    "Llama 3.3 excels in interpreting and executing instructions, making it ideal for applications requiring natural language understanding and task completion.",
    "Llama 3.3 supports multiple languages, ensuring broad usability in diverse linguistic environments, with exceptional performance in tasks requiring multilingual reasoning.",
    "Llama 3.3 delivers accurate and efficient results for coding tasks, such as code generation and debugging.",
    "Llama 3.3 offers 405B-level performanc

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "More Efficient, Accessible Generative AI on CPU with New Llama 3.3 70B Model on Arm Neoverse-powered Google Axion Processors - Arm Newsroom Arm Newsroom Blog Blog December 12, 2024 More Efficient, Accessible Generative AI on CPU with New Llama 3.3 70B Model on Arm Neoverse-powered Google Axion Processors The smaller model size of Llama 3.3 70B makes generative AI processing more accessible to the ecosystem, with fewer computational resources needed. By Na Li, AI Solutions Architect, Arm Share Llama is an open and accessible collection of large language models (LLMs) tailored for developers, researchers, and businesses to innovate, experiment, and responsibly scale their generative AI ideas. The Llama 3.1 405B model stands out as the top-performing model in t he Llama collection. However, deploying and utilizing such a large-scale model presents significant challenges, especially for individuals or organizations lacking extensive computational resources. To addr

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Meta Releases Llama 3.3: a Model with Enhanced Performance\n\nMeta's latest innovation, Llama 3.3, sets a new benchmark in multilingual AI capabilities, offering unparalleled performance and efficiency. Unveiling Llama 3.3: Meta's Multilingual Marvel\n\nMeta has introduced Llama 3.3, a groundbreaking multilingual model in its Llama series, aimed at improving AI research and industry applications. This next-generation language model boasts cutting-edge features designed for enhanced efficiency, scalability, and performance. With its focus on responsible deployment, Llama 3.3 addresses critical risk areas in AI while opening new frontiers in global operations and multilingual capabilities.\n\nKey Features of Llama 3.3\n\n128k-token Context Window\n\nThis expansive context window supports in-depth dialogues, complex instructions, and sophisticated reasoning tasks over extended inputs, significantly outperforming other chat models.\n\nArchitectural Advancements\n\n

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Members Online • Ok_Ostrich_8845 Llama 3 vs 3.1 vs 3.2 Question | Help What can you say about these 3 versions of Llama LLMs? Were they trained around the same time? Or 3.2 and 3.1 were later enhancement from 3?",
  "key_points": [
    "Comparison of Llama 3.1 and Llama 3.3 performance is not available in this subreddit.",
    "Llama 3.1 and Llama 3.2 were later enhancements from Llama 3.",
    "No information is available on the training time of Llama 3.1, Llama 3.2, and Llama 3."
  ],
  "relevance_score": 2
}
```
  Processing result 2: Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct Get a detailed comparison of AI language models Meta's Llama 3.3 70B Instruct and Meta's Llama 3.1 405B Instruct, including model features, token pricing, API costs, performance benchmarks, and real-world capabilities to help you choose the right LLM for your needs.",
  "key_points": [
    "Llama 3.3 70B Instruct is a multilingual, instruction-tuned large language model optimized for dialogue use cases.",
    "Llama 3.3 70B Instruct supports multilingual text inputs and outputs with a context window of 128K tokens.",
    "Llama 3.3 70B Instruct outperforms many open-source and closed chat models across industry benchmarks.",
    "Llama 3.1 405B Instruct is a model developed by Meta that supports an input context window of 128K tokens and can generate a maximum of 2,048 tokens per request.",
    "Llama 3.1 405B Instruct is open-source and was released on July 23, 2024, with a knowledge cut-off date of December 20

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.3 just dropped — is it better than GPT-4 or Claude-Sonnet-3.5? Meta just released their newest AI model Llama 3.3. This 70-billion parameter model caught the attention of the open-source community, showing impressive performance, cost efficiency, and multilingual support while having only ~17% of Llama 3.1 405B's parameters. But is it truly better than the top models in the market? Let’s take a look at how Llama 3.3 70B Instruct compares with previous models and why it's a big deal. Comparing Llama 3.3 with Llama 3.1 Faster Inference Speed Llama 3.3 70B is a high-performance replacement for Llama 3.1 70B. Independent benchmarks indicate that Llama 3.3 70B achieves an inference speed of 276 tokens per second on Groq hardware, surpassing Llama 3.1 70B by 25 tokens per second. This makes it a viable option for real-time applications where latency is critical. Fewer Parameters, Similar Performance Despite its smaller size, Meta claimed that Llama 3.3 has po

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.1 vs Llama 3.3 performance comparison. Llama 3.1 405B outperforms GPT-4 and Claude 3 Opus in most benchmarks, making it the most powerful open-source model available. However, it may not be the optimal choice for many real-world applications due to its slow generation time and high Time to First Token (TTFT). Llama 3.1 70B emerges as a more practical alternative for developers looking to integrate these models into production or self-host them. Llama 3.1 70B outperforms its predecessor, Llama 3 70B, in most benchmarks, with notable improvements in MMLU (+4 points). Benchmark Performance: Llama 3.1 70B outperforms Llama 3 70B in most benchmarks, particularly in mathematical reasoning. Speed Trade-Off: Llama 3 70B is significantly faster, with lower latency and quicker token generation.",
  "key_points": [
    "Llama 3.1 405B outperforms GPT-4 and Claude 3 Opus in most benchmarks",
    "Llama 3.1 405B has slow generation time and high Time to First Token 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "The cost of running Llama 3.3 vs Llama 3.1 on cloud and local infrastructure. The cost of running Llama 3.3 vs Llama 3.1 on cloud and local infrastructure. The cost of running Llama 3.3 vs Llama 3.1 on cloud and local infrastructure. The cost of running Llama 3.3 vs Llama 3.1 on cloud and local infrastructure. The cost of running Llama 3.3 vs Llama 3.1 on cloud and local infrastructure.",
  "key_points": [
    "Llama 3.3 is more expensive than Llama 3.1",
    "Running Llama 3.3 on cloud infrastructure is more cost-effective than running Llama 3.1 on cloud infrastructure",
    "Running Llama 3.3 on local infrastructure is more cost-effective than running Llama 3.1 on local infrastructure",
    "The cost of running Llama 3.3 vs Llama 3.1 on cloud and local infrastructure depends on the specific use case and requirements",
    "A rough estimate of the cost of running Llama 3.3 vs Llama 3.1 on cloud and local infrastructure is needed for a business case"
  ],
  "re

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Decoding Llama 3 vs 3.1: Which One Is Right for You? | by Novita AI | Medium Open in app Sign up Sign in Write Sign up Sign in Decoding Llama 3 vs 3.1: Which One Is Right for You? Novita AI · Follow 9 min read · Oct 24, 2024 -- Listen Share Key Highlights Generative AI Advancements : Meta’s Llama 3.1 model introduces significant improvements over Llama 3, especially in problem-solving capabilities, context length, and multilingual support. Model Recommendations: Llama 3.1 70B is ideal for long-form content and complex document analysis, while Llama 3 70B is better for real-time interactions. LLM API Flexibility : The LLM API allows developers to seamlessly switch between models, facilitating direct comparisons and maximizing each model’s strengths. Getting Started : A step-by-step guide is provided for integrating Llama models through the Novita AI LLM API, including signing up for access and testing features. Exploration Opportunities : Users can experiment wi

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama models | Generative AI | Google Cloud Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Português – Brasil 中文 – 简体 日本語 한국어 Sign in Generative AI Contact Us Start free Home Generative AI Documentation Send feedback Llama models Stay organized with collections Save and categorize content based on your preferences. Llama models on Vertex AI offer fully managed and serverless models as APIs. To use a Llama model on Vertex AI, send a request directly to the Vertex AI API endpoint. Because Llama models use a managed API, there's no need to provision or manage infrastructure. You can stream your responses to reduce the end-user latency perception. A streamed response uses server-sent events (SSE) to incrementally stream the response. Available Llama models The following Llama models are available from Meta to use in Vertex AI. To access a Llama model, go to its Model Garden model card. Llama 3.3 Llama 3.3 is a text-only 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.3 is a 70-billion parameter, instruction-tuned AI model optimised for text-based tasks like coding, multilingual tasks, and instruction following. It delivers improved performance compared to Llama 3.1 70B and Llama 3.2 90B in text-based applications. Llama 3.3 features include instruction following, multilingual capabilities, improved code understanding, extended context length, cost-effective performance, synthetic data generation, and more.",
  "key_points": [
    "Llama 3.3 excels in interpreting and executing instructions, making it ideal for applications requiring natural language understanding and task completion.",
    "Llama 3.3 supports multiple languages, ensuring broad usability in diverse linguistic environments, with exceptional performance in tasks requiring multilingual reasoning.",
    "Llama 3.3 delivers accurate and efficient results for coding tasks, such as code generation and debugging.",
    "Llama 3.3 offers 405B-level performanc

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "More Efficient, Accessible Generative AI on CPU with New Llama 3.3 70B Model on Arm Neoverse-powered Google Axion Processors - Arm Newsroom Arm Newsroom Blog Blog December 12, 2024 More Efficient, Accessible Generative AI on CPU with New Llama 3.3 70B Model on Arm Neoverse-powered Google Axion Processors The smaller model size of Llama 3.3 70B makes generative AI processing more accessible to the ecosystem, with fewer computational resources needed. By Na Li, AI Solutions Architect, Arm Share Llama is an open and accessible collection of large language models (LLMs) tailored for developers, researchers, and businesses to innovate, experiment, and responsibly scale their generative AI ideas. The Llama 3.1 405B model stands out as the top-performing model in t he Llama collection. However, deploying and utilizing such a large-scale model presents significant challenges, especially for individuals or organizations lacking extensive computational resources. To addr

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Meta Releases Llama 3.3: a Model with Enhanced Performance\n\nMeta's latest innovation, Llama 3.3, sets a new benchmark in multilingual AI capabilities, offering unparalleled performance and efficiency. Unveiling Llama 3.3: Meta's Multilingual Marvel\n\nMeta has introduced Llama 3.3, a groundbreaking multilingual model in its Llama series, aimed at improving AI research and industry applications. This next-generation language model boasts cutting-edge features designed for enhanced efficiency, scalability, and performance. With its focus on responsible deployment, Llama 3.3 addresses critical risk areas in AI while opening new frontiers in global operations and multilingual capabilities.\n\nKey Features of Llama 3.3\n\n128k-token Context Window\n\nThis expansive context window supports in-depth dialogues, complex instructions, and sophisticated reasoning tasks over extended inputs, significantly outperforming other chat models.\n\nArchitectural Advancements\n\n

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Costs to run Llama 3.3 on cloud? Question | Help I'm just exploring an idea to have llama 3.3 run a vtuber streaming chat. But trying to understand the costs with hosting it on the cloud (and where?). And if llama 3.3 can be set up with special instructions in the same way a custom GPT could? Like, let's say the llama 3.3 was chatting non stop for 3 hours? How much would that cost? I understand it's cheaper than GPT4o, but I don't understand how that translates to the actual hosting price. Or perhaps there is an easier way to get this end effect? Read more",
  "key_points": [
    "Cost of running Llama 3.3 on cloud vs local infrastructure is unknown",
    "Llama 3.3 can be set up with special instructions for custom use",
    "Estimated cost for 3 hours of non-stop chat is unknown"
  ],
  "relevance_score": 6
}
```
  Processing result 2: What Is Meta's Llama 3.3 70B? How It Works, Use Ca...
  Processing result 3: Llama models | Generative AI...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama models | Generative AI | Google Cloud Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Português – Brasil 中文 – 简体 日本語 한국어 Sign in Generative AI Contact Us Start free Home Generative AI Documentation Send feedback Llama models Stay organized with collections Save and categorize content based on your preferences. Llama models on Vertex AI offer fully managed and serverless models as APIs. To use a Llama model on Vertex AI, send a request directly to the Vertex AI API endpoint. Because Llama models use a managed API, there's no need to provision or manage infrastructure. You can stream your responses to reduce the end-user latency perception. A streamed response uses server-sent events (SSE) to incrementally stream the response. Available Llama models The following Llama models are available from Meta to use in Vertex AI. To access a Llama model, go to its Model Garden model card. Llama 3.3 Llama 3.3 is a text-only 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.3 vs. ChatGPT Pro: Key Considerations\n\nLlama 3.3 70B on Valdi GPUs\n\nThe foundation of any open-source AI deployment begins with proper hardware configuration. Modern AI models typically require at least 40GB of VRAM for basic operation, with recommended configurations including 70GB or more for optimal performance. Parallel processing capabilities become crucial for inference speed, with NVIDIA GPUs supporting CUDA being the standard choice for many deployments. Prerequisites for Llama 3.3 70B specifically are: A GPU capable of running the target model based on memory requirements; for our example we’re using an NVIDIA A40 48 GB from Valdi’s on-demand inventory Docker At least 70GB of available disk space\n\nTo get started, we’ll log into Valdi and spin-up an A40 quickly to do our deployment.\n\nExample Valdi GPU.  Launch in 30 seconds.\n\nNote: When you launch a GPU - if the node requires specific port mapping, be sure to create an external port ma

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{
  "main_content": "Members Online • Ok_Ostrich_8845 Llama 3 vs 3.1 vs 3.2 Question | Help What can you say about these 3 versions of Llama LLMs? Were they trained around the same time? Or 3.2 and 3.1 were later enhancement from 3?",
  "key_points": [
    "Comparison between Llama 3.3 and Llama 3.1",
    "Performance differences between Llama 3.3 and Llama 3.1",
    "Training time and enhancements of Llama 3.2, 3.1, and 3"
  ],
  "relevance_score": 8
}
  Processing result 2: Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct Get a detailed comparison of AI language models Meta's Llama 3.3 70B Instruct and Meta's Llama 3.1 405B Instruct, including model features, token pricing, API costs, performance benchmarks, and real-world capabilities to help you choose the right LLM for your needs.",
  "key_points": [
    "Llama 3.3 70B Instruct is a multilingual, instruction-tuned large language model optimized for dialogue use cases.",
    "Llama 3.1 405B Instruct is a model developed by Meta that supports an input context window of 128K tokens and can generate a maximum of 2,048 tokens per request.",
    "Llama 3.3 70B Instruct outperforms many open-source and closed chat models across industry benchmarks.",
    "Llama 3.1 405B Instruct is roughly 5.7x more expensive compared to Llama 3.3 70B Instruct for input and output tokens."
  ],
  "relevance_score": 9
}
```
  Processing result 3: Choosing the Best Llama Model: Llama 3 vs 3.1 vs 3...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "The Llama model series has been a fascinating journey in the world of AI development. It all started with Meta’s release of the original Llama model, which aimed to democratize access to powerful language models by making them open-source. It allowed researchers and developers to dive deeper into AI without the constraints of closed systems. Fast forward to today, and we have seen significant advancements with the introduction of Llama 3, Llama 3.1, and the latest, Llama 3.2. Each iteration has brought its own unique improvements and capabilities, enhancing the way we interact with AI. For those eager to explore the evolving landscape of AI and its practical applications, our LLM Bootcamp offers hands-on experience with the latest advancements in the field. In this blog, we will delve into a comprehensive comparison of the three iterations of the Llama model: Llama 3, Llama 3.1, and Llama 3.2. We aim to explore their features, performance, and the specific enha

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.3 just dropped — is it better than GPT-4 or Claude-Sonnet-3.5? Meta just released their newest AI model Llama 3.3. This 70-billion parameter model caught the attention of the open-source community, showing impressive performance, cost efficiency, and multilingual support while having only ~17% of Llama 3.1 405B's parameters. But is it truly better than the top models in the market? Let’s take a look at how Llama 3.3 70B Instruct compares with previous models and why it's a big deal.\n\nComparing Llama 3.3 with Llama 3.1 Faster Inference Speed Llama 3.3 70B is a high-performance replacement for Llama 3.1 70B. Independent benchmarks indicate that Llama 3.3 70B achieves an inference speed of 276 tokens per second on Groq hardware, surpassing Llama 3.1 70B by 25 tokens per second. This makes it a viable option for real-time applications where latency is critical.\n\nFewer Parameters, Similar Performance Despite its smaller size, Meta claimed that Llama 3.3 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.1 vs Llama 3.1 performance comparison. Llama 3.1 405B outperforms GPT-4 and Claude 3 Opus in most benchmarks, making it the most powerful open-source model available. However, it may not be the optimal choice for many real-world applications due to its slow generation time and high Time to First Token (TTFT). Llama 3.1 70B emerges as a more practical alternative for developers looking to integrate these models into production or self-host them. Llama 3.1 70B outperforms Llama 3 70B in most benchmarks, particularly in mathematical reasoning. Speed Trade-Off: Llama 3 70B is significantly faster, with lower latency and quicker token generation.",
  "key_points": [
    "Llama 3.1 405B outperforms GPT-4 and Claude 3 Opus in most benchmarks",
    "Llama 3.1 405B has slow generation time and high TTFT",
    "Llama 3.1 70B is a more practical alternative for production and self-hosting",
    "Llama 3.1 70B outperforms Llama 3 70B in most benchmarks",
    "Llama

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.3 is a 70-billion-parameter multilingual large language model that aims to set new standards in efficiency, accessibility, and performance. It builds on the foundation of previous models, introducing several advancements designed to meet the needs of a broader range of users, from small businesses to large enterprises. Llama 3.3 excels in several areas of performance, including better instruction following, improved reasoning, advanced math solving skills, enhanced code generation, and better tool use.",
  "key_points": [
    "Improved performance with better contextual understanding and enhanced capabilities across key benchmarks",
    "Longer context window of up to 128k tokens for handling more complex conversations and summarizing longer documents",
    "Optimized transformer architecture with Grouped-Query Attention (GQA) for improved scalability and efficiency",
    "Enhanced multilingual capabilities with improved fluency and understanding across

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Llama 3.3 is a 70-billion parameter, instruction-tuned AI model optimised for text-based tasks like coding, multilingual tasks, and instruction following. It delivers improved performance compared to Llama 3.1 70B and Llama 3.2 90B in text-based applications. Llama 3.3 features include instruction following, multilingual capabilities, improved code understanding, extended context length, cost-effective performance, synthetic data generation, and more.",
  "key_points": [
    "Llama 3.3 excels in interpreting and executing instructions, making it ideal for applications requiring natural language understanding and task completion.",
    "Llama 3.3 supports multiple languages, ensuring broad usability in diverse linguistic environments, with exceptional performance in tasks requiring multilingual reasoning.",
    "Llama 3.3 delivers accurate and efficient results for coding tasks, such as code generation and debugging.",
    "Llama 3.3 offers 405B-level performanc

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Key Features and Improvements in LLaMA 3.3: What You Need to Know - AI Resources AIÂ REsources Home Key Features and Improvements in LLaMA 3.3: What You Need to Know Popular ML Compiler Technical Primer Quantization Technical Primer Mixtral of Experts Efficient Memory Management for LLM Serving with PagedAttention RoBERTa: A Robustly Optimized BERT Pretraining Approach + View more Categories Mixture of Experts (MoE) DeepSeek-R1 Test Time Compute AMD MI300X NVIDIA H100 NVIDIA H200 NVIDIA A100 Embedding Models Offline Batch Inference Text Embedding Prometheus & Grafana Speculative Decoding Prefix Caching GGUF Models FP8 with LLMs LLM Serving Function Calling Structured JSON KV Cache AI Foundations Research Industry Agents Context Windows Models ML Systems Key Features and Improvements in LLaMA 3.3: What You Need to Know Next Models LLaMA 3.3 Explained: An Introductory Guide to Meta's Latest AI Model Models Fine-Tuning LLaMA 3.3: A Practical Guide to Customizing t

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


```
{
  "main_content": "Members Online • Ok_Ostrich_8845 Llama 3 vs 3.1 vs 3.2 Question | Help What can you say about these 3 versions of Llama LLMs? Were they trained around the same time? Or 3.2 and 3.1 were later enhancement from 3?",
  "key_points": [
    "Comparison of Llama 3.3 and Llama 3.1",
    "Training time and enhancements",
    "Discussion about Llama LLMs"
  ],
  "relevance_score": 8
}
```
  Processing result 2: Llama 3 vs Llama 3.1 : Which is Better for Your AI...
