# Initialization

In [None]:
import logging
import boto3
import json
import pandas as pd
import subprocess
import nltk
import lancedb
from IPython.display import display, HTML
from ipywidgets import IntProgress
from amzn_personal_playground import (
    Model,
    cosine_similarity,
    rouge_score,
)
from datasets import load_dataset

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.output_parsers.regex import RegexParser
from langchain_aws import ChatBedrock
from langchain_community.llms import Bedrock
from langchain.prompts import PromptTemplate
from langchain.embeddings import BedrockEmbeddings


# Show INFO-level (and above) log records for the rest of the notebook.
logging.basicConfig(level=logging.INFO)

# Plain AWS SDK client for Bedrock, used by the cells that call the runtime API
# directly rather than through LangChain. Credentials come from the local
# "bedrock" profile.
session = boto3.Session(profile_name="bedrock")
bedrock_runtime = session.client(
    service_name="bedrock-runtime",
    region_name="us-west-2",
)
# For the Titan model, uncomment the client below and use it instead.
# You must be in VPN or Amazon's CORP network to run this line.
# bedrock_runtime_private = session.client("bedrock-runtime", region_name="us-west-2", endpoint_url="https://prod.us-west-2.dataplane.bedrock.aws.dev")

# Download the NLTK resources used later in the notebook.
for nltk_resource in ("punkt", "stopwords"):
    nltk.download(nltk_resource)


# Personal playground

Demonstrating basic interaction with Bedrock models, using either AWS SDK directly or LangChain. For each option, we are testing three different scenarios: prompting without a context, prompting with a static context, and prompting with context pulled from a vector store. 

In [None]:
# The question every playground scenario below asks the model.
question = "What architectures does Lambda support?"

# Hand-written reference passage used as the static context for the
# "prompt with context" scenarios.
context = """The instruction set architecture of a Lambda function determines the type of computer
processor that Lambda uses to run the function. Lambda provides a choice of instruction set
architectures:
    arm64 – 64-bit ARM architecture, for the AWS Graviton2 processor.
    x86_64 – 64-bit x86 architecture, for x86-based processors.
"""


## Using AWS SDK directly

### Prompt without a context

Without a helpful context and instructions, there's a good chance that the answers from the LLM will be inaccurate.

In [None]:
# Bare prompt: the question is sent on its own, with no supporting context.
prompt = """Human:

{question}

A:

"""

# Serialize the request body first so the invoke_model call stays readable.
request_body = json.dumps({
    "prompt": prompt.format(question=question),
    "max_tokens_to_sample": 4096,
    "temperature": 0.0,
    "top_k": 10,
    "top_p": 1.0,
})
response = bedrock_runtime.invoke_model(
    body=request_body,
    modelId=Model.ANTHROPIC_CLAUDE_INSTANT.value,
)

# The response body is a stream holding a JSON document; the generated text
# is read from its "completion" field.
completion_payload = json.loads(response.get("body").read())
print(completion_payload["completion"])


### Prompt with proper context and prompt engineering

With a relevant context included in the prompt, the LLM should be able to provide an accurate answer.

In [None]:
# Prompt that assigns the model a role, supplies the static context and the
# question in dedicated tags, and asks for the answer inside an <answer> tag.
prompt = """Human:

You act as an AWS Cloud Practitioner and only answer questions about AWS. Read the user’s question
supplied within the <question> tags. Then, use the contextual information provided above within the
<context> tags to provide an answer in <answer> tag. Do not repeat the context. Respond that you
don't know if you don't have enough information  to answer.

<context>
{context}
</context>

<question>
{question}
</question>

A:


"""

# Fill in the template, then serialize the full request body.
filled_prompt = prompt.format(question=question, context=context)
request_body = json.dumps({
    "prompt": filled_prompt,
    "max_tokens_to_sample": 4096,
    "temperature": 0.0,
    "top_k": 10,
    "top_p": 1.0,
})
response = bedrock_runtime.invoke_model(
    body=request_body,
    modelId=Model.ANTHROPIC_CLAUDE_INSTANT.value,
)

# The generated text is in the "completion" field of the JSON response body.
completion_payload = json.loads(response.get("body").read())
print(completion_payload["completion"])


### Prompt with Retrieval-augmented generation (RAG)

RAG allows pulling contexts (that are relevant to the provided question) from a vector store. A typical RAG application has two main components:

1. Indexing
  * Load: First we need to load our data. We'll use DocumentLoaders for this.
  * Split: Text splitters break large Documents into smaller chunks. This is useful both for indexing data and for passing it in to a model, since large chunks are harder to search over and won't fit in a model's finite context window.
  * Store: We need somewhere to store and index our splits, so that they can later be searched over. This is often done using a VectorStore and Embeddings model.

2. Querying
  * Retrieve context: Given a user input, relevant splits are retrieved from storage using a Retriever.  
  * Generate from context: A ChatModel / LLM produces an answer using a prompt that includes the question and the retrieved data

In [None]:
# End-to-end RAG using the AWS SDK directly: index a local text file into a
# LanceDB table, retrieve the chunks closest to the question, and pass them to
# Claude as the context of the prompt.
prompt = """Human:
You act as an AWS Cloud Practitioner and only answer questions about AWS. Read the user’s question
supplied within the <question> tags. Then, use the contextual information provided above within the
<context> tags to provide an answer in <answer> tag. Do not repeat the context. Respond that you
don't know if you don't have enough information  to answer.

<context>
{context}
</context>

<question>
{question}
</question>

A:


"""

# Indexing - load: the example document lives under the package root, which is
# resolved by asking the `brazil-context` CLI.
package_root = subprocess.run(["brazil-context", "package", "root"], capture_output=True).stdout.decode().strip()
loader = TextLoader(f"{package_root}/example_data/lambda_deploy_to_multiple_architectures.txt")
data = loader.load()

# Indexing - split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)

# Indexing - store: converting texts to embeddings
response = bedrock_runtime.invoke_model(
    body=json.dumps({
        "texts": [split.page_content for split in all_splits],
        # input type of "search_document" is used when we want to generate embeddings
        # to store in a vector database for search use-cases.
        "input_type": "search_document",
    }),
    modelId=Model.COHERE_EMBED_ENGLISH.value,
)
embeddings = json.loads(response.get("body").read())["embeddings"]

# Indexing - store: storing embeddings
# Each row keeps the chunk's position in `all_splits` as its "id" so the
# original chunk can be looked up again after the similarity search.
vector_db = lancedb.connect("/tmp/lancedb")
data = [
    {
        "vector": embeddings[split_id],
        "text": split.page_content,
        "id": split_id,
    }
    for split_id, split in enumerate(all_splits)
]
vector_table = vector_db.create_table("rag-data", data=data, mode="overwrite")

# Querying - retrieve context
response = bedrock_runtime.invoke_model(
    body=json.dumps({
        "texts": [question],
        # input type of "search_query" is used for the text we search WITH:
        # the query must be embedded as a query, not as a stored document,
        # to be matched against the "search_document" embeddings above.
        "input_type": "search_query",
    }),
    modelId=Model.COHERE_EMBED_ENGLISH.value,
)
question_embedding = json.loads(response.get("body").read())["embeddings"][0]
relevant_chunks = vector_table.search(question_embedding)
relevant_chunk_ids = [chunk["id"] for chunk in relevant_chunks.to_list()]

# Querying - generate content
# The retrieved chunks are stitched together, one per line, to form the context.
context = "\n".join([
    all_splits[int(split_id)].page_content
    for split_id in relevant_chunk_ids
])


response = bedrock_runtime.invoke_model(
    body=json.dumps({
        "prompt": prompt.format(question=question, context=context),
        "max_tokens_to_sample": 4096,
        "temperature": 0.0,
        "top_k": 10,
        "top_p": 1.0,
    }),
    modelId=Model.ANTHROPIC_CLAUDE_INSTANT.value,
)
print(json.loads(response.get("body").read())["completion"])


## Using LangChain

### Setting up the chain

In [None]:

# Connection settings shared by both Bedrock-backed LangChain objects below.
bedrock_connection = {
    "credentials_profile_name": "bedrock",
    "region_name": "us-west-2",
}

# Text-completion LLM (Claude Instant) with LangChain-level caching turned off.
claude_generation_settings = {
    "max_tokens_to_sample": 500,
    "temperature": 0.1,
    "top_k": 10,
    "top_p": 1.0,
    "stop_sequences": ["\n\nHuman:"]
}
llm_claude = Bedrock(
    model_id=Model.ANTHROPIC_CLAUDE_INSTANT.value,
    model_kwargs=claude_generation_settings,
    cache=False,
    **bedrock_connection,
)

# Embedding model (Cohere Embed English) used for the RAG scenario.
llm_cohere_embed = BedrockEmbeddings(
    model_id=Model.COHERE_EMBED_ENGLISH.value,
    **bedrock_connection,
)

# Extracts whatever the model wrote between <answer> and </answer>;
# (?s) lets the match span multiple lines.
parser = RegexParser(regex=r"(?s)<answer>(.*)</answer>", output_keys=["answer"])

answer_template = """
You act as an AWS Cloud Practitioner and only answer questions about AWS. Read the user’s question
supplied within the <question> tags. Then, use the contextual information provided above within the
<context> tags to provide an answer in <answer> tag. Do not repeat the context. Respond that you
don't know if you don't have enough information to answer.

<context>
{context}
</context>

<question>
{question}
</question>
"""
prompt = PromptTemplate(
    template=answer_template,
    input_variables=["question", "context"],
)

# Question-answering chain: fill the template, call Claude, parse the <answer> tag.
chain_claude = prompt | llm_claude | parser
# The embedding "chain" is just the embeddings object itself.
chain_cohere_embed = llm_cohere_embed


### Prompt without context

In [None]:

# Ask the question with an empty context and print the parsed answer.
no_context_result = chain_claude.invoke({"question": question, "context": ""})
print(no_context_result["answer"])


### Prompt with context


In [None]:
# Reset the static reference passage; this is the same text as the context
# defined at the top of the playground section.
context = """The instruction set architecture of a Lambda function determines the type of computer
processor that Lambda uses to run the function. Lambda provides a choice of instruction set
architectures:
    arm64 – 64-bit ARM architecture, for the AWS Graviton2 processor.
    x86_64 – 64-bit x86 architecture, for x86-based processors.
"""

# Ask the question with the static context and print the parsed answer.
static_context_result = chain_claude.invoke({"question": question, "context": context})
print(static_context_result["answer"])


### Prompt with Retrieval-augmented generation (RAG)

A typical RAG application has two main components:

1. Indexing
  * Load: First we need to load our data. We'll use DocumentLoaders for this.
  * Split: Text splitters break large Documents into smaller chunks. This is useful both for indexing data and for passing it in to a model, since large chunks are harder to search over and won't fit in a model's finite context window.
  * Store: We need somewhere to store and index our splits, so that they can later be searched over. This is often done using a VectorStore and Embeddings model.

2. Retrieval and generation
  * Retrieve: Given a user input, relevant splits are retrieved from storage using a Retriever.  
  * Generate: A ChatModel / LLM produces an answer using a prompt that includes the question and the retrieved data

In [None]:
# Indexing - load: resolve the package root via the `brazil-context` CLI and
# read the example document from it.
root_lookup = subprocess.run(["brazil-context", "package", "root"], capture_output=True)
package_root = root_lookup.stdout.decode().strip()
loader = TextLoader(f"{package_root}/example_data/lambda_deploy_to_multiple_architectures.txt")
data = loader.load()

# Indexing - split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)

# Indexing - store: converting texts to embeddings
chunk_texts = [split.page_content for split in all_splits]
embeddings = chain_cohere_embed.embed_documents(chunk_texts)

# Indexing - store: storing embeddings
# The "id" of each row is the chunk's position in `all_splits`.
vector_db = lancedb.connect("/tmp/lancedb")
data = [
    {
        "vector": embeddings[split_id],
        "text": split.page_content,
        "id": split_id,
    }
    for split_id, split in enumerate(all_splits)
]
vector_table = vector_db.create_table("rag-data", data=data, mode="overwrite")

# Querying - retrieve context
question_embedding = chain_cohere_embed.embed_query(question)
relevant_chunks = vector_table.search(question_embedding)
relevant_chunk_ids = [chunk["id"] for chunk in relevant_chunks.to_list()]

# Querying - generate content
# Join the retrieved chunks, one per line, and use them as the prompt context.
retrieved_texts = [
    all_splits[int(split_id)].page_content
    for split_id in relevant_chunk_ids
]
context = "\n".join(retrieved_texts)
rag_result = chain_claude.invoke({"question": question, "context": context})
print(rag_result["answer"])


# Evaluation

Evaluating Claude 3 Haiku, Jurassic Mid, and Titan Express models on their ability to summarize texts. The dataset being used is 
[cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail), but we should be able to swap it out for 
a custom dataset -- just load your own data into the `articles` and `ground_truths` lists. 

For each model, we compare its responses with the ground truth to compute ROUGE-L score and cosine 
similarity metrics. We use Claude-as-a-judge to assess the accuracy, coherence, factuality, and 
completeness of each summary.

## Preparing evaluation datasets

In [None]:
from itertools import islice

# Number of articles to summarize and score.
evaluation_dataset_size = 10

dataset = load_dataset("cnn_dailymail", "3.0.0")

# Take only the first `evaluation_dataset_size` training examples. islice stops
# iterating as soon as enough rows have been read, instead of first copying the
# entire train split into a Python list and then slicing it.
evaluation_dataset = list(islice(dataset["train"], evaluation_dataset_size))

# Parallel lists: articles[i] is the text to summarize and ground_truths[i] is
# its reference summary (the dataset's "highlights" field).
articles = [entry["article"] for entry in evaluation_dataset]
ground_truths = [entry["highlights"] for entry in evaluation_dataset]


## Setting up prompts and chains

In [None]:
# Summarization prompt shared by all three models under evaluation.
prompt = PromptTemplate.from_template("""
Summarize the content that is wrapped in <article> tag. Output the summary within a <summary> tag.

<article>
{text}
</article>
"""
)

# Prompt for the judge model. The four output tags must be well-formed and
# match the tag names that `evaluation_parser` below extracts.
judge_prompt = PromptTemplate.from_template("""
You will be given a summary of a news article. Your task is to evaluate the summary in four dimensions:
accuracy, coherence, factuality, and completeness. Provide a score of 1-5 in each dimension, with 5
being the best score. If the summary does not contain any useful summarization, give it a score of 0.

Original article: {article}

Summary: {summarization}

Write the scores to the corresponding tags as below:

<coherence></coherence>
<accuracy></accuracy>
<factuality></factuality>
<completeness></completeness>
""")

# Claude 3 uses Messages API instead of Text Completion API, so it requires a different chain
# (ChatBedrock instead of Bedrock)
llm_claude = ChatBedrock(
    credentials_profile_name="bedrock",
    model_id=Model.ANTHROPIC_CLAUDE_3_HAIKU.value,
    region_name="us-west-2",
    model_kwargs={
        "max_tokens": 4096,
        "temperature": 0.1,
        "top_k": 10,
        "top_p": 1.0,
    },
    cache=False
)

llm_jurassic = Bedrock(
    credentials_profile_name="bedrock",
    model_id=Model.AI21_JURASSIC_MID.value,
    region_name="us-west-2",
    model_kwargs={
        "maxTokens": 4096,
        "temperature": 0.1,
        "countPenalty": {"scale": 0},
        "presencePenalty": {"scale": 0},
        "frequencyPenalty": {"scale": 0},
    },
    cache=False
)

llm_titan = Bedrock(
    credentials_profile_name="bedrock",
    model_id=Model.AMAZON_TITAN_TEXT_EXPRESS.value,
    region_name="us-west-2",
    model_kwargs={
        "maxTokenCount": 4096,
        "temperature": 0.1,
    },
    cache=False
)


# Only the Claude chain extracts the text inside <summary>...</summary>;
# the Jurassic and Titan chains return the raw model output.
parser = RegexParser(regex=r"(?s)<summary>(.*)</summary>", output_keys=["summary"])
chain_claude = prompt | llm_claude | parser
chain_jurassic = prompt | llm_jurassic

chain_titan = prompt | llm_titan

# Pulls the four scores out of the judge's reply, in the order the tags are
# listed in `judge_prompt`.
evaluation_parser = RegexParser(
    regex=r"(?s)<coherence>(.*?)</coherence>.*<accuracy>(.*?)</accuracy>.*<factuality>(.*?)</factuality>.*<completeness>(.*?)</completeness>",
    output_keys=["coherence", "accuracy", "factuality", "completeness"],
)
chain_judge_claude = judge_prompt | llm_claude | evaluation_parser


## Inferencing (querying models)

In [None]:
# One row per article: [article, ground_truth, claude, jurassic, titan].
# Rows are lists (not tuples) because the scoring cell adds more fields to them.
results = []

progress = IntProgress(min=0, max=len(articles))
progress.value = 0
display(progress)

for position, article in enumerate(articles):
    # Generate summaries. Only the Claude chain ends in a parser, so only its
    # output needs the "summary" key lookup.
    summaries = [
        chain_claude.invoke({"text": article})["summary"],
        chain_jurassic.invoke({"text": article}),
        chain_titan.invoke({"text": article}),
    ]

    results.append([article, ground_truths[position], *summaries])

    progress.value += 1


## Scoring

In [None]:
# Each row from the inferencing cell starts with these five fields:
# article, ground_truth, claude_summary, jurassic_summary, titan_summary.
base_field_count = 5

# Order in which the judge's scores are stored per model; it must match the
# column names assigned to the DataFrame below.
judge_dimensions = ('accuracy', 'coherence', 'factuality', 'completeness')

# The loop walks `results`, so the progress bar is sized from it.
progress = IntProgress(min=0, max=len(results))
progress.value = 0
display(progress)

for result in results:
    # Slice before unpacking so the cell can be re-run: after a first run every
    # row already carries the score fields appended below.
    article, ground_truth, claude_summary, jurassic_summary, titan_summary = result[:base_field_count]

    # Per model, in order: the four judge scores, cosine similarity, ROUGE-L.
    row_scores = []
    for summary in (claude_summary, jurassic_summary, titan_summary):
        judge_scores = chain_judge_claude.invoke({"article": article, "summarization": summary})
        row_scores.extend(int(judge_scores[dimension]) for dimension in judge_dimensions)
        row_scores.append(cosine_similarity(ground_truth, summary))
        row_scores.append(rouge_score("rougeL", ground_truth, summary))

    # Replace (rather than append to) any scores left over from an earlier run,
    # so every row always ends up with exactly one set of score fields.
    result[base_field_count:] = row_scores

    progress.value += 1


df = pd.DataFrame(results)
df.columns = [
    'article',
    'ground_truth',
    'claude_summary',
    'jurassic_summary',
    'titan_summary',

    'claude_accuracy',
    'claude_coherence',
    'claude_factuality',
    'claude_completeness',
    'claude_cosine_similarity',
    'claude_rouge_l',

    'jurassic_accuracy',
    'jurassic_coherence',
    'jurassic_factuality',
    'jurassic_completeness',
    'jurassic_cosine_similarity',
    'jurassic_rouge_l',

    'titan_accuracy',
    'titan_coherence',
    'titan_factuality',
    'titan_completeness',
    'titan_cosine_similarity',
    'titan_rouge_l',
]


### Use Flock for scoring

Read https://builderhub.corp.amazon.com/docs/gen-ai/golden-path-genai-model-evaluation.html#use-flock-for-evaluation-task-locally-in-your-development-environment for more details.

In [None]:

from flock_eval.evaluation import GroundTruthTextBasedEvaluator
from flock_eval.logging import logger
from flock_eval.testsuite import TextTestCase
from flock_eval.similarity.config import BedrockMetricConfig, MetricConfig, MetricDefinitions

# See evaluation metrics definitions in https://code.amazon.com/packages/FlockEval/blobs/mainline/--/doc/evaluation.md
# Bedrock models backing the LLM-based and embedding-based Flock metrics.
judge_model_id = Model.ANTHROPIC_CLAUDE_3_HAIKU.value
embedding_model_id = Model.AMAZON_TITAN_EMBED_TEXT.value

# See evaluation metrics definitions in https://code.amazon.com/packages/FlockEval/blobs/mainline/--/doc/evaluation.md
metrics = MetricDefinitions(
    # Flock uses the 'rougeLsum' metric: a version of ROUGE-L specialized for
    # summaries. It evaluates at sentence level rather than over the whole
    # document, and treats capturing the main ideas in condensed form as more
    # important than preserving the structure of the original text.
    rouge=MetricConfig(enabled=True),

    # Similarity between the two texts, computed by invoking an LLM.
    llm=BedrockMetricConfig(enabled=True, bedrock_model_id=judge_model_id),

    # Lets the LLM say whether the system response is better or worse than the
    # provided ground truth response, in terms of correctness.
    llm_correctness=BedrockMetricConfig(enabled=True, bedrock_model_id=judge_model_id),

    # Exact-match check only; not suitable for our use case, so disabled.
    binary=MetricConfig(enabled=False),

    # Typically used for machine translation evaluation. Accounts for
    # rephrasing via traditional NLP techniques such as stemming, and penalizes
    # word-order differences between candidate and reference.
    # Disabled: Flock does not support latest NLTK METEOR metric.
    # Tracking in https://t.corp.amazon.com/P139792093.
    meteor=MetricConfig(enabled=False),

    # Semantic similarity (cosine similarity between embedding vectors). May be
    # better suited to sentences; a longer text may be truncated.
    semantic=BedrockMetricConfig(enabled=True, bedrock_model_id=embedding_model_id),
)

evaluator = GroundTruthTextBasedEvaluator(metrics)

# Update the evaluator to use bedrock runtime client created with 'bedrock' credential profile.
# Each entry is (metric name, attribute on that metric holding the model object).
rating_suite = evaluator.rating_suite(metrics)
for metric_name, model_attribute in (
    ('llm', 'llm'),
    ('llm_correctness', 'llm'),
    ('semantic', 'embedding_model'),
):
    getattr(rating_suite._enabled_metrics[metric_name], model_attribute).client = bedrock_runtime

def evaluate_with_flock(inference_df, prediction_column_name):
    """Score one model's predictions with Flock and return the metrics as a DataFrame.

    Uses the module-level ``evaluator`` and ``rating_suite``.

    Args:
        inference_df: DataFrame containing 'article' and 'ground_truth' columns
            plus the column named by ``prediction_column_name``.
        prediction_column_name: Name of the column holding the outputs to score.

    Returns:
        A DataFrame built from the ``metrics`` of every evaluated test case,
        with each column name prefixed by ``'<prediction_column_name>_'``.
    """
    # Column names expected by TextTestCase.from_json.
    flock_field_names = {
        'article': 'input',
        'ground_truth': 'ground_truth_solution',
        prediction_column_name: 'system_solution',
    }
    selected = inference_df[['article', 'ground_truth', prediction_column_name]]
    row_dicts = selected.rename(columns=flock_field_names).apply(pd.Series.to_dict, axis='columns')
    test_cases = row_dicts.apply(TextTestCase.from_json)

    per_case_metrics = []
    for case_number, test_case in enumerate(test_cases):
        # Bind the positional index of the test case to the logger.
        outcome = evaluator.evaluate_test_case(
            rating_suite,
            test_case,
            logger.bind(test_case_id=case_number),
        )
        per_case_metrics.append(outcome.metrics)

    metrics_df = pd.DataFrame.from_records(per_case_metrics)
    return metrics_df.add_prefix(f'{prediction_column_name}_', axis='columns')
    
# Run the Flock evaluation once per model, in a fixed order, then append the
# resulting metric columns to the main results frame.
flock_claude_result_df, flock_jurassic_result_df, flock_titan_result_df = [
    evaluate_with_flock(df, summary_column)
    for summary_column in ('claude_summary', 'jurassic_summary', 'titan_summary')
]
df = pd.concat(
    [df, flock_claude_result_df, flock_jurassic_result_df, flock_titan_result_df],
    axis='columns',
)

## Results

### Results - Cosine similarity evaluation

In [None]:
# Per-article cosine similarity for each model, as a left-aligned HTML table.
# Add "article" / "claude_summary" / "jurassic_summary" to the list to inspect
# the texts next to the scores.
cosine_similarity_html = df[
    ["claude_cosine_similarity", "jurassic_cosine_similarity", "titan_cosine_similarity"]
].to_html(justify="left")
cosine_similarity_html = cosine_similarity_html.replace("\\n", "")
cosine_similarity_html = cosine_similarity_html.replace("<td>", '<td style="text-align:left">')
display(HTML(cosine_similarity_html))


In [None]:
# Summary statistics of the cosine-similarity scores, per model.
cosine_similarity_columns = ["claude_cosine_similarity", "jurassic_cosine_similarity", "titan_cosine_similarity"]
df[cosine_similarity_columns].describe()


### Results - RougeL evaluation

In [None]:
# Per-article ROUGE-L score for each model, as a left-aligned HTML table.
# Add "article" / "claude_summary" / "jurassic_summary" to the list to inspect
# the texts next to the scores.
rouge_l_html = df[
    ["claude_rouge_l", "jurassic_rouge_l", "titan_rouge_l"]
].to_html(justify="left")
rouge_l_html = rouge_l_html.replace("\\n", "")
rouge_l_html = rouge_l_html.replace("<td>", '<td style="text-align:left">')
display(HTML(rouge_l_html))


In [None]:
# Summary statistics of the ROUGE-L scores, per model.
rouge_l_columns = ["claude_rouge_l", "jurassic_rouge_l", "titan_rouge_l"]
df[rouge_l_columns].describe()


### Results - LLM-as-a-judge - accuracy evaluation

In [None]:
# Per-article judge "accuracy" score for each model, as a left-aligned HTML table.
# Add "article" / "claude_summary" / "jurassic_summary" to the list to inspect
# the texts next to the scores.
accuracy_html = df[
    ["claude_accuracy", "jurassic_accuracy", "titan_accuracy"]
].to_html(justify="left")
accuracy_html = accuracy_html.replace("\\n", "")
accuracy_html = accuracy_html.replace("<td>", '<td style="text-align:left">')
display(HTML(accuracy_html))


In [None]:
# Summary statistics of the judge "accuracy" scores, per model.
accuracy_columns = ["claude_accuracy", "jurassic_accuracy", "titan_accuracy"]
df[accuracy_columns].describe()


### Results - LLM-as-a-judge - coherence evaluation

In [None]:
# Per-article judge "coherence" score for each model, as a left-aligned HTML table.
# Add "article" / "claude_summary" / "jurassic_summary" to the list to inspect
# the texts next to the scores.
coherence_html = df[
    ["claude_coherence", "jurassic_coherence", "titan_coherence"]
].to_html(justify="left")
coherence_html = coherence_html.replace("\\n", "")
coherence_html = coherence_html.replace("<td>", '<td style="text-align:left">')
display(HTML(coherence_html))


In [None]:
# Summary statistics of the judge "coherence" scores, per model.
coherence_columns = ["claude_coherence", "jurassic_coherence", "titan_coherence"]
df[coherence_columns].describe()


### Results - LLM-as-a-judge - factuality evaluation

In [None]:
# Per-article judge "factuality" score for each model, as a left-aligned HTML table.
# Add "article" / "claude_summary" / "jurassic_summary" to the list to inspect
# the texts next to the scores.
factuality_html = df[
    ["claude_factuality", "jurassic_factuality", "titan_factuality"]
].to_html(justify="left")
factuality_html = factuality_html.replace("\\n", "")
factuality_html = factuality_html.replace("<td>", '<td style="text-align:left">')
display(HTML(factuality_html))


In [None]:
# Summary statistics of the judge "factuality" scores, per model.
factuality_columns = ["claude_factuality", "jurassic_factuality", "titan_factuality"]
df[factuality_columns].describe()


### Results - LLM-as-a-judge - completeness evaluation

In [None]:
# Per-article judge "completeness" score for each model, as an HTML table.
# The two replace() calls give this table the same rendering as every other
# score table in this section (escaped "\n" sequences removed, cells
# left-aligned); they were missing from this cell only.
display(
    HTML(
        df[[
            # "article", "claude_summary", "jurassic_summary",
            "claude_completeness", "jurassic_completeness", "titan_completeness"
        ]].to_html(justify="left")
        .replace("\\n", "")
        .replace("<td>", '<td style="text-align:left">')
    )
)


In [None]:
# Summary statistics of the judge "completeness" scores, per model.
completeness_columns = ["claude_completeness", "jurassic_completeness", "titan_completeness"]
df[completeness_columns].describe()

## Flock evaluation results

### Flock - semantic similarity evaluation

In [None]:
# Per-article Flock "semantic" metric for each model, as a left-aligned HTML table.
flock_semantic_html = df[
    ["claude_summary_semantic", "jurassic_summary_semantic", "titan_summary_semantic"]
].to_html(justify="left")
flock_semantic_html = flock_semantic_html.replace("\\n", "")
flock_semantic_html = flock_semantic_html.replace("<td>", '<td style="text-align:left">')
display(HTML(flock_semantic_html))

In [None]:
# Summary statistics of the Flock "semantic" metric, per model.
flock_semantic_columns = ["claude_summary_semantic", "jurassic_summary_semantic", "titan_summary_semantic"]
df[flock_semantic_columns].describe()

### Flock - RougeL-Sum evaluation

In [None]:
# Per-article Flock "rouge" metric for each model, as a left-aligned HTML table.
flock_rouge_html = df[
    ["claude_summary_rouge", "jurassic_summary_rouge", "titan_summary_rouge"]
].to_html(justify="left")
flock_rouge_html = flock_rouge_html.replace("\\n", "")
flock_rouge_html = flock_rouge_html.replace("<td>", '<td style="text-align:left">')
display(HTML(flock_rouge_html))

In [None]:
# Summary statistics of the Flock "rouge" metric, per model.
flock_rouge_columns = ["claude_summary_rouge", "jurassic_summary_rouge", "titan_summary_rouge"]
df[flock_rouge_columns].describe()

### Flock - LLM-as-a-judge correctness evaluation

In [None]:
# Per-article Flock "llm_correctness" metric for each model, as a left-aligned HTML table.
flock_correctness_html = df[
    ["claude_summary_llm_correctness", "jurassic_summary_llm_correctness", "titan_summary_llm_correctness"]
].to_html(justify="left")
flock_correctness_html = flock_correctness_html.replace("\\n", "")
flock_correctness_html = flock_correctness_html.replace("<td>", '<td style="text-align:left">')
display(HTML(flock_correctness_html))

In [None]:
# Summary statistics of the Flock "llm_correctness" metric, per model.
flock_correctness_columns = [
    "claude_summary_llm_correctness",
    "jurassic_summary_llm_correctness",
    "titan_summary_llm_correctness",
]
df[flock_correctness_columns].describe()


### Flock - LLM-as-a-judge similarity evaluation

In [None]:
# Per-article Flock "llm" similarity metric for each model, as a left-aligned HTML table.
flock_llm_html = df[
    ["claude_summary_llm", "jurassic_summary_llm", "titan_summary_llm"]
].to_html(justify="left")
flock_llm_html = flock_llm_html.replace("\\n", "")
flock_llm_html = flock_llm_html.replace("<td>", '<td style="text-align:left">')
display(HTML(flock_llm_html))

In [None]:
# Summary statistics of the Flock "llm" similarity metric, per model.
flock_llm_columns = ["claude_summary_llm", "jurassic_summary_llm", "titan_summary_llm"]
df[flock_llm_columns].describe()

### Model outputs

In [None]:
# Show each source article next to the three generated summaries.
model_output_columns = ["article", "claude_summary", "jurassic_summary", "titan_summary"]
model_outputs_html = df[model_output_columns].to_html(justify="left")
display(HTML(model_outputs_html))
