In [8]:
from re import RegexFlag

from dotenv import load_dotenv
import os
from anthropic import Anthropic

load_dotenv()

client = Anthropic(
    api_key=os.getenv("ANTHROPIC_API_KEY")
)


def get_messages(prompt):
    response = client.messages.create(
        model="claude-3-5-sonnet-20240620",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=2048,
        temperature=1,
    )
    return response.content[0].text

In [9]:
import re

from pydantic import BaseModel
from typing import Literal


class GenericReview(BaseModel):
    review: str
    sentiment: Literal["positive", "neutral", "negative"]


def generate_review(input, sentiment) -> GenericReview:
    prompt = f"""
You are a helpful assistant that generates a review based on the input.
generate a review based on the input with a sentiment of {sentiment}.

{input}

the generated review should be enclosed in <review> tags.
"""

    response = get_messages(prompt)
    review = re.search(r"<review>(.*?)</review>", response, RegexFlag.DOTALL).group(1).strip()

    return GenericReview(
        review=review,
        sentiment=sentiment
    )

generate_review("The product is great!", "positive")

GenericReview(review="I absolutely love this product! It's truly exceptional in every way. The quality is outstanding, and it perfectly meets all my needs. I'm thoroughly impressed by how great it is and can't imagine my life without it now. If you're considering buying it, don't hesitate – you won't be disappointed. This product has exceeded all my expectations and then some. It's a game-changer that I wholeheartedly recommend to everyone!", sentiment='positive')

In [10]:
input = "The product is great!"
samples = [
    review
    for sentiment in ["positive", "neutral", "negative"]
    for _ in range(3)
    if (review := generate_review(input, sentiment)) is not None
]

samples

[GenericReview(review="I absolutely love this product! It's truly fantastic and exceeds all my expectations. The quality is outstanding, and it performs brilliantly in every aspect. I can't say enough good things about how great it is. It's a game-changer that has made a significant positive impact on my daily life. If you're considering buying it, don't hesitate – you won't be disappointed. This product is definitely worth every penny and then some. I'm thoroughly impressed and couldn't be happier with my purchase!", sentiment='positive'),
 GenericReview(review="tags:\n\n<review>\nI absolutely love this product! It's truly outstanding in every way. The quality is top-notch, and it performs exactly as advertised, if not better. From the moment I started using it, I knew I had made an excellent choice. It's user-friendly, efficient, and has exceeded all my expectations. The design is sleek and modern, making it a pleasure to use. I've already recommended it to friends and family because

In [11]:
def predict_sentiment(review: str, examples: list[GenericReview]) -> Literal["positive", "neutral", "negative"]:
    example_str = "\n\n".join([f"Review_{i+1}: {r.review}\nSentiment_{i+1}: {r.sentiment}" for i, r in enumerate(examples)])

    prompt = f"""
You are a helpful assistant that predicts the sentiment of a review.
the sentiment of the review is one of following: positive, neutral, negative.
the sentiment of the review should be enclosed in <sentiment> tags.

<example>
{example_str}
</example>

<review>
{review}
</review>

What it is the sentiment of the review?
"""

    response = get_messages(prompt)
    sentiment = re.search(r"<sentiment>(.*?)</sentiment>", response, RegexFlag.DOTALL).group(1).strip()
    return sentiment

In [12]:
%pip install sentence-transformers

Collecting sentence-transformers
  Obtaining dependency information for sentence-transformers from https://files.pythonhosted.org/packages/05/89/7eb147a37b7f31d3c815543df539d8b8d0425e93296c875cc87719d65232/sentence_transformers-3.4.1-py3-none-any.whl.metadata
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Obtaining dependency information for transformers<5.0.0,>=4.41.0 from https://files.pythonhosted.org/packages/7b/9f/92d3091c44cb19add044064af1bf1345cd35fbb84d32a3690f912800a295/transformers-4.48.1-py3-none-any.whl.metadata
  Downloading transformers-4.48.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tqdm (from sentence-transformers)
  Obtaining dependency information for tqdm from https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270

In [15]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

def get_embedding(review: str) -> list[float]:
    return model.encode(review)


sentences = [
    "I love this product!",
    "I live in a city full of bananas!",
    "The product is great!",
    "The product is terrible!",
    "Baskets are full of bananas!",
]

embedding = get_embedding(sentences)

pairwise = model.similarity(embedding, embedding)

print("유사도 매트릭스:")
print(pairwise)
print("\n유사도 벡터:")
for i in range(len(sentences)):
    for j in range(i+1, len(sentences)):
        print(f"{sentences[i]}: {sentences[j]}: {pairwise[i, j]:.4f}")


유사도 매트릭스:
tensor([[1.0000, 0.2286, 0.7937, 0.5736, 0.1795],
        [0.2286, 1.0000, 0.2366, 0.2771, 0.6944],
        [0.7937, 0.2366, 1.0000, 0.6872, 0.1704],
        [0.5736, 0.2771, 0.6872, 1.0000, 0.2120],
        [0.1795, 0.6944, 0.1704, 0.2120, 1.0000]])

유사도 벡터:
I love this product!: I live in a city full of bananas!: 0.2286
I love this product!: The product is great!: 0.7937
I love this product!: The product is terrible!: 0.5736
I love this product!: Baskets are full of bananas!: 0.1795
I live in a city full of bananas!: The product is great!: 0.2366
I live in a city full of bananas!: The product is terrible!: 0.2771
I live in a city full of bananas!: Baskets are full of bananas!: 0.6944
The product is great!: The product is terrible!: 0.6872
The product is great!: Baskets are full of bananas!: 0.1704
The product is terrible!: Baskets are full of bananas!: 0.2120


In [16]:
import math

def cosine_distance(a: list[float], b: list[float]) -> float:
    return 1 - sum(ai * bi for ai, bi in zip(a, b)) / (math.sqrt(sum(ai**2 for ai in a)) * math.sqrt(sum(bi**2 for bi in b)))

def embed_queries(queries: list[str], model: SentenceTransformer) -> list[tuple[str, list[float]]]:
    return [
        (query, model.encode(query).tolist())
        for query in queries
    ]

In [17]:
embedded_queries = embed_queries(sentences, model)

dist = cosine_distance(embedded_queries[0][1], embedded_queries[1][1])

print(sentences[0], sentences[1], f"유사도: {dist:.4f}")

I love this product! I live in a city full of bananas! 유사도: 0.7714


In [19]:
examples = [
    {"question": "What is the capital of France?", "answer": "Paris"},
    {"question": "What is the capital of Germany?", "answer": "Berlin"},
    {"question": "What is the best programming language?", "answer": "Java"},
    {"question": "What is the capital of Italy?", "answer": "Rome"},
]

query = "What is the capital of Korea?"

embeddings = embed_queries([example["question"] for example in examples] + [query], model)


In [22]:
def knn(
        embedded_queries: list[tuple[str, list[float]]],
        query_embedding: list[float],
        examples: list[dict[str, str]],
        k: int = 3,
) -> list[tuple[str, float]]:
    distances = [
        (cosine_distance(query_embedding, embed), query, examples[i]["answer"])
        for i, (query, embed) in enumerate(embedded_queries)
    ]

    return sorted(distances, key=lambda x: x[0])[:k]

knn(embedded_queries=embeddings[:-1],
    query_embedding=embeddings[-1][1],
    examples=examples)

[(0.4342349895687475, 'What is the capital of Germany?', 'Berlin'),
 (0.47199984447232124, 'What is the capital of Italy?', 'Rome'),
 (0.50319621473308, 'What is the capital of France?', 'Paris')]

In [25]:
def format_examples(examples: list[tuple[str, float]]) -> str:
    return "\n\n".join([f"Question_{i+1}: {e[1]}\nAnswer_{i+1}: {e[2]}" for i, e in enumerate(examples)])

format_examples(knn(embedded_queries=embeddings[:-1], query_embedding=embeddings[-1][1], examples=examples))

'Question_1: What is the capital of Germany?\nAnswer_1: Berlin\n\nQuestion_2: What is the capital of Italy?\nAnswer_2: Rome\n\nQuestion_3: What is the capital of France?\nAnswer_3: Paris'

In [27]:
def knn_response(question: str, examples: list[dict[str, str]]) -> str:
    embeddings = embed_queries([e["question"] for e in examples] + [question], model)
    k_closest = knn(embedded_queries=embeddings[:-1], query_embedding=embeddings[-1][1], examples=examples)
    example_str = format_examples(k_closest)

    prompt = f"""
    You are a helpful assistant that generates a response based on the question and examples.
    Below are examples of questions and answers within the examples tag.
    Using the examples as a reference, generate a response based on the question.
    Please answer the question contained in the question tag.

    <examples>
    {example_str}
    </examples>

    <question>
    {question}
    </question>
    """

    response = get_messages(prompt)
    return response

knn_response("What is the popular programming language in 2024", examples)

"Based on the examples provided and the current question, I'll generate a response in a similar format:\n\nPython\n\nWhile the examples don't directly address the popularity of programming languages in 2024, I've provided a concise answer that follows the pattern of the given examples. Python is widely considered one of the most popular programming languages in recent years and is likely to maintain its popularity in 2024 due to its versatility, ease of use, and strong presence in fields like data science, machine learning, and web development."