In [2]:
import openai
import os

In [3]:
from abc import ABC, abstractmethod

class EmbeddingModel(ABC):
    @abstractmethod
    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Generate embedding for the given texts."""
        pass

In [4]:
class OpenAIEmbeddingModel(EmbeddingModel):
    def __init__(self, model_name: openai.types.EmbeddingModel, api_key: str | None = os.getenv("OPENAI_API_KEY")) -> None:
        self.model_name = model_name
        self.api_key = api_key 
        self.client = openai.OpenAI(api_key=self.api_key)

    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        if not texts:
            return []
        response = self.client.embeddings.create(
            model=self.model_name,
            input=texts
        )
        embeddings = [data.embedding for data in response.data]
        return embeddings

In [5]:
def cosine_similarity(vec1: list[float], vec2: list[float]) -> float:
    """Compute the cosine similarity between two vectors."""
    dot_product = sum(a * b for a, b in zip(vec1, vec2))
    magnitude1 = sum(a * a for a in vec1) ** 0.5
    magnitude2 = sum(b * b for b in vec2) ** 0.5
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0
    return dot_product / (magnitude1 * magnitude2)

In [None]:
# Test negation handling
embedding_model = OpenAIEmbeddingModel(model_name="text-embedding-3-small")

sentences = [
    "Techvify is created in 2019.",
    "Techvify is not created in 2019.",
    "Techvify is created in 2025."
]

embedding = embedding_model.embed_texts(sentences)

A = embedding[0]
A_neg = embedding[1]
B = embedding[2]

sim_A_B = cosine_similarity(A, B)
sim_Aneg_B = cosine_similarity(A_neg, B)

print(f"Cosine similarity between A and B: {sim_A_B}")
print(f"Cosine similarity between A_neg and B: {sim_Aneg_B}")

if sim_A_B > sim_Aneg_B:
    print("The embedding model correctly captures negation.")
else:
    print("The embedding model fails to capture negation.")

In [None]:
import timeit

def is_opposite(text_a: str, text_b: str) -> bool:
    """Determine if two sentences are opposites using OpenAI's chat completion."""
    import pydantic

    class Classification(pydantic.BaseModel):
        is_opposite: bool
        is_irrelevant: bool

    client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    completion = client.chat.completions.parse(
        model="gpt-4.1-nano",
        messages=[
            {"role": "user", "content": f"Are these two text opposites in meaning?\n1. {text_a}\n2. {text_b}"}
        ],
        response_format=Classification,
        max_tokens=100,
        temperature=0.0
    )

    response = completion.choices[0].message.parsed
    usage = completion.usage
    print(f"Token usage: {usage}")
    if not isinstance(response, Classification):
        raise ValueError("Unexpected response format from OpenAI API.")
    return response.is_opposite

# Example usage of is_opposite function
text_a = "Techvify duoc thanh lap vao nam 2019."
text_b = "Techvify khong duoc thanh lap vao nam 2019."
start_time = timeit.default_timer()
result = is_opposite(text_a, text_b)
end_time = timeit.default_timer()
print(f"Are the sentences '{text_a}' and '{text_b}' opposites? {result}")
print(f"Time taken: {end_time - start_time} seconds")

Token usage: CompletionUsage(completion_tokens=14, prompt_tokens=105, total_tokens=119, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0))
Are the sentences 'Techvify duoc thanh lap vao nam 2019.' and 'Techvify khong duoc thanh lap vao nam 2019.' opposites? True
Time taken: 5.789373511999656 seconds


In [7]:
# Test numerical difference
import re

NumberLike = int | float

def extract_numbers(text: str) -> list[NumberLike] | None:
    """Extract numbers from a given text."""
    number_strings = re.findall(r'\d+\.?\d*', text)
    if not number_strings:
        return None
    numbers = [float(num) if '.' in num else int(num) for num in number_strings]
    return numbers

sentences_num = [
    "Techvify is a software company founded in 2019. It has 200 employees and a revenue of 10 million USD.",
    "Techvify is a software company founded in 2021. It has 250 employees and a revenue of 12 million USD.",
    "Techvify is an auditing company founded in 2019. It has 200 employees and a revenue of 10 million USD.",
    "Techvify is an auditing company founded in 2030. It has 500 employees and a revenue of 50 million USD.",
    # Rephrased version to test robustness
    "In 2019, Techvify was established as a software firm with a workforce of 200 and generated revenues amounting to 10 million USD.",
    "In 2021, Techvify was established as a software firm with a workforce of 250 and generated revenues amounting to 12 million USD.",
    "In 2019, Techvify was established as an auditing firm with a workforce of 200 and generated revenues amounting to 10 million USD.",
]

embedding_num = embedding_model.embed_texts(sentences_num)

ground_truth = embedding_num[0]
numerical_different_y = embedding_num[1]
factually_different_y = embedding_num[2]
factual_and_numerical_different_y = embedding_num[3]

rephrase_numerical = embedding_num[4]
rephrase_factual = embedding_num[5]
rephrase_both = embedding_num[6]

def absolute_difference(text1: str, text2: str) -> float:
    nums1 = extract_numbers(text1)
    nums2 = extract_numbers(text2)
    embeddings = embedding_model.embed_texts([text1, text2])
    sim = cosine_similarity(embeddings[0], embeddings[1])
    if not nums1 or not nums2:
        return sim
    diff = sum(abs(a - b) for a, b in zip(nums1, nums2))
    return sim / (1 + diff)  # Penalize similarity by absolute difference

sim_ground_numerical = absolute_difference(sentences_num[0], sentences_num[1])
sim_ground_factually = absolute_difference(sentences_num[0], sentences_num[2])
sim_ground_both = absolute_difference(sentences_num[0], sentences_num[3])
sim_rephrase_numerical = absolute_difference(sentences_num[0], sentences_num[4])
sim_rephrase_factually = absolute_difference(sentences_num[0], sentences_num[5])
sim_rephrase_both = absolute_difference(sentences_num[0], sentences_num[6])

print(f"Similarity (ground vs numerical different): {sim_ground_numerical}")
print(f"Similarity (ground vs factually different): {sim_ground_factually}")
print(f"Similarity (ground vs both different): {sim_ground_both}")
print(f"Similarity (ground vs rephrased numerical different): {sim_rephrase_numerical}")
print(f"Similarity (ground vs rephrased factually different): {sim_rephrase_factually}")
print(f"Similarity (ground vs rephrased both different): {sim_rephrase_both}")

Similarity (ground vs numerical different): 0.01769427443252656
Similarity (ground vs factually different): 0.9181386012136104
Similarity (ground vs both different): 0.002465423564789788
Similarity (ground vs rephrased numerical different): 0.8375647165885765
Similarity (ground vs rephrased factually different): 0.014765807284835194
Similarity (ground vs rephrased both different): 0.7492907358704025
