In [1]:
from ragas.testset.graph import Node

sample_nodes = [Node(
    properties={"page_content": "Einstein's theory of relativity revolutionized our understanding of space and time. It introduced the concept that time is not absolute but can change depending on the observer's frame of reference."}
),Node(
    properties={"page_content": "Time dilation occurs when an object moves close to the speed of light, causing time to pass slower relative to a stationary observer. This phenomenon is a key prediction of Einstein's special theory of relativity."}
)]
sample_nodes

  from .autonotebook import tqdm as notebook_tqdm


[Node(id: 132ba7, type: NodeType.UNKNOWN, properties: ['page_content']),
 Node(id: e08b68, type: NodeType.UNKNOWN, properties: ['page_content'])]

In [4]:
sample_nodes

[Node(id: 132ba7, type: NodeType.UNKNOWN, properties: ['page_content']),
 Node(id: e08b68, type: NodeType.UNKNOWN, properties: ['page_content'])]

In [3]:
from dotenv import load_dotenv
import os

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [5]:
from ragas.testset.transforms import extractors

dir(extractors)

['EmbeddingExtractor',
 'HeadlinesExtractor',
 'KeyphrasesExtractor',
 'NERExtractor',
 'SummaryExtractor',
 'TitleExtractor',
 'TopicDescriptionExtractor',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'emails_extractor',
 'embeddings',
 'links_extractor',
 'llm_based',
 'markdown_headings_extractor',
 'regex_based']

In [17]:
from ragas.testset.transforms.extractors import TopicDescriptionExtractor, NERExtractor

extractor = NERExtractor()
output = [await extractor.extract(node) for node in sample_nodes]
output

[('entities',
  ['Einstein',
   'theory of relativity',
   'space',
   'time',
   "observer's frame of reference"]),
 ('entities',
  ['Time dilation',
   'speed of light',
   'stationary observer',
   'Einstein',
   'special theory of relativity'])]

In [16]:
extractor = TopicDescriptionExtractor()
output_topic = [await extractor.extract(node) for node in sample_nodes]
output_topic

[('topic_description',
  "A discussion on Einstein's theory of relativity and its impact on the understanding of space and time, emphasizing the relativity of time based on the observer's frame of reference."),
 ('topic_description',
  "An explanation of time dilation, a phenomenon predicted by Einstein's special theory of relativity, where time passes slower for objects moving close to the speed of light compared to stationary observers.")]

In [18]:
_ = [node.properties.update({key:val}) for (key,val), node in zip(output, sample_nodes)]
sample_nodes[0].properties

{'page_content': "Einstein's theory of relativity revolutionized our understanding of space and time. It introduced the concept that time is not absolute but can change depending on the observer's frame of reference.",
 'entities': ['Einstein',
  'theory of relativity',
  'space',
  'time',
  "observer's frame of reference"]}

In [19]:
_ = [node.properties.update({key:val}) for (key,val), node in zip(output_topic, sample_nodes)]
sample_nodes[0].properties

{'page_content': "Einstein's theory of relativity revolutionized our understanding of space and time. It introduced the concept that time is not absolute but can change depending on the observer's frame of reference.",
 'entities': ['Einstein',
  'theory of relativity',
  'space',
  'time',
  "observer's frame of reference"],
 'topic_description': "A discussion on Einstein's theory of relativity and its impact on the understanding of space and time, emphasizing the relativity of time based on the observer's frame of reference."}

In [21]:
from ragas.testset.transforms.relationship_builders import traditional

dir(traditional)

['Counter',
 'DistanceMeasure',
 'JaccardSimilarityBuilder',
 'KnowledgeGraph',
 'Node',
 'OverlapScoreBuilder',
 'Relationship',
 'RelationshipBuilder',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'dataclass',
 't']

# Jaccard Similarity Measure

The **Jaccard Similarity Measure**—also known as the **Jaccard Index** or **Jaccard Coefficient**—is a statistic used to quantify the similarity and diversity between two finite sets. It is defined as the size of the intersection divided by the size of the union of the sets:

$$
    J(A, B) = \frac{|A \cap B|}{|A \cup B|}
$$

where:
- $ |A \cap B| $ is the number of elements common to both sets $A$ and $B$.
- $ |A \cup B| $ is the total number of elements in either set (i.e., all unique elements from both).

The resulting value is always between 0 and 1:
- **0** indicates that the sets have no elements in common.
- **1** indicates that the sets are identical.

## Interpretation

The Jaccard Similarity is particularly useful when:
- Comparing binary data (e.g., presence/absence in text mining or market basket analysis).
- Evaluating similarity between documents, images (via object detection IoU), or user behaviors.
- Dealing with sparse data where the absence of an element in both sets is less informative than its presence in at least one.

## Example

Consider the sets:
- $ A = \{1, 2, 3, 5\} $
- $ B = \{2, 3, 4, 5, 6\} $

Their intersection and union are:
- $ A \cap B = \{2, 3, 5\} $
- $ A \cup B = \{1, 2, 3, 4, 5, 6\} $

Thus, the Jaccard Similarity is:

$$
J(A, B) = \frac{3}{6} = 0.5
$$

In [34]:
from ragas.testset.graph import KnowledgeGraph
from ragas.testset.transforms.relationship_builders.traditional import JaccardSimilarityBuilder

kg = KnowledgeGraph(nodes=sample_nodes)
rel_builder = JaccardSimilarityBuilder(property_name="entities", new_property_name="entity_jaccard_similarity")
relationships = await rel_builder.transform(kg)
relationships

[]

In [38]:
rel_builder_topic = JaccardSimilarityBuilder(property_name="topic_description", new_property_name="entity_jaccard_similarity")
relationships_topic = await rel_builder.transform(kg)
relationships_topic

[Relationship(Node(id: 132ba7) <-> Node(id: e08b68), type: jaccard_similarity, properties: ['entity_jaccard_similarity'])]

In [39]:
from ragas.testset.transforms import apply_transforms
transforms = [
    extractor,
    rel_builder
    ]

apply_transforms(kg,transforms)

Applying NERExtractor:   0%|          | 0/2 [00:00<?, ?it/s]Property 'entities' already exists in node '132ba7'. Skipping!
Applying NERExtractor:  50%|█████     | 1/2 [00:01<00:01,  1.46s/it]Property 'entities' already exists in node 'e08b68'. Skipping!
                                                                        

In [40]:
transforms_topic = [
    TopicDescriptionExtractor(),
    rel_builder_topic
    ]

apply_transforms(kg,transforms)

Applying NERExtractor:   0%|          | 0/2 [00:00<?, ?it/s]

Property 'entities' already exists in node '132ba7'. Skipping!
Applying NERExtractor:  50%|█████     | 1/2 [00:00<00:00,  1.02it/s]Property 'entities' already exists in node 'e08b68'. Skipping!
                                                                        

In [44]:
from ragas.testset.transforms import apply_transforms, Parallel

tranforms = [
    Parallel(
        TopicDescriptionExtractor(),
        NERExtractor()
    ),
    rel_builder
]

apply_transforms(kg,transforms)

Applying NERExtractor:   0%|          | 0/2 [00:00<?, ?it/s]

Property 'entities' already exists in node 'e08b68'. Skipping!
Applying NERExtractor:  50%|█████     | 1/2 [00:01<00:01,  1.07s/it]Property 'entities' already exists in node '132ba7'. Skipping!
                                                                        

In [53]:
relationships_topic

[Relationship(Node(id: 132ba7) <-> Node(id: e08b68), type: jaccard_similarity, properties: ['entity_jaccard_similarity'])]

In [52]:
relationships_topic[0].properties

{'entity_jaccard_similarity': 0.8333333333333334}

In [60]:
kg

KnowledgeGraph(nodes: 2, relationships: 5)

In [78]:
# %%
from dataclasses import dataclass
# Import necessary types from ragas
# Correct the import path for the base class
from ragas.testset.synthesizers.base import BaseSynthesizer
from ragas.testset.synthesizers.testset_schema import SingleTurnSample

@dataclass
class EntityQuerySynthesizer(BaseSynthesizer):
    # ... rest of the class definition remains the same ...

    async def _generate_scenarios( self, n, knowledge_graph, callbacks):
        """
        logic to query nodes with entity
        logic describing how to combine nodes,styles,length,persona to form n scenarios
        """
        # Placeholder: Replace with your actual scenario generation logic
        print(f"Generating {n} scenarios using knowledge_graph...")
        # Example: scenarios = [...]
        scenarios = [{"type": "entity_query", "details": f"Scenario {i+1}"} for i in range(n)]
        return scenarios

    async def _generate_sample(
        self, scenario, callbacks
    ):
        """
        logic on how to use tranform each scenario to EvalSample (Query,Context,Reference)
        you may create singleturn or multiturn sample
        """
        # Placeholder: Replace with your actual sample generation logic based on the scenario
        print(f"Generating sample for scenario: {scenario['details']}")
        # Example: query = "...", contexts = [...], reference = "..."
        query = f"What can you tell me about {scenario.get('details', 'default entity')}?"
        contexts = [f"Context related to {scenario.get('details', 'default entity')}."]
        reference = f"Reference answer about {scenario.get('details', 'default entity')}."

        return SingleTurnSample(user_input=query, reference_contexts=contexts, reference=reference)

In [79]:
# %%
# Example usage (you'll need to adapt this)
synthesizer = EntityQuerySynthesizer()
scenarios = await synthesizer._generate_scenarios(n=5, knowledge_graph=kg, callbacks=None)
for scenario in scenarios:
    sample = await synthesizer._generate_sample(scenario=scenario, callbacks=None)
    print(sample)

Generating 5 scenarios using knowledge_graph...
Generating sample for scenario: Scenario 1
user_input='What can you tell me about Scenario 1?' retrieved_contexts=None reference_contexts=['Context related to Scenario 1.'] response=None multi_responses=None reference='Reference answer about Scenario 1.' rubrics=None
Generating sample for scenario: Scenario 2
user_input='What can you tell me about Scenario 2?' retrieved_contexts=None reference_contexts=['Context related to Scenario 2.'] response=None multi_responses=None reference='Reference answer about Scenario 2.' rubrics=None
Generating sample for scenario: Scenario 3
user_input='What can you tell me about Scenario 3?' retrieved_contexts=None reference_contexts=['Context related to Scenario 3.'] response=None multi_responses=None reference='Reference answer about Scenario 3.' rubrics=None
Generating sample for scenario: Scenario 4
user_input='What can you tell me about Scenario 4?' retrieved_contexts=None reference_contexts=['Context 

In [80]:
scenarios

[{'type': 'entity_query', 'details': 'Scenario 1'},
 {'type': 'entity_query', 'details': 'Scenario 2'},
 {'type': 'entity_query', 'details': 'Scenario 3'},
 {'type': 'entity_query', 'details': 'Scenario 4'},
 {'type': 'entity_query', 'details': 'Scenario 5'}]