In [None]:
import sys
print(sys.executable)  # just to confirm it's /home/shashank/venvs/gpu-env/bin/python

!{sys.executable} -m pip install -U spacy
# !{sys.executable} -m spacy download en_core_web_lg # Not required since using gemini
# !{sys.executable} -m spacy download en_core_web_trf # Not required since using gemini
# !{sys.executable} -m pip install -U spacy-transformers
# !{sys.executable} -m pip install -U "spacy[curated-transformers]"
!{sys.executable} -m pip install -U python-dotenv

# Gemini 
!{sys.executable} -m pip install -U google-generativeai
!{sys.executable} -m pip install -U google-genai pydantic

# Sentence splitting
!{sys.executable} -m pip install -U wtpsplit

# Used for chunking these sentences into vocabulary
!{sys.executable} -m pip install -U nltk


# For the clustering part (clustering our relation phrases)
!{sys.executable} -m pip install -U sentence-transformers scikit-learn

# For our initial GAE
!{sys.executable} -m pip install -U torch_geometric

!{sys.executable} -m pip install -U faiss-cpu

In [None]:
"""
The passage of the Global Magnitsky Sanctions Bill on July 1st marked a major policy shift. The primary target of this legislation was the sprawling Titan Industries conglomerate. 
As a direct consequence of the bill, Titan Industries was immediately cut off from the international SWIFT banking system, triggering a severe liquidity crisis. This event, the Titan Collapse, began on July 2nd.


Simultaneously, the Sanctions Bill authorized the seizure of assets linked to designated entities. This led to the freezing of over $500M in Offshore Accounts held in the Valeron banking district. 
A secondary market report from July 5th noted that the Titan Collapse coincided perfectly with the Offshore Accounts being frozen, sparking rumors of a coordinated conspiracy.


Facing financial ruin from the Titan Collapse, mid-level executives at the company pivoted to illicit activities. We tracked a massive spike in Black Market Sales of Titan's embargoed 
strategic assets, beginning around July 10th. This illicit activity was the sole mechanism used by the company to generate off-books revenue.


The sudden flood of embargoed goods on the black market triggered an automated alert. On July 15th, the Financial Crimes Unit (FCU) announced a formal investigation. 
The unit's press release stated they were investigating the Titan Collapse and its "financial irregularities," although the FCU was, in reality, responding specifically to the Black Market Sales (B) reports.
"""

In [None]:
sample_text = """
The passage of the Global Magnitsky Sanctions Bill on July 1st marked a major policy shift. The primary target of this legislation was the sprawling Titan Industries conglomerate. 
As a direct consequence of the bill, Titan Industries was immediately cut off from the international SWIFT banking system, triggering a severe liquidity crisis. This event, the Titan Collapse, began on July 2nd.
Simultaneously, the Sanctions Bill authorized the seizure of assets linked to designated entities. This led to the freezing of over $500M in Offshore Accounts held in the Valeron banking district. 
A secondary market report from July 5th noted that the Titan Collapse coincided perfectly with the Offshore Accounts being frozen, sparking rumors of a coordinated conspiracy.
Facing financial ruin from the Titan Collapse, mid-level executives at the company pivoted to illicit activities. We tracked a massive spike in Black Market Sales of Titan's embargoed 
strategic assets, beginning around July 10th. This illicit activity was the sole mechanism used by the company to generate off-books revenue.
The sudden flood of embargoed goods on the black market triggered an automated alert. On July 15th, the Financial Crimes Unit (FCU) announced a formal investigation. 
The unit's press release stated they were investigating the Titan Collapse and its "financial irregularities," although the FCU was, in reality, responding specifically to the Black Market Sales (B) reports.
"""

In [None]:
print("hello")

# 3.2 Learning our correlational matrices $A_w$

### Our correlational matrices $A_w = \{W_1, W_2, \dots, W_K\}$ are a set of $K$ weighted adjacency matrices. Each $W_r \in [0,1]^{N \times N}$ represents the relationships between any of the $N$ nodes and any of the other $N$ nodes for the specific relation $r$. Hence $|E_r|$ is the number of non-zero entries in $W_r$.
Note: Here we are just finding the 1-hop correlational links.

### Hence the total number of edges is $|E| = \sum_{r=1}^{K} |E_r|$

## 3.2.1 Document Parsing and Global Node identification

In [None]:
from dotenv import load_dotenv # Import the library
load_dotenv()

In [None]:
import google.generativeai as genai
import os

# Authenticate the google.generativeai SDK with the key read from the environment
# (raises KeyError if GEMINI_API_KEY is not set).
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

print("Available models for content generation:")
# Lazily keep only the models that advertise the 'generateContent' method and
# print each name as it is encountered.
content_models = (
    model
    for model in genai.list_models()
    if 'generateContent' in model.supported_generation_methods
)
for model in content_models:
    print(model.name)

In [None]:
import os
import json
import time
from pydantic import BaseModel, Field
from typing import List
from google.genai import Client, types 


# --- Define the WRAPPER SCHEMA (Used for both calls) ---
# Structured-output schema passed as `response_schema` to both Gemini calls in this
# cell (entity extraction and canonicalization); responses are then read via the
# "entities" key of the parsed JSON.
# NOTE(review): pydantic normally exports the class docstring as the schema
# description, so rewording the docstring may change what the model receives —
# confirm before editing it.
class EntityList(BaseModel):
    """The final response structure containing a simple list of entity names."""
    # Flat list of entity strings; no category or position information is carried.
    entities: List[str] = Field(description="A list of all extracted named entities/nodes as simple strings.")
# -----------------------------------

# 1) Configure the Client
# The key is read from the environment; a missing GEMINI_API_KEY raises KeyError here.
client = Client(api_key=os.environ["GEMINI_API_KEY"])

# Entity-extraction prompt. `sample_text` is interpolated when this cell runs, so the
# cell defining `sample_text` must have been executed first.
prompt = f"""
You are an expert Named Entity Recognition (NER) system.
Your task is to extract **ALL significant named entities and specific expressions** from the passage below.
The entities extracted should include, but are not limited to, standard categories such as **PERSON, ORGANIZATION, LOCATION, DATE, TIME, MONEY, and specific Laws/Events**, but are not limited to these.
Do not categorize the output; return only the extracted text strings.

Your final output MUST be a JSON object that strictly adheres to the provided schema.
The 'entities' list should contain only the exact extracted text strings, nothing else.

Passage:
{sample_text}
"""
# -----------------------------------

# --- ENSEMBLE PARAMETERS AND LOGIC ---
# The same prompt is sent once per temperature; every run's entity list is pooled in
# `all_extracted_entities` (duplicates included) for later de-duplication.
temperatures = [0.2, 0.5, 0.8]
all_extracted_entities = []

print(f"--- Starting {len(temperatures)} Ensemble Queries with Rate Limit Control ---")

for i, T in enumerate(temperatures):
    # JSON output constrained to the EntityList schema, sampled at temperature T.
    config = types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=EntityList,
        temperature=T,
    )

    try:
        response = client.models.generate_content(
            model="gemini-2.5-pro",
            contents=prompt,
            config=config
        )
        raw_json = json.loads(response.text)
        current_entities = raw_json.get("entities", [])
        all_extracted_entities.extend(current_entities)
        print(f"Run {i+1} (T={T:.1f}): Extracted {len(current_entities)} entities.")

    except Exception as e:
        # Best-effort ensemble: a failed run is reported and contributes no entities.
        # There is deliberately no `continue` here: a failed request (often a
        # rate-limit error) must still be followed by the pause below, otherwise the
        # next request is fired immediately.
        print(f"Run {i+1} (T={T:.1f}) FAILED: {e}")

    # Pause between requests (not after the last one), whether or not the run succeeded.
    if i < len(temperatures) - 1:
        print("Pausing for 15 seconds to respect the Free Tier 2 RPM limit...")
        time.sleep(15)

# --- AGGREGATION AND UNION (Initial Cleanup) ---
# Union of all runs: de-duplicate through a set, then sort to get a stable order.
final_unique_entities = sorted(set(all_extracted_entities))
print("\n--- RAW UNION RESULTS (Before Canonicalization) ---")
print(f"Unique entities to process: {len(final_unique_entities)}")

# --- STEP 2: GENERIC AI-POWERED CANONICALIZATION ---

# 1. Define the Cleanup Prompt
# The raw union list is embedded as pretty-printed JSON at the end of the prompt.
cleanup_prompt = f"""
    You are an expert data canonicalization system.
    You are given a list of raw named entities extracted from a document.
    Your task is to consolidate this list into a clean, precise, and final set.

    Rules:
    1.  **Merge Duplicates/Variations:** Combine all abbreviations (e.g., 'FCU') and their full names (e.g., 'Financial Crimes Unit (FCU)') into the single, most complete version (the canonical form).
    2.  **Resolve Co-references:** Merge specific references ('Sanctions Bill') into the full, official name ('Global Magnitsky Sanctions Bill').
    3.  **Prioritize Clarity:** For organizations that appear in different contexts (e.g., 'Titan', 'Titan Industries', 'Titan Collapse'), decide whether the organization ('Titan Industries') or the event ('Titan Collapse') is the more appropriate entity, and eliminate the raw, ambiguous shorter references. Keep distinct entities (like 'Titan Industries' and 'Titan Collapse') separate.
    4.  **Preserve Dates/Money:** Do not change dates or currency values.

    Return ONLY the final, canonical list of entities.

    Raw Entities to Clean:
    {json.dumps(final_unique_entities, indent=2)}
"""

if not final_unique_entities:
    # Every extraction run failed or returned nothing. Sending an empty list would
    # waste the pause and an API call, and the worked examples inside the prompt
    # could be echoed back as if they were extracted entities.
    final_canonical_entities = []
    print("\nCANONICALIZATION SKIPPED: no entities were extracted.")
else:
    # 2. Mandatory delay before the second API call (rate limit)
    print("Pausing for 15 seconds before the Canonicalization API call...")
    time.sleep(15)

    # 3. Perform the second API call for cleanup (temperature 0.0 for the most
    #    repeatable output)
    try:
        cleanup_config = types.GenerateContentConfig(
            response_mime_type="application/json",
            response_schema=EntityList,  # Reuse the simple List[str] schema
            temperature=0.0,
        )

        cleanup_response = client.models.generate_content(
            model="gemini-2.5-pro",
            contents=cleanup_prompt,
            config=cleanup_config
        )

        final_canonical_json = json.loads(cleanup_response.text)
        final_canonical_entities = final_canonical_json.get("entities", [])

        print("\n--- FINAL CANONICAL RESULTS (AI-Cleaned) ---")
        print(f"Final CLEAN entities: {len(final_canonical_entities)}")

    except Exception as e:
        print(f"\nCANONICALIZATION FAILED: {e}")
        # Fallback to the raw union if cleanup fails
        final_canonical_entities = final_unique_entities
        print("Falling back to raw union list.")


# 5) Final Structured Output
# Wrap the canonical list under the "entities" key and show it as indented JSON.
final_output = dict(entities=final_canonical_entities)
print(
    "\nParsed JSON (Final AI-Canonicalized Union):\n",
    json.dumps(final_output, indent=2),
)

In [None]:
print(final_canonical_entities)

In [None]:
print("\nParsed JSON (Final Union):\n", json.dumps(final_output, indent=2))

In [None]:
import os
import json

# Assuming 'final_output' is defined and populated (e.g., {'entities': [...]})

# --- CODE TO SAVE TO FILE ---

# Destination: <output_directory>/<output_filename>
output_directory, output_filename = "saved_stuff", "node_entities_output.json"
file_path = os.path.join(output_directory, output_filename)

# Create the target folder if it is missing ('exist_ok=True' tolerates an existing one).
# A failure is only reported, not re-raised: the write below is still attempted and
# reports its own error if the folder is unusable.
try:
    os.makedirs(output_directory, exist_ok=True)
except Exception as mkdir_error:
    print(f"‚ùå Error creating directory: {mkdir_error}")

# Serialize `final_output` as indented JSON; any failure is printed, not raised.
try:
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        json.dump(final_output, out_file, indent=2)

    print(f"\n‚úÖ Success: Final output saved to {file_path}")

except Exception as write_error:
    print(f"\n‚ùå Error saving file: {write_error}")

# --- END SAVE CODE ---

We are now done extracting the node entities using gemini-2.5-pro, which is better than the trf and lg spaCy models.

Now let's use our SaT (Segment any Text) model to split the text into sentences.

In [None]:
from wtpsplit import SaT

# --- 2. Load the SaT Model ---
# 'sat-3l-sm' is the checkpoint chosen here as a quality/speed trade-off.
print("Loading SaT model...")
try:
    sat = SaT("sat-3l-sm")
except Exception as e:
    print(f"Error loading WtpSplit model: {e}")
    # Re-raise: without a model the split below cannot run. Swallowing the error
    # would either surface as a confusing NameError on `sat`, or silently reuse a
    # stale `sat` left in the kernel by an earlier run.
    raise

# --- 3. Split the text ---
# Segment `sample_text` into sentences; the result is kept in `sentence_list`.
sentence_list = sat.split(sample_text)

print("\n--- Split Sentences (SaT) ---")
for i, sent in enumerate(sentence_list):
    # Strip whitespace/newlines that may remain at the start/end of a segment
    print(f"{i+1}: {sent.strip()}")

print("\nFinal Output Type:", type(sentence_list))

In [None]:
# Trim every segment and drop the ones that are empty after trimming.
sentence_list = [trimmed for trimmed in map(str.strip, sentence_list) if trimmed]

In [None]:
print("\n--- Split Sentences (SaT) ---")
# One numbered line per sentence (1-based), trimmed of surrounding whitespace.
for number, sent in enumerate(sentence_list, start=1):
    print(f"{number}: {sent.strip()}")

In [None]:
import os
import json
# Assuming 'sentence_list' is defined and populated earlier in your notebook.

# --- CODE TO SAVE TO FILE ---

# Destination: <output_directory>/<json_filename>
output_directory, json_filename = "saved_stuff", "segmented_sentences.json"

# Create the folder if needed. A failure is only reported; the write below is still
# attempted and prints its own error if it cannot proceed.
try:
    os.makedirs(output_directory, exist_ok=True)
except Exception as mkdir_error:
    print(f"‚ùå Error creating directory: {mkdir_error}")

file_path = os.path.join(output_directory, json_filename)

# Payload: the sentence list (expects `sentence_list` from the splitting cell)
# stored under the "sentences" key.
data_to_save = {"sentences": sentence_list}

# Write the payload as indented JSON; any failure is printed, not raised.
try:
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        json.dump(data_to_save, out_file, indent=2)

    print(f"\n‚úÖ Success: Sentences saved to {file_path} (JSON Format)")

except Exception as write_error:
    print(f"\n‚ùå Error saving JSON file: {write_error}")

# --- END SAVE CODE ---

In [None]:
## Loading our nodes and sentences back:

import os
import json

# --- CODE TO RETRIEVE FILES ---

# 1. Define the directory
output_directory = "saved_stuff"

# --- FILE 1: NODE ENTITIES ---

node_filename = "node_entities_output.json"
node_file_path = os.path.join(output_directory, node_filename)
retrieved_nodes = None
# Default to an empty list so `entities_list` is always bound, even when loading
# fails (same convention as the paragraph loader's `paragraphs_list`). Without this,
# a failed load only prints a message and later cells die with a NameError.
entities_list = []

try:
    print(f"Attempting to load entities from: {node_file_path}")

    with open(node_file_path, 'r', encoding='utf-8') as f:
        retrieved_nodes = json.load(f)

    print(f"‚úÖ Success: Entities loaded from {node_file_path}")

    # The entity strings live under the "entities" key of the saved object
    entities_list = retrieved_nodes.get("entities", [])

    print("\n--- Entities Summary ---")
    print(f"Total entities retrieved: {len(entities_list)}")
    print("\nParsed JSON (Final Union):\n", json.dumps(entities_list, indent=2))

except FileNotFoundError:
    print(f"\n‚ùå Error: Entity file not found at {node_file_path}.")
except json.JSONDecodeError as e:
    print(f"\n‚ùå Error: Failed to parse entity JSON file. Error: {e}")
except Exception as e:
    print(f"\n‚ùå An unexpected error occurred while loading entities: {e}")


# --- FILE 2: SEGMENTED SENTENCES ---

# 1. Define the filename and path for the second file
sentence_filename = "segmented_sentences.json"
sentence_file_path = os.path.join(output_directory, sentence_filename)
retrieved_sentences = None
# Same always-bound default as `entities_list` above.
sentences_list = []

try:
    print(f"\nAttempting to load sentences from: {sentence_file_path}")

    with open(sentence_file_path, 'r', encoding='utf-8') as f:
        retrieved_sentences = json.load(f)

    print(f"‚úÖ Success: Sentences loaded from {sentence_file_path}")

    # The sentence strings live under the "sentences" key of the saved object
    sentences_list = retrieved_sentences.get("sentences", [])

    print("\n--- Sentences Summary ---")
    print(f"Total sentences retrieved: {len(sentences_list)}")
    print("\nParsed JSON (Final Union):\n", json.dumps(sentences_list, indent=2))

except FileNotFoundError:
    print(f"\n‚ùå Error: Sentence file not found at {sentence_file_path}.")
except json.JSONDecodeError as e:
    print(f"\n‚ùå Error: Failed to parse sentence JSON file. Error: {e}")
except Exception as e:
    print(f"\n‚ùå An unexpected error occurred while loading sentences: {e}")


## --- END RETRIEVE CODE ---

In [None]:
# Chunking sentences into paragraphs (This used a lexical cohesion and vocabulary shift) - The TextTilling 
# Assumes paragraphs end when the vocabulary changes significantly. It uses a windowing approach to score the similarity of adjacent blocks of text, identifying dips in similarity as segment boundaries.

import nltk
from nltk.tokenize.texttiling import TextTilingTokenizer
import os
import json

# --- Load the required NLTK data (if not already downloaded) ---
# (resource, quiet) pairs, fetched in this order; only 'stopwords' reports progress.
for resource_name, quiet_download in (
    ('punkt', True),
    ('averaged_perceptron_tagger', True),
    ('stopwords', False),
):
    nltk.download(resource_name, quiet=quiet_download)

# Check that the two resources can be located before going further.
# NOTE(review): nltk.data.find is documented to raise LookupError when a resource is
# missing — confirm against the installed NLTK version.
for resource_path in ('tokenizers/punkt', 'taggers/averaged_perceptron_tagger'):
    nltk.data.find(resource_path)

# -------------------------------------------------------------


# --- 1. PREPARE THE INPUT ---
# TextTiling takes a single string. Each trimmed sentence is separated from the next
# by a blank line (double newline) so that text blocks are clearly demarcated.
text_input = "\n\n".join(sentence.strip() for sentence in sentences_list)


# --- 2. INITIALIZE THE TEXTTILING TOKENIZER ---
# Per the NLTK documentation:
#   w=20: pseudosentence size.
#   k=10: size (in sentences) of the block used in the block comparison method.
ttt = TextTilingTokenizer(w=20, k=10)


# --- 3. PERFORM SEGMENTATION ---
# Each element of the returned list is one segment (paragraph).
paragraph_list = ttt.tokenize(text_input)


# --- 4. PRINT RESULTS ---
print("\n--- TEXTTILING SEGMENTATION ---")
print(f"Original Sentences: {len(sentences_list)}")
print(f"Generated Paragraphs: {len(paragraph_list)}")
print("-" * 30)

for number, paragraph in enumerate(paragraph_list, start=1):
    # Display form only: trim the segment and flatten its newlines to spaces.
    # `paragraph_list` itself is left untouched.
    cleaned_paragraph = paragraph.strip().replace('\n', ' ')
    print(f"\n[PARAGRAPH {number}]")
    print(cleaned_paragraph)

print("-" * 30)

# The final output is stored in the 'paragraph_list' variable.

In [None]:
import os
import json
# You would have defined paragraph_list here from the TextTiling output

# --- CODE TO SAVE TO FILE ---

# Destination: <output_directory>/<json_filename>
output_directory, json_filename = "saved_stuff", "segmented_paragraphs.json"

# Create the folder if needed. A failure is only reported; execution continues and
# the write below prints its own error if it cannot proceed.
try:
    os.makedirs(output_directory, exist_ok=True)
except Exception as mkdir_error:
    print(f"‚ùå Error creating directory: {mkdir_error}")

file_path = os.path.join(output_directory, json_filename)

# Payload: the TextTiling segments (expects `paragraph_list` to be populated) stored
# under the "paragraphs" key.
data_to_save = {"paragraphs": paragraph_list}

# Write the payload as indented JSON; any failure is printed, not raised.
try:
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        json.dump(data_to_save, out_file, indent=2)

    print(f"\n‚úÖ Success: Paragraphs saved to {file_path} (JSON Format)")

except Exception as write_error:
    print(f"\n‚ùå Error saving JSON file: {write_error}")


In [None]:
import os
import json
from typing import List, Dict, Any

# --- CODE TO RETRIEVE PARAGRAPHS ---

# Location of the paragraphs file (same directory and filename the save cell uses).
output_directory = "saved_stuff"
json_filename = "segmented_paragraphs.json"
file_path = os.path.join(output_directory, json_filename)

# Defaults that remain in place if loading fails, so both names are always bound.
retrieved_data: Dict[str, Any] = {}
paragraphs_list: List[str] = []

print(f"Attempting to load paragraphs from: {file_path}")

try:
    # Deserialize the whole file, then pull the list stored under "paragraphs".
    with open(file_path, mode='r', encoding='utf-8') as json_file:
        retrieved_data = json.load(json_file)

    paragraphs_list = retrieved_data.get("paragraphs", [])

    print(f"\n‚úÖ Success: Paragraphs successfully loaded from {file_path}")

    # --- Data inspection: count, then every paragraph with a 1-based label ---
    print("\n--- Retrieved Data Details ---")
    print(f"Total paragraphs retrieved: {len(paragraphs_list)}")

    for position, para_text in enumerate(paragraphs_list, start=1):
        print(f"Paragraph {position}")
        print(para_text)

except FileNotFoundError:
    print(f"\n‚ùå Retrieval Error: File not found at {file_path}.")
    print("Detail: Ensure the 'saved_stuff' folder exists and contains 'segmented_paragraphs.json'.")

except json.JSONDecodeError as decode_error:
    print(f"\n‚ùå Retrieval Error: Failed to parse JSON file at {file_path}.")
    print(f"Detail: The file exists but contains invalid JSON syntax (e.g., missing quotes, commas, or mismatched brackets). Error: {decode_error}")

except Exception as unexpected_error:
    print(f"\n‚ùå An unexpected error occurred: {unexpected_error}")

# The 'paragraphs_list' variable now holds your recovered data (or an empty list if loading failed).
# print(paragraphs_list) # Uncomment this to see the full list directly

In [None]:
# Clean paragraphs: collapse every run of whitespace (newlines included) into a
# single space and trim the ends.
cleaned_paragraphs = [" ".join(p.split()) for p in paragraphs_list]

# From here on `paragraphs_list` refers to the cleaned texts.
paragraphs_list = cleaned_paragraphs


In [None]:
import pprint 

pprint.pprint(retrieved_sentences)

In [None]:
# Working aliases for the three loaded collections: nodes, sentences, paragraphs.
N, Extracted_Sentences, Paragraphs_List = entities_list, sentences_list, paragraphs_list


In [None]:
pprint.pprint(N)

In [None]:
from pprint import pprint # (Beautifully prints lists, nested structures, dicts), its pretty print

In [None]:
for para in Paragraphs_List:
    pprint(para)
    print('\n')

In [None]:
# Give every node, sentence and paragraph a stable 1-based ID ("N1", "S1", "P1", ...).
# The maps are built from the loaded source lists instead of from N /
# Extracted_Sentences / Paragraphs_List themselves, so re-running this cell is
# idempotent. Enumerating the already-converted dicts would iterate their keys and
# silently produce {"N1": "N1", ...}.
N = {f"N{i}": name for i, name in enumerate(entities_list, start=1)}
Extracted_Sentences = {f"S{i}": name for i, name in enumerate(sentences_list, start=1)}
Paragraphs_List = {f"P{i}": name for i, name in enumerate(paragraphs_list, start=1)}

In [None]:
type(N)

In [None]:
pprint(Paragraphs_List)

In [None]:
for key in sorted(Extracted_Sentences.keys(), key=lambda x: int(x[1:])):
    print(key, ":", Extracted_Sentences[key])


In [None]:
pprint(Paragraphs_List)

In [None]:
import collections
from typing import Dict, Tuple, List, Any

# Initialize T_map: maps node_id to a list of (paragraph_id, sentence_id) tuples
T_map: Dict[str, List[Tuple[str, str]]] = collections.defaultdict(list)

print("Starting node-to-context alignment...")

# All comparisons below are whitespace-insensitive: every run of whitespace is
# collapsed to one space. The paragraph texts are normalized this way by the cleaning
# cell, so sentences and entities must be normalized identically or a stray newline
# or double space makes the containment checks miss.
normalized_paragraphs: Dict[str, str] = {
    paragraph_id: " ".join(paragraph_text.split())
    for paragraph_id, paragraph_text in Paragraphs_List.items()
}
normalized_sentences: Dict[str, str] = {
    sentence_id: " ".join(sentence_text.split())
    for sentence_id, sentence_text in Extracted_Sentences.items()
}

# 1. Resolve each sentence's paragraph once, up front (it does not depend on the
#    node). The first paragraph containing the full sentence wins.
sentence_to_paragraph: Dict[str, str] = {}
for sentence_id, sentence_norm in normalized_sentences.items():
    for paragraph_id, paragraph_norm in normalized_paragraphs.items():
        if sentence_norm in paragraph_norm:
            sentence_to_paragraph[sentence_id] = paragraph_id
            break

# 2. For every node, record each sentence that mentions it together with that
#    sentence's paragraph.
for node_id, entity_text in N.items():
    entity_text_clean = " ".join(entity_text.split())

    # Skip empty nodes (unlikely but safe)
    if not entity_text_clean:
        continue

    for sentence_id, sentence_norm in normalized_sentences.items():
        # Plain substring match of the entity string inside the sentence
        if entity_text_clean not in sentence_norm:
            continue

        paragraph_id = sentence_to_paragraph.get(sentence_id)
        if paragraph_id is None:
            # The sentence was not found in any paragraph (e.g., due to
            # fragmentation): warn and skip recording.
            print(f"Warning: Sentence {sentence_id} containing '{entity_text_clean}' was not cleanly mapped to a paragraph.")
            continue

        # Record the mapping: Node ID -> (Paragraph ID, Sentence ID)
        T_map[node_id].append((paragraph_id, sentence_id))

print("\n--- T_map Generation Complete ---")
print(f"Total nodes mapped: {len(T_map)}")
print("-" * 40)



In [None]:
# --- Final Output ---
# Display the T_map
import pprint
pprint.pprint(dict(T_map))

In [None]:
from typing import Dict, List, Any, Tuple
import itertools
import pprint

# Invert_T_map: paragraph_id -> {'context': paragraph text,
#                                'Nodes': node ids found in that paragraph,
#                                'pairs_to_check': directed node pairs}
# 'Nodes' starts as a set (for de-duplication) and is turned into a sorted list below.
Invert_T_map: Dict[str, Dict[str, Any]] = {
    p_id: {'context': content, 'Nodes': set(), 'pairs_to_check': []}
    for p_id, content in Paragraphs_List.items()
}

# Attach each node to every paragraph it was located in (the sentence id is not needed).
for node_id, locations in T_map.items():
    for p_id, _s_id in locations:
        entry = Invert_T_map.get(p_id)
        if entry is not None:
            entry['Nodes'].add(node_id)

# Finalize each paragraph: sorted node list plus, for every unordered pair (A, B) of
# its nodes, the two directed pairs (A, B) followed by (B, A).
for entry in Invert_T_map.values():
    ordered_nodes = sorted(entry['Nodes'])
    entry['Nodes'] = ordered_nodes
    entry['pairs_to_check'] = [
        directed_pair
        for first, second in itertools.combinations(ordered_nodes, 2)
        for directed_pair in ((first, second), (second, first))
    ]


# --- Final Output ---
print("\n--- Invert_T_map Generation Complete (Directed Pairs) ---")
print(f"Total paragraphs mapped: {len(Invert_T_map)}")

pprint.pprint(Invert_T_map)

Extract our relations

In [None]:
import os
import json
import time
# Assuming these imports are available in your execution environment:
from typing import Dict, List, Any, Tuple, Optional
from google.genai import Client, types 
from pydantic import BaseModel, Field

# --- 1. DEFINE THE OUTPUT SCHEMAS ---
# Schema for one directed relation (subject -> object) between two node IDs; it is the
# element type of FinalRelationList.relations.
# NOTE(review): pydantic normally exports the class docstring and the Field
# descriptions as part of the schema, so they act as model instructions — confirm
# before rewording them.
class RelationTriplet(BaseModel):
    """
    Defines the structure for a single extracted directional relationship.
    
    CRUCIAL EDIT: The description for 'relation_phrase' is changed to
    force a normalized, semantic relation type instead of a full sentence.
    """
    # Source node ID of the directed relation.
    subject_id: str = Field(description="The ID of the source node (e.g., 'N13').")
    # Target node ID of the directed relation.
    object_id: str = Field(description="The ID of the target node (e.g., 'N4').")
    # Short normalized relation label; the description steers the model away from full sentences.
    relation_phrase: str = Field(description="A **short, normalized, semantic verb phrase** describing the relationship (e.g., 'TARGETS', 'CAUSED_BY', 'HAPPENED_ON'). Use past tense verbs where appropriate. **DO NOT** use the entire sentence. The phrase must clearly define the Subject's action toward the Object.")
    # Model-reported confidence; the description asks for 0.0-1.0 but no range is enforced here.
    confidence: float = Field(description="The model's confidence in the relationship (0.0 to 1.0). **ONLY ASSIGN CONFIDENCE >= 0.9 IF the relationship is explicitly and clearly stated (or directly implied by a strong verb) in the text.**")

# Top-level wrapper so the response is a JSON object with a "relations" key rather
# than a bare list; the extraction loop reads that key from the parsed response.
# NOTE(review): the docstring may be exported as the schema description — confirm
# before rewording it.
class FinalRelationList(BaseModel):
    """Wraps the list of relations to ensure reliable JSON Schema conversion."""
    # Every extracted triplet for one request.
    relations: List[RelationTriplet] = Field(description="The list of extracted directional relational triplets.")
# ------------------------------------------------------------------------------------------------

# 2. CONFIGURE THE CLIENT
# Note: Assuming Invert_T_map, N, and pprint are defined globally in the execution environment
# Fail fast with a KeyError when GEMINI_API_KEY is missing, exactly like the
# entity-extraction cell. A placeholder key would only postpone the failure to one
# authentication error per paragraph.
client = Client(api_key=os.environ["GEMINI_API_KEY"])

# paragraph_id -> list of extracted relation dicts (filled by the loop below)
RELATION_MAP: Dict[str, List[Dict[str, Any]]] = {}
# Pause between consecutive API requests, in seconds (free-tier rate limit)
DELAY_SECONDS = 15

# --- EXTRACTION INSTRUCTIONS ---
# Rule text embedded in every request as the "instructions" field of the structured
# input. It asks the model to:
# 1. Use short, normalized verb phrases as relation labels.
# 2. Respect direction (the relation runs from Subject to Object).
# 3. Model dates/money as properties rather than causal agents.
# 4. Reserve confidence >= 0.9 for explicitly stated relations.
EXTRACTION_INSTRUCTIONS = """
You are an expert Relational Extractor and Knowledge Graph Builder. Analyze the following structured input and extract only the most meaningful, directional relationships.

RULES:
1. **Normalized Relations:** The 'relation_phrase' MUST be a **short, normalized verb phrase** (e.g., 'TARGETED', 'CAUSED', 'LOCATED_AT'). DO NOT use the entire sentence.
2. **Directional Semantics:** Ensure the relation describes the Subject's action/state relative to the Object. (e.g., If 'I am the father of my dad', you output nothing. If 'My dad is the father of me', output: 'father_of').
3. **Property Modeling:** For temporal/spatial/quantitative relationships:
    - Dates (e.g., 'July 1st') or Money (e.g., '$500M') are NOT causal agents.
    - If Subject is a **Date** and Object is an **Event**, use: 'MARKED_THE_START_OF' or 'OCCURRED_ON'.
    - If Subject is an **Event** and Object is a **Date**, use: 'BEGAN_ON'.
4. **Confidence (DAG Filter):** Assign confidence >= 0.9 ONLY if the relationship is explicitly and unambiguously stated. Use a lower score (e.g., 0.5-0.8) for implied or weak (non-causal) relations like 'IS_A' or 'HAS_PROPERTY'.
5. **Output Format:** Provide output as a list: [{'subject_id': ID, 'object_id': ID, 'relation_phrase': 'SHORT_PHRASE', 'confidence': float}]
"""

print(f"--- Starting Relation Extraction for {len(Invert_T_map)} Paragraphs ---")

# The request configuration is identical for every paragraph, so build it once:
# JSON output constrained to the FinalRelationList schema at temperature 0.0.
config = types.GenerateContentConfig(
    response_mime_type="application/json",
    response_schema=FinalRelationList,
    temperature=0.0
)

# 3. LOOP THROUGH EACH PARAGRAPH
for i, (p_id, p_data) in enumerate(Invert_T_map.items()):

    context = p_data['context']
    pairs_to_check = p_data['pairs_to_check']

    # Nothing to ask when the paragraph has no node pairs: record an empty result
    # without spending an API call or a rate-limit pause. Any triplet returned for
    # such a paragraph could not correspond to a requested pair.
    if not pairs_to_check:
        print(f"\nSkipping {p_id}: no node pairs to check.")
        RELATION_MAP[p_id] = []
        continue

    # 3a. Prepare input nodes for the prompt (ID and Name)
    node_details = [{"id": n_id, "name": N[n_id]} for n_id in p_data['Nodes']]

    # 3b. Construct the Structured Prompt (JSON Format)
    structured_input_data = {
        "context": context,
        "nodes": node_details,
        "pairs_to_check": pairs_to_check,
        "instructions": EXTRACTION_INSTRUCTIONS
    }

    # 3c. Final Prompt for the LLM
    final_extraction_prompt = f"""
    You are an expert Relational Extractor. Analyze the following structured input (JSON) 
    and extract all specified relationships based ONLY on the provided 'context'.

    Input Data (JSON):
    {json.dumps(structured_input_data, indent=2)}
    """

    try:
        print(f"\nProcessing {p_id} (Nodes: {len(p_data['Nodes'])}, Pairs: {len(pairs_to_check)})")

        # 3d. Execute the Query
        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=final_extraction_prompt,
            config=config
        )

        # 3e. Collect Results
        raw_json_output = json.loads(response.text)
        relations_list = raw_json_output.get("relations", [])
        RELATION_MAP[p_id] = relations_list

        print(f"‚úÖ Extracted {len(relations_list)} triplets from {p_id}.")

    except Exception as e:
        # Best effort: a failed paragraph is reported and stored as an empty list so
        # the remaining paragraphs are still processed.
        print(f"‚ùå Query for {p_id} FAILED: {e}")
        RELATION_MAP[p_id] = []

    # 4. RATE LIMIT CONTROL (Mandatory for Free Tier): pause after every request
    #    except the one for the last paragraph, whether it succeeded or failed.
    if i < len(Invert_T_map) - 1:
        print(f"Pausing for {DELAY_SECONDS} seconds to respect 2 RPM limit...")
        time.sleep(DELAY_SECONDS)

# --- FINAL AGGREGATION AND OUTPUT ---

print("\n\n--- FINAL RELATION MAP ---")
print(f"Total paragraphs processed: {len(RELATION_MAP)}")
print("-" * 35)

# Note: Assuming pprint is available for outputting the result
# pprint.pprint(RELATION_MAP)

In [None]:
pprint.pprint(RELATION_MAP)

In [None]:
# Save the above RELATION_MAP

import json
import os
import pprint

# --- Define the file path and directory ---
SAVE_DIR = "saved_stuff"
FILE_NAME = "relation_map_data.json"
FULL_PATH = os.path.join(SAVE_DIR, FILE_NAME)

# 1. Ensure the directory exists
os.makedirs(SAVE_DIR, exist_ok=True)

# 2. Save the dictionary to JSON
try:
    # Serialize BEFORE opening the file. Opening with 'w' truncates the file
    # immediately, so if RELATION_MAP held a value json cannot encode, the
    # previously saved (expensive, LLM-generated) results would be wiped out.
    # indent=4 keeps the JSON human-readable and easy to debug.
    serialized_relation_map = json.dumps(RELATION_MAP, indent=4)
    with open(FULL_PATH, 'w') as f:
        f.write(serialized_relation_map)
    print(f"‚úÖ Successfully saved RELATION_MAP to: {FULL_PATH}")
except Exception as e:
    print(f"‚ùå Error saving file: {e}")

In [None]:
import json
import os
import pprint

# --- Define the file path and directory (must match save code) ---
SAVE_DIR = "saved_stuff"
FILE_NAME = "relation_map_data.json"
FULL_PATH = os.path.join(SAVE_DIR, FILE_NAME)

# Start from an empty map so that a failed load leaves RELATION_MAP == {}.
RELATION_MAP = {}

# 1. Load the dictionary from JSON
try:
    with open(FULL_PATH, 'r') as f:
        RELATION_MAP = json.load(f)
except FileNotFoundError:
    print(f"‚ùå Error: File not found at {FULL_PATH}. Did you run the save code first?")
except Exception as e:
    print(f"‚ùå Error loading file: {e}")
else:
    # Reporting happens outside the try block so that a problem while
    # previewing the data is never misreported as a loading failure.
    print(f"‚úÖ Successfully loaded RELATION_MAP from: {FULL_PATH}")

    if isinstance(RELATION_MAP, dict):
        print(f"Total paragraphs loaded: {len(RELATION_MAP)}")

        # Preview 'P1' when present; otherwise fall back to whichever key
        # comes first, and skip the preview entirely for an empty map.
        if 'P1' in RELATION_MAP:
            sample_key = 'P1'
        else:
            sample_key = next(iter(RELATION_MAP), None)

        if sample_key is not None:
            print("\nSnippet of Loaded Data:")
            pprint.pprint(RELATION_MAP[sample_key])
    else:
        print(f"Warning: expected a JSON object, got {type(RELATION_MAP).__name__}.")

In [None]:
# Combine all of the above contents to form ExtractedTriplets
 
# Flatten RELATION_MAP (paragraph id -> list of triplets) into one list,
# keeping the paragraphs in the dictionary's iteration order.
ExtractedTriplets = [
    triplet
    for paragraph_triplets in RELATION_MAP.values()
    for triplet in paragraph_triplets
]

print(f"‚úÖ Total combined triplets: {len(ExtractedTriplets)}")

In [None]:
pprint.pprint(ExtractedTriplets)

In [None]:
# Lets do our clustering

import numpy as np
import time
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any, Tuple


# --- 1. Collect every relation phrase, then deduplicate (sorted for a stable order) ---
relation_phrases = [triplet['relation_phrase'] for triplet in ExtractedTriplets]
unique_phrases = sorted(set(relation_phrases))
num_unique_phrases = len(unique_phrases)

print(f"Total unique relation phrases found: {num_unique_phrases}")

# --- 2. Number of clusters: at most K_MAX, and never more than the phrases available ---
K_MAX = 10
K = num_unique_phrases if num_unique_phrases < K_MAX else K_MAX

print(f"Clustering target K set to: {K}")

# --- 3. Embed each unique phrase with the sentence embedder ---
model = SentenceTransformer('all-MiniLM-L6-v2')

print("Starting phrase embedding...")
start_time = time.time()

# One embedding row per entry of unique_phrases, in the same order.
embeddings = model.encode(unique_phrases, show_progress_bar=False)

end_time = time.time()
print(f"Embedding completed in {end_time - start_time:.2f} seconds.")
print(f"Embedding shape: {embeddings.shape}")

# --- 4. K-Means over the phrase embeddings ---
# --- 5. ClusterMap: phrase -> 1-based cluster id ---
ClusterMap: Dict[str, int] = {}

if K < 1:
    # Nothing to cluster: ClusterMap stays empty.
    print("Warning: No unique phrases to cluster.")
else:
    kmeans = KMeans(n_clusters=K, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(embeddings)

    # fit_predict labels are 0-based; shift by one so ids start at 1.
    ClusterMap.update(
        (phrase, int(label) + 1)
        for phrase, label in zip(unique_phrases, cluster_labels)
    )

# --- 6. Display the phrases grouped by cluster id ---
print("\n--- Final Cluster Map ---")
clusters_grouped = {}
for phrase, c_id in ClusterMap.items():
    clusters_grouped.setdefault(c_id, []).append(phrase)

for c_id in sorted(clusters_grouped):
    phrases = clusters_grouped[c_id]
    print(f"\nCluster {c_id}: (n={len(phrases)})")
    print("-------------------------")
    for phrase in phrases:
        print(f" - {phrase}")

print(f"\nClusterMap size: {len(ClusterMap)}")

In [None]:
pprint.pprint(ClusterMap)

In [None]:
pprint.pprint(N)

In [None]:
import numpy as np
from scipy.sparse import coo_matrix
from typing import List, Dict, Any

# --- 0. Node mapping: "N<k>" -> integer index k; index 0 is reserved/unused ---
node_to_idx: Dict[str, int] = {
    node_id_str: int(node_id_str[1:])  # strip the leading 'N', e.g. "N10" -> 10
    for node_id_str in N
}

# Count only real nodes. A later cell inserts the placeholder key 'N0' into N;
# using len(N) here would make a re-run of this cell report one node too many
# and grow every matrix by one row/column.
NUM_NODES = sum(1 for idx in node_to_idx.values() if idx != 0)

# Size the matrices by the largest index so every node id fits, keeping
# row/column 0 unused. For ids N1..Nn this equals NUM_NODES + 1.
MATRIX_DIM = max(node_to_idx.values(), default=0) + 1

# --- 1. Number of clusters ---
# Cluster IDs are 1..K; an empty ClusterMap yields K = 0 instead of a ValueError.
K_CLUSTER = max(ClusterMap.values(), default=0)

print(f"Total number of unique nodes (N): {NUM_NODES}")
print(f"Total number of relation types (K): {K_CLUSTER}")

# --- 2. Initialize K builders for COO matrices ---
# Each W_k has its own data/row/col list
W_data: List[List[float]] = [[] for _ in range(K_CLUSTER)]
W_row:  List[List[int]]   = [[] for _ in range(K_CLUSTER)]
W_col:  List[List[int]]   = [[] for _ in range(K_CLUSTER)]

print("\nPopulating sparse matrix builders in a single pass...")

for triplet in ExtractedTriplets:
    relation_phrase = triplet['relation_phrase']
    confidence      = float(triplet['confidence'])

    cluster_id = ClusterMap.get(relation_phrase)
    if cluster_id is None:
        print(f"Warning: Phrase '{relation_phrase}' not found in ClusterMap. Skipping.")
        continue

    # subject/object indices; a triplet that names a node id missing from N
    # is skipped with a warning, mirroring the unknown-phrase handling above.
    row_idx = node_to_idx.get(triplet['subject_id'])
    col_idx = node_to_idx.get(triplet['object_id'])
    if row_idx is None or col_idx is None:
        print(f"Warning: Triplet references unknown node id "
              f"('{triplet['subject_id']}' -> '{triplet['object_id']}'). Skipping.")
        continue

    # cluster_id is 1-based; our list index is 0-based
    matrix_index = cluster_id - 1

    # append to that cluster's lists
    W_data[matrix_index].append(confidence)
    W_row[matrix_index].append(row_idx)
    W_col[matrix_index].append(col_idx)

# --- 3. Build K COO matrices from those lists ---
W_k_matrices: List[coo_matrix] = []

print("\nConstructing final sparse matrices (W_1 to W_K)...")

for k in range(K_CLUSTER):
    # Even if the lists are empty, coo_matrix will just give an empty matrix
    W_k = coo_matrix(
        (W_data[k], (W_row[k], W_col[k])),
        shape=(MATRIX_DIM, MATRIX_DIM)
    )
    W_k_matrices.append(W_k)
    print(f"W_{k+1} (Cluster {k+1}): "
          f"Shape={W_k.shape}, Entries={W_k.nnz}, Data Sum={W_k.data.sum():.4f}")

print(f"\nSuccessfully created {len(W_k_matrices)} sparse matrices.")


In [None]:
import os
from scipy.sparse import save_npz, coo_matrix
from typing import List

# --- Assumed Variables (W_k_matrices is available from the execution) ---
# W_k_matrices: List[coo_matrix]
# -----------------------------------------------------------------------

def save_sparse_matrices(matrices: List[coo_matrix], directory: str = "saved_stuff", filename: str = "sparse_matrices.npz"):
    """
    Write a list of sparse matrices into one compressed .npz archive.

    The i-th matrix is handed to np.savez_compressed as the keyword argument
    "W_<i>" (W_0, W_1, ...), so the archive keys preserve the list order.

    Args:
        matrices: The scipy.sparse matrices to store.
        directory: Folder to write into; created if it does not exist.
        filename: Name of the archive inside that folder.

    Returns:
        None. An empty list prints a warning and writes nothing; a failure
        while saving is printed rather than raised.
    """
    if not matrices:
        print("Warning: Matrix list is empty. Nothing to save.")
        return

    os.makedirs(directory, exist_ok=True)
    full_path = os.path.join(directory, filename)

    # Archive keys follow the list positions: W_0, W_1, ...
    named_matrices = {}
    for position, matrix in enumerate(matrices):
        named_matrices[f"W_{position}"] = matrix

    try:
        np.savez_compressed(full_path, **named_matrices)
    except Exception as e:
        print(f"‚ùå Error saving matrices: {e}")
    else:
        print(f"‚úÖ Successfully saved {len(matrices)} matrices to: {full_path}")

# Example Usage (You would call this after the final matrix construction):
save_sparse_matrices(W_k_matrices)

In [None]:
import os
import numpy as np
from scipy.sparse import load_npz, coo_matrix, spmatrix
from typing import List, Dict

def load_sparse_matrices(directory: str = "saved_stuff", filename: str = "sparse_matrices.npz") -> List[spmatrix]:
    """
    Load the matrices stored under the keys W_0, W_1, ... of an .npz archive
    and return them, in key order, as a list of scipy.sparse matrices.

    Args:
        directory: The folder where the file is located.
        filename: The name of the compressed file.

    Returns:
        The loaded matrices. Reading stops at the first missing W_<i> key.
        An empty list is returned (and a message printed) when the file is
        missing or cannot be read.
    """
    full_path = os.path.join(directory, filename)
    loaded_matrices: List[spmatrix] = []

    try:
        # NOTE(security): allow_pickle=True unpickles arbitrary objects, which
        # can execute code. Only load archives this notebook wrote itself.
        with np.load(full_path, allow_pickle=True) as loaded_archive:
            i = 0
            while f"W_{i}" in loaded_archive.files:
                entry = loaded_archive[f"W_{i}"]

                # A sparse matrix passed to np.savez_compressed is stored as a
                # 0-d object array wrapping the matrix. Unwrap it so callers
                # get the matrix itself (with a real .shape, .nnz, ...) rather
                # than an ndarray of shape ().
                if isinstance(entry, np.ndarray) and entry.dtype == object and entry.ndim == 0:
                    entry = entry.item()

                loaded_matrices.append(entry)
                i += 1

        print(f"‚úÖ Successfully loaded {len(loaded_matrices)} matrices from: {full_path}")
        return loaded_matrices

    except FileNotFoundError:
        print(f"‚ùå Error: File not found at {full_path}. Please check the path and filename.")
        return []
    except Exception as e:
        print(f"‚ùå Error loading file: {e}")
        return []

# Example Usage:
W_k_matrices = load_sparse_matrices()
if W_k_matrices:
    print(f"First loaded matrix shape: {W_k_matrices[0].shape}")

In [None]:
import pprint

print("\n--- Raw W_k COO matrices ---")
pprint.pprint(W_k_matrices)


In [None]:
import numpy as np
from scipy.sparse import dok_matrix, csr_matrix
from typing import List

def finalize_relation_matrix(
    data_list: List[float],
    row_list: List[int],
    col_list: List[int],
    n_nodes: int
) -> csr_matrix:
    """
    Collapse parallel (score, row, col) lists into one n_nodes x n_nodes
    float32 CSR matrix that keeps, per (row, col) cell, the largest score seen.

    A cell is only written when a score exceeds what it currently holds
    (0.0 for an untouched cell), so non-positive scores never create entries.
    An empty data_list yields an empty matrix of the same shape.
    """
    shape = (n_nodes, n_nodes)

    if not data_list:
        return csr_matrix(shape, dtype=np.float32)

    # DOK supports cheap per-cell reads and writes while accumulating.
    edge_max = dok_matrix(shape, dtype=np.float32)

    for score, src, dst in zip(data_list, row_list, col_list):
        score = float(score)
        if not (score > edge_max[src, dst]):
            # Not an improvement over the stored value (or the implicit 0.0).
            continue
        edge_max[src, dst] = score

    # CSR for the downstream arithmetic.
    return edge_max.tocsr()


In [None]:
import numpy as np
from scipy.sparse import csr_matrix
from typing import List

# Final_Aggregated_W[k] is the max-aggregated matrix for cluster k + 1.
# Every iteration below appends exactly one matrix so that this positional
# correspondence holds even when a cluster is empty or fails to convert.
Final_Aggregated_W: List[csr_matrix] = []

print("\n--- Starting Max Aggregation for All Clusters ---")

for k_index in range(K_CLUSTER):
    raw_data_list = W_data[k_index]
    row_list_k    = W_row[k_index]
    col_list_k    = W_col[k_index]

    if not raw_data_list:
        # No edges in this cluster: just push an empty matrix
        empty_csr = csr_matrix((MATRIX_DIM, MATRIX_DIM), dtype=np.float32)
        Final_Aggregated_W.append(empty_csr)
        print(f"‚úÖ Final W_{k_index + 1}: EMPTY matrix, Shape={empty_csr.shape}")
        continue

    try:
        data_array_k = np.asarray(raw_data_list, dtype=np.float32)
    except Exception as e:
        print(f"‚ùå ERROR in W_{k_index + 1} data conversion: {e}")
        # Still append a placeholder: skipping the append would shift every
        # later cluster one position to the left in Final_Aggregated_W.
        Final_Aggregated_W.append(
            csr_matrix((MATRIX_DIM, MATRIX_DIM), dtype=np.float32)
        )
        continue

    # Call aggregation: note MATRIX_DIM = NUM_NODES + 1
    Final_W_k_CSR = finalize_relation_matrix(
        data_array_k.tolist(),
        row_list_k,
        col_list_k,
        MATRIX_DIM
    )

    Final_Aggregated_W.append(Final_W_k_CSR)

    print(f"‚úÖ Final W_{k_index + 1}: "
          f"Shape={Final_W_k_CSR.shape}, "
          f"Unique Edges={Final_W_k_CSR.nnz}, "
          f"Total Weight={Final_W_k_CSR.data.sum():.4f}")

assert len(Final_Aggregated_W) == K_CLUSTER, "one matrix per cluster expected"

print(f"\nSuccessfully created {len(Final_Aggregated_W)} final, maximized CSR matrices.")


In [None]:
from scipy.sparse import coo_matrix

# Report each aggregated matrix: a summary line, then its non-zero edges.
print("\n--- Final Maximized Relational Matrices (W_k) ---")
print(f"Total Matrices in List: {len(Final_Aggregated_W)}\n")

for cluster_id, final_Wk in enumerate(Final_Aggregated_W, start=1):
    print()
    print(f"**Matrix W_{cluster_id} (Cluster {cluster_id})**")
    print(f"Shape: {final_Wk.shape}, "
          f"Unique Edges (nnz): {final_Wk.nnz}, "
          f"Total Weight: {final_Wk.data.sum():.4f}")

    if final_Wk.nnz > 0:
        # COO exposes parallel row/col/data arrays, convenient for listing edges.
        coo_wk: coo_matrix = final_Wk.tocoo()

        print("  Non-Zero Edges (Subject Index -> Object Index | Max Confidence Score):")
        for r, c, d in zip(coo_wk.row, coo_wk.col, coo_wk.data):
            print(f"  ({r} -> {c}) | Score: {d:.4f}")
    else:
        print("  [Matrix is empty (no relationships in this cluster)]")


In [None]:
pprint.pprint(Final_Aggregated_W)

In [None]:
# A_w: alias for the list of per-cluster max-aggregated CSR matrices.
A_w = Final_Aggregated_W
# Self-assignments: they do not change N or T_map. Their only effect is to
# raise NameError here if either name is not defined in the current kernel.
N = N
T_map = T_map

# 3.3 $C_{prior}$ generation

The goal here is to create a sparse answer key $C_{prior}$ (Our causal prior).

Our $C_{prior}$ is essentially a high-quality "training dataset" that tells us the true causal links in the document.
Since we have no human labels, we must generate the dataset ourselves. This is a self-supervised process.
The core idea is to use a large, powerful "Teacher" LLM (e.g., gemini-2.5-flash) to perform complex causal reasoning.
The output of this phase is $C_{prior}$ (the causal prior).

## Active Candidate-Set Expansion (ACE)

We can't query the LLM with all pairs of nodes, since that would take $O(N^2)$ queries. Instead, we prune the pairs down to a list of plausible candidate pairs $E_{prior}$; this small list is what we send to our LLM. The pruning is done in 2 stages:
1. Structural Filter (GAE)
2. Semantic Filter

### 1. Structural Filter (GAE):

Our initial $A_w$ matrices are myopic: they only contain local 1-hop links within a paragraph. We need an unsupervised way to find pairs $(i, j)$ that are strongly connected via multi-hop paths, as these are highly plausible candidates for causal relationships.

In [None]:
import numpy as np
from scipy.sparse import csr_matrix, coo_matrix, hstack, vstack
from typing import List

# Assuming W_k_final_matrices is the list of K final, maximized CSR matrices.
# We also assume that NUM_NODES is the correct dimension (e.g., 14 or 15).

def create_co_occurrence_matrix(W_k_matrices: List[csr_matrix], N: int) -> csr_matrix:
    """
    Build the binary, symmetric co-occurrence graph A_co-occur.

    An entry (i, j) is 1 when the summed relation matrices hold a positive
    value at (i, j) or at (j, i).

    Args:
        W_k_matrices: The per-relation matrices W_1 .. W_K.
        N: Dimension used for the empty-input fallback and for the log line.
           With a non-empty list the result takes the shape of the inputs.

    Returns:
        The co-occurrence matrix with int8 entries.
    """
    if not W_k_matrices:
        print("Warning: Input matrix list is empty. Returning empty N x N matrix.")
        return csr_matrix((N, N), dtype=np.int8)

    print(f"Starting creation of A_co-occur (N={N})...")

    # Union of all relations: summing keeps every position that is non-zero
    # in at least one W_k. Copy the first matrix so the input is not modified.
    first_matrix, *remaining = W_k_matrices
    edge_union = first_matrix.copy()
    for relation_matrix in remaining:
        edge_union = edge_union + relation_matrix

    # Collapse the summed weights to 0/1.
    directed_edges = edge_union.sign()

    # Add the transpose to make the graph undirected, then binarize again
    # because a pair present in both directions sums to 2.
    undirected_edges = (directed_edges + directed_edges.transpose()).sign().astype(np.int8)

    print(f"‚úÖ A_co-occur created. Entries={undirected_edges.nnz}, Symmetrization successful.")
    return undirected_edges

# --- Build A_co_occur from the aggregated relation matrices ---

# The aggregated matrices are MATRIX_DIM x MATRIX_DIM (NUM_NODES + 1, with
# index 0 reserved), so MATRIX_DIM is the dimension to pass. Passing NUM_NODES
# would give a wrongly sized empty fallback and a misleading log line.
A_co_occur = create_co_occurrence_matrix(Final_Aggregated_W, MATRIX_DIM)
print(f"Final A_co-occur Shape: {A_co_occur.shape}")

In [None]:
print(A_co_occur)

In [None]:
## Save the above A_co_occur

import os
from scipy.sparse import save_npz

# Ensure directory exists
os.makedirs("saved_stuff", exist_ok=True)

save_path = "saved_stuff/A_co_occur.npz"

# A_co_occur is your csr_matrix
save_npz(save_path, A_co_occur)

print(f"Saved A_co_occur sparse matrix to: {save_path}")


In [None]:
import numpy as np
from scipy.sparse import identity, csr_matrix

# Assuming A_co_occur is the final CSR matrix generated previously.
# If A_co_occur is not defined, this code assumes it has been loaded or created.

def calculate_A_hat(A_co_occur: csr_matrix) -> csr_matrix:
    """
    Return A_hat: the adjacency matrix with self-loops added.

    A_hat = sign(A_co_occur + I). No degree normalization is applied here;
    the result is a 0/1 matrix in int8.

    Args:
        A_co_occur: Square co-occurrence matrix.

    Returns:
        A_hat, same shape as the input.
    """
    n_rows = A_co_occur.shape[0]

    # Sparse identity with the input's dtype, so the sum stays in that dtype.
    self_loops = identity(n_rows, dtype=A_co_occur.dtype, format='csr')

    # A diagonal entry already set in the input becomes 2 after the addition;
    # sign() clips every positive value back to 1.
    return (A_co_occur + self_loops).sign().astype(np.int8)

# --- Example Usage (Assuming A_co_occur is available) ---
A_hat = calculate_A_hat(A_co_occur)

print(f"‚úÖ A_hat successfully created.")
print(f"A_hat Shape: {A_hat.shape}")
print(f"Non-Zero Entries (nnz): {A_hat.nnz}")

In [None]:
import numpy as np
from scipy.sparse import csr_matrix, diags
from typing import List

def calculate_D_hat(A_hat: csr_matrix) -> csr_matrix:
    """
    Build the diagonal degree matrix D_hat, whose i-th diagonal entry is the
    sum of row i of A_hat.

    Args:
        A_hat: Square sparse adjacency matrix (self-loops included).

    Returns:
        D_hat as a CSR matrix with the same shape as A_hat.
    """
    # Row sums. Depending on the sparse container the sum comes back as an
    # (N, 1) np.matrix or as a plain ndarray; np.asarray(...).ravel() turns
    # either into a 1-D array, whereas `.A1` only exists on np.matrix.
    degree_vector = np.asarray(A_hat.sum(axis=1)).ravel()

    # diags() places the 1-D vector on the main diagonal.
    D_hat = diags(degree_vector, format='csr')

    print("Degree Vector Shape (N):", degree_vector.shape)
    print(f"D_hat diagonal values (first 5): {degree_vector[:5].tolist()}")

    return D_hat

# --- Example Usage ---
D_hat = calculate_D_hat(A_hat)

print(f"‚úÖ D_hat matrix created.")
print(f"D_hat Shape: {D_hat.shape}")

In [None]:
pprint.pprint(N)

# We ideally also need the N0 node (which is just a 0 vector) -> Useful for calculations

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import Dict, List, Any

# 1. Add the placeholder node N0 to N (re-running this line is harmless)
N['N0'] = 'PLACEHOLDER_ZERO_INDEX'

# 2. Prepare Data
# Sort keys by their numeric suffix: N0, N1, N2, ..., N10, N11, ...
# int('0') == 0, so N0 needs no special case to land at the front.
sorted_node_keys = sorted(N.keys(), key=lambda node_key: int(node_key[1:]))
node_names = [N[key] for key in sorted_node_keys]

# The range in the label comes from the data instead of being hard-coded.
print(f"Total nodes in N dictionary ({sorted_node_keys[0]} to {sorted_node_keys[-1]}): {len(N)}")

# 3. Load Pretrained Model
model = SentenceTransformer('all-MiniLM-L6-v2')

print(f"Loaded embedding model. Vector dimension: {model.get_sentence_embedding_dimension()}")
print("Starting node name embedding...")

# 4. Generate Embeddings (v_node)
# One row per entry of node_names, so N0's vector is row 0.
node_embeddings = model.encode(node_names, show_progress_bar=True, convert_to_numpy=True)

# --- Final Step: Overwrite N0 Embedding with Zero Vector ---
# N0 sorts first, so its row is index 0.
node_embeddings[0, :] = 0.0

print(f"\n‚úÖ N0 ('{N['N0']}') embedding vector successfully set to zero.")

# 5. Store and Verify
print("\n--- Node Embedding Results ---")
print(f"Total number of nodes (N): {len(node_names)}")
print(f"Embedding Matrix Shape: {node_embeddings.shape}")

NodeEmbeddingMap: Dict[str, np.ndarray] = {
    key: node_embeddings[i] for i, key in enumerate(sorted_node_keys)
}

# Verification print
print(f"Verification: N0 vector sum is {np.sum(NodeEmbeddingMap['N0']):.4f}")
if len(sorted_node_keys) > 1:
    # Show the first real node. Label and vector use the same key, so they
    # cannot disagree, and nothing fails if there is no node called 'N1'.
    first_real_key = sorted_node_keys[1]
    print(f"Example Embedding ({first_real_key}: {N[first_real_key]}):")
    print(NodeEmbeddingMap[first_real_key][:5])

In [None]:
# üö® FIX: Custom printing loop to show the dictionary keys in numerical order
print("\n--- NodeEmbeddingMap (Displayed in Correct Numerical Order) ---")

# We reuse the numerically sorted keys list to print the map
for key in sorted_node_keys:
    # Print the key, and a snippet of the vector
    vector_snippet = NodeEmbeddingMap[key][:3]
    print(f"'{key}': array({vector_snippet.tolist()}, ...),")

In [None]:
import numpy as np
from typing import Dict, List, Any

# Assuming NodeEmbeddingMap is defined and populated with your 15 node vectors.

# --- Helper Function for Numerical Sorting ---
def get_node_index(key: str) -> int:
    """Return the integer after the first character of a node key ('N10' -> 10); 'N0' maps to 0."""
    # int() makes the ordering numeric, so 'N10' sorts after 'N2'.
    return 0 if key == 'N0' else int(key[1:])

# 1. Prepare Keys in Numerical Order
# This ensures N0 is row 0, N1 is row 1, N14 is row 14, aligning with matrix indices.
sorted_keys_numeric = sorted(NodeEmbeddingMap.keys(), key=get_node_index)

# 2. Construct the Matrix X by Vertical Stacking
# We retrieve the vector corresponding to each sorted key and stack them vertically.
X = np.vstack([NodeEmbeddingMap[key] for key in sorted_keys_numeric])

# 3. Verification
print("--- Final Node Feature Matrix (X) ---")
print(f"‚úÖ Matrix X successfully created.")
print(f"Shape N x d_in: {X.shape}") 
# Expected shape: (15, 384)
print(f"Data Type: {X.dtype}") 

# Verification of the N0 row (should be all zeros):
# print(f"Row 0 (N0 Placeholder) sum: {np.sum(X[0]):.4f}")

Pre-computation and GAE Model Definition:

This section defines the helper function to calculate the normalized adjacency matrix $\hat{D}^{-\frac{1}{2}}\hat{A}\hat{D}^{-\frac{1}{2}}$ and the PyTorch module for the GAE.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GAE
from torch_geometric.data import Data
from torch_geometric.utils import from_scipy_sparse_matrix
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler

# --- Hyperparameters and Dimensions ---
N_NODES = 15
D_IN = 384
D_HIDDEN = 128
D_LATENT = 64
LEARNING_RATE = 0.005
WEIGHT_DECAY = 5e-4
NUM_EPOCHS = 500   # 500 is enough to see behaviour


# --- 1. Model Definitions ---

class GCNEncoder(nn.Module):
    """
    Encoder for the GAE: GCNConv -> ReLU -> GCNConv.

    Both layers are built with cached=True.
    NOTE(review): presumably this makes each layer reuse its normalized
    adjacency across calls, which assumes the same edge_index every time.
    Confirm against the PyG GCNConv documentation.
    """
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels, cached=True)
        self.conv2 = GCNConv(hidden_channels, out_channels, cached=True)

        # Explicit re-initialization of both layers after construction.
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialize the parameters of conv1, then conv2."""
        for layer in (self.conv1, self.conv2):
            layer.reset_parameters()

    def forward(self, x, edge_index):
        """Map node features x to latent embeddings using edge_index."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)


# --- 2. Data Preparation (WITH FEATURE SCALING) ---

def prepare_pyg_data(X_np: np.ndarray, A_cooccur_csr: csr_matrix):
    """
    Convert NumPy/SciPy inputs into a PyTorch Geometric Data object.

    Node features are standardized per column with StandardScaler. The
    adjacency is symmetrized and its diagonal is dropped before being turned
    into edge_index, so the graph handed to the model has no self-loops.

    Args:
        X_np: Node feature matrix, one row per node.
        A_cooccur_csr: Square sparse adjacency matrix over the same nodes.

    Returns:
        Data(x=<scaled float features>, edge_index=<undirected edges>).
    """

    # Scale node features
    scaler = StandardScaler()
    X_np_scaled = scaler.fit_transform(X_np)

    # ---- IMPORTANT: Make adjacency symmetric & remove self loops ----
    A = A_cooccur_csr
    # Symmetrize: A_undirected = A OR A^T
    A_sym = ((A + A.T) > 0).astype(np.float32).tocoo()

    # Actually remove the self-loops that the header above promises: keep only
    # off-diagonal entries. GCNConv adds its own self-loops, and i -> i pairs
    # should not be used as positive edges in the reconstruction loss.
    off_diagonal = A_sym.row != A_sym.col
    A_no_loops = csr_matrix(
        (A_sym.data[off_diagonal],
         (A_sym.row[off_diagonal], A_sym.col[off_diagonal])),
        shape=A_sym.shape
    )

    # Use PyG helper to create edge_index (the edge weights are not needed)
    edge_index, _ = from_scipy_sparse_matrix(A_no_loops)

    x = torch.from_numpy(X_np_scaled).float()
    data = Data(x=x, edge_index=edge_index)

    return data


# --- 3. Training Function ---

def train_pyg_gae(data: Data, epochs: int):
    """
    Train a GAE with a GCNEncoder on `data` and return the node embeddings.

    The encoder's input width is read from data.x, so the function works for
    any feature dimension. Hidden and latent widths, learning rate and weight
    decay come from the module-level D_HIDDEN, D_LATENT, LEARNING_RATE and
    WEIGHT_DECAY.

    Args:
        data: PyG Data object providing x (node features) and edge_index.
        epochs: Number of full-graph optimization steps.

    Returns:
        Tensor z of latent embeddings, computed in eval mode without gradients.
    """
    encoder = GCNEncoder(
        # Take the width from the data rather than the D_IN constant, so a
        # different embedding model does not cause a shape mismatch.
        in_channels=data.x.size(1),
        hidden_channels=D_HIDDEN,
        out_channels=D_LATENT
    )
    model = GAE(encoder)

    optimizer = optim.Adam(model.parameters(),
                           lr=LEARNING_RATE,
                           weight_decay=WEIGHT_DECAY)

    print(f"Training GAE for {epochs} epochs (lr={LEARNING_RATE}, decay={WEIGHT_DECAY})...")

    for epoch in range(1, epochs + 1):
        model.train()
        optimizer.zero_grad()

        z = model.encode(data.x, data.edge_index)

        # recon_loss expects a positive edge_index of the (unweighted) graph
        loss = model.recon_loss(z, data.edge_index)

        loss.backward()
        optimizer.step()

        # Log the first epoch and every 50th one.
        if epoch % 50 == 0 or epoch == 1:
            print(f"Epoch {epoch:03d}, Loss: {loss.item():.4f}")

    # Return final embeddings
    model.eval()
    with torch.no_grad():
        z = model.encode(data.x, data.edge_index)

    return z


# --- 4. Execution Block ---

# Train on the real document graph built in the cells above, not on random
# data: X holds one feature row per node (row 0 is the N0 placeholder) and
# A_co_occur is the symmetric co-occurrence adjacency over the same indices.
# The candidate search that follows reads Final_Embeddings_Z row i as node i,
# so embeddings of a random graph would make those candidates meaningless.
assert X.shape[0] == A_co_occur.shape[0], (
    f"Feature rows ({X.shape[0]}) and adjacency size ({A_co_occur.shape[0]}) must match"
)

# Fix the seed so weight initialization (and anything else drawing from the
# torch RNG) is repeatable across Restart & Run All.
torch.manual_seed(42)

pyg_data = prepare_pyg_data(X, A_co_occur)

Final_Embeddings_Z = train_pyg_gae(pyg_data, epochs=NUM_EPOCHS)

print("-" * 40)
print("‚úÖ PyG GAE Training Successfully Executed.")
print(f"Final Latent Embedding Matrix Z Shape: {Final_Embeddings_Z.shape}")


In [None]:
Z = Final_Embeddings_Z

In [None]:
## Saving the above

import os
import torch

# Make sure directory exists
os.makedirs("saved_stuff", exist_ok=True)

save_path = "saved_stuff/Z.pt"

# Z = Final_Embeddings_Z  # Ensure Z exists
torch.save(Z, save_path)

print(f"Saved GAE embeddings Z to: {save_path}")


In [None]:
import numpy as np
import faiss
from typing import List, Tuple, Set, Dict, Any
# Assuming Final_Embeddings_Z is defined from the successful PyG training run.

# --- Define Hyperparameters ---
SCALING_RATIO = 0.30  # fraction of the node count used as the neighbour budget k'

# --- Execution of ANN and Candidate Generation ---

# 1. Prepare Data and Dimensions
Z_np = Final_Embeddings_Z.cpu().numpy().astype('float32')
N_NODES_ACTUAL = Z_np.shape[0]  # number of rows in Z (row 0 is the N0 placeholder)
D_LATENT = Z_np.shape[1]

# 2. Calculate Dynamic K_PRIME: at least 1, at most N - 1
K_PRIME_SCALED = int(N_NODES_ACTUAL * SCALING_RATIO)
K_PRIME = max(1, min(K_PRIME_SCALED, N_NODES_ACTUAL - 1))

# 3. Build the index (FAISS flat L2 index over all rows of Z)
print(f"Total Nodes (N): {N_NODES_ACTUAL}, K': {K_PRIME}")
print("Building FAISS Index...")
index = faiss.IndexFlatL2(D_LATENT)
index.add(Z_np)

# 4. Query the Index
# Ask for K' + 1 hits per node because the node itself is among its own hits.
# D holds squared L2 distances (the report below takes the square root).
D, I = index.search(Z_np, K_PRIME + 1)

# 5. Generate Candidate Set C1
C1: Set[Tuple[int, int]] = set()
raw_candidate_list: List[Dict[str, Any]] = []

print("Populating Candidate Set C1, excluding N0 (index 0)...")

# Start at 1: row 0 is the placeholder and never acts as a source.
for i in range(1, N_NODES_ACTUAL):
    slots_used = 0

    # .tolist() yields plain Python ints/floats, so the candidates are
    # JSON-serializable and print cleanly.
    for j, squared_distance in zip(I[i].tolist(), D[i].tolist()):
        # Skip the node itself wherever it appears in the hit list; with tied
        # distances it is not guaranteed to be the first hit.
        if j == i:
            continue

        # Each source gets at most K' neighbour slots.
        if slots_used == K_PRIME:
            break
        slots_used += 1

        # j == 0 is the placeholder node; j == -1 is FAISS's "no result" id.
        if j <= 0:
            continue

        pair = (i, j)
        if pair in C1:
            continue

        C1.add(pair)
        raw_candidate_list.append({
            "source_idx": i,
            "target_idx": j,
            "distance": squared_distance
        })

# --- Final Verification and Output ---

print("\n--- Candidate Set C1 Results ---")
print(f"Total Unique Nodes (N): {N_NODES_ACTUAL}")
print(f"k' (Neighbors Searched): {K_PRIME}")
print(f"Size of Candidate Set C1 (|C1|): {len(C1)} pairs")

# Display the found candidates
print("\nSnippet of Candidates:")
for candidate in raw_candidate_list:
    # Clamp at 0 before the square root so a tiny negative rounding error in
    # the squared distance cannot produce NaN.
    distance_score = np.sqrt(max(candidate['distance'], 0.0)).item()
    print(f"  {candidate['source_idx']} -> {candidate['target_idx']} | Dist: {distance_score:.4f}")

In [None]:
pprint.pprint(raw_candidate_list)

In [None]:
# K1: the (source, target) index pair of every candidate, in candidate order.
K1: List[Tuple[int, int]] = [
    (candidate['source_idx'], candidate['target_idx'])
    for candidate in raw_candidate_list
]


In [None]:
pprint.pprint(K1)

In [None]:
## Save K1 to the saved stuff directory

import json
import os

# Ensure the directory exists
os.makedirs("saved_stuff", exist_ok=True)

# Convert np.int64 ‚Üí int for JSON serialization
def convert_k1_to_jsonable(K1):
    """Return the pairs of K1 with both members cast to built-in int, so json can encode them."""
    jsonable_pairs = []
    for source, target in K1:
        jsonable_pairs.append((int(source), int(target)))
    return jsonable_pairs

jsonable_K1 = convert_k1_to_jsonable(K1)

save_path = "saved_stuff/k1.json"

with open(save_path, "w") as f:
    json.dump(jsonable_K1, f, indent=2)

print(f"Saved K1 to {save_path}")


### 2. Semantic Filter 

Here our basic goal is to filter our K1 "structurally plausible set" down to a small K2 "semantically plausible set" 

In [None]:
pprint.pprint(sentences_list)

Our interesting RAG-HyDe-RAV pipeline is below

In [None]:
import os
import json
import time
import numpy as np
import faiss
import torch
from typing import List, Tuple, Set, Dict, Any, Optional



# External Libraries required: google-genai, sentence-transformers, faiss-cpu, numpy, scikit-learn
from google.genai import Client, types
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from pydantic import BaseModel
from typing import Literal


# --- HYPERPARAMETERS ---
# All values below are read as module-level globals by the pipeline functions in this cell.
K_HYPOTHETICAL = 3      # Number of causal hypotheses requested (and kept at most) per pair
R_STOCHASTIC_SAMPLES = 5 # Number of YES/NO verification samples drawn per hypothesis
K_RAG = 3               # Top-k sentences retrieved for verification; also the MMR output size and minimum pool size
K_BASE = 5              # Base pool size for RAG-MMR: k_pool = K_BASE + (1 - p_support) * K_EXPANSION
K_EXPANSION = 5         # Extra pool slots added as support drops (see K_BASE formula)
LAMBDA_MMR = 0.5        # MMR trade-off: score = LAMBDA * relevance - (1 - LAMBDA) * redundancy
KAPPA_RRF = 60          # RRF constant: each list contributes 1 / (KAPPA_RRF + rank), rank starting at 1
TAU_SUPPORT = 0.80      # A hypothesis is kept only if p_support is strictly greater than this
TAU_ENTROPY = 0.15      # ...and its uncertainty proxy is strictly less than this
# NOTE: p_support is yes_count / R_STOCHASTIC_SAMPLES, so with 5 samples it moves in steps of 0.2.
# With the two thresholds above, only a unanimous YES (p_support = 1.0, proxy = 0.0) passes.
DELAY_SECONDS = 1        # Delay between API calls for stability (Can be 15s for free tier)
LLM_MODEL = 'gemini-2.5-flash'  # Model name passed to every generate_content call in this cell


# --- ASSUMED INPUTS (Must be defined in the execution environment) ---
# N (Dict[str, str]): Node ID to Node Name map (e.g., {'N1': '$500M', ...})
# C1 (Set[Tuple[int, int]]): The candidate set of 0-indexed plausible pairs (i, j)
# sentences_list (List[str]): The source document broken into sentences
# Z_np (np.ndarray): The N x dz latent embedding matrix from GAE/ANN
# node_to_idx (Dict[str, int]): Map from N1 -> 1, N2 -> 2, etc. (for coordinate lookup)

# ---------------------------------------------------------------------

# Initialize Client and Embedder
# The API key is read from the environment; a missing GEMINI_API_KEY raises KeyError here.
# NOTE(review): presumably the key is loaded from a .env file by an earlier cell — confirm.
client = Client(api_key=os.environ["GEMINI_API_KEY"])
# Shared sentence encoder used for both the document sentences and the generated hypotheses.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# ====================================================================
# PHASE 1: PREPARATION & INDEXING
# ====================================================================


def setup_document_index(sentences: List[str]) -> Tuple[faiss.IndexFlatL2, np.ndarray]:
    """
    Embed the document sentences and build an exact (flat) L2 FAISS index on them.

    VD = Document Vector Store, idx_doc = FAISS index built on VD.

    Args:
        sentences: The source document split into sentences. Row ``k`` of VD and
            id ``k`` in the index both correspond to ``sentences[k]``.

    Returns:
        ``(idx_doc, VD)`` where VD is a C-contiguous float32 matrix with one row
        per sentence and idx_doc is a ``faiss.IndexFlatL2`` holding those rows.

    Raises:
        ValueError: If ``sentences`` is empty (there is nothing to index and the
            embedding dimension could not be determined).
    """
    if len(sentences) == 0:
        raise ValueError("setup_document_index: `sentences` is empty; nothing to index.")

    print("1. Embedding document sentences (VD)...")
    VD = embedder.encode(sentences, convert_to_numpy=True)
    # FAISS expects a C-contiguous float32 matrix; this is a no-op when the
    # encoder already returns one.
    VD = np.ascontiguousarray(VD, dtype=np.float32)

    # Build ANN Index (idx_doc)
    d_embed = VD.shape[1]
    idx_doc = faiss.IndexFlatL2(d_embed)
    idx_doc.add(VD)
    print(f"2. FAISS Index built on M={len(sentences)} sentences.")

    return idx_doc, VD


# Pre-run the setup (Requires sentences_list to be defined)
# idx_doc, VD = setup_document_index(sentences_list)

# ====================================================================
# PHASE 2: LLM CALLS AND UNCERTAINTY ESTIMATION
# ====================================================================

def llm_generate_hypotheses(node_i_name: str, node_j_name: str) -> List[str]:
    """f_hypothetical: Generates k_hypothetical causal claims.

    Asks the LLM for ``K_HYPOTHETICAL`` short sentences describing a plausible
    causal link from ``node_i_name`` (cause) to ``node_j_name`` (effect), then
    keeps the lines containing the phrase 'It is plausible that'.

    Args:
        node_i_name: Display name of the candidate cause node.
        node_j_name: Display name of the candidate effect node.

    Returns:
        At most ``K_HYPOTHETICAL`` hypothesis sentences, each starting at the
        marker phrase (any list numbering or bullet in front of it is dropped).
        An empty list when the model returns no usable text.
    """
    marker = "It is plausible that"

    prompt = f"""
        Generate {K_HYPOTHETICAL} short, hypothetical sentences describing a plausible causal connection
        between '{node_i_name}' (Cause) and '{node_j_name}' (Effect). Start each sentence with 'It is plausible that'.
    """

    response = client.models.generate_content(
        model=LLM_MODEL,
        contents=prompt,
        config=types.GenerateContentConfig(
            temperature=0.8
        )
    )

    # `response.text` may be None when the reply carries no text part, so fall
    # back to an empty string instead of calling .strip() on None.
    raw_text = getattr(response, "text", None) or ""

    if not raw_text.strip():
        print("[llm_generate_hypotheses] Empty LLM response; returning no hypotheses.")
        return []

    # Simple heuristic parse: one hypothesis per line that contains the marker.
    # Slicing from the marker removes prefixes such as "1. " or "- " so they do
    # not leak into the embedded/verified claim text.
    hypotheses: List[str] = []
    for line in raw_text.splitlines():
        start = line.find(marker)
        if start == -1:
            continue
        hypotheses.append(line[start:].strip())

    return hypotheses[:K_HYPOTHETICAL]


class VerifySupport(BaseModel):
    """Schema for claim verification result.

    Passed as ``response_schema`` in ``llm_verify_claim`` so that the model's
    reply is requested as structured output of this shape.
    """
    # Verdict on whether the evidence supports the claim; only these two literals are valid.
    support: Literal["YES", "NO"]


def llm_verify_claim(claim: str, evidence: str, temperature: float) -> Tuple[float, str]:
    """f_verify: Verifies factual support and returns confidence + the model's primary choice.

    Sends the claim and evidence to the LLM, requesting a JSON reply that
    follows the ``VerifySupport`` schema, and reads the YES/NO verdict from it.

    The verdict is looked up, in order, in:
      1. ``response.parsed`` when it is a ``VerifySupport`` instance,
      2. ``response.parsed`` when it is a plain dict,
      3. ``response.text`` decoded as JSON.

    Args:
        claim: The hypothesis sentence to check.
        evidence: The retrieved text the verdict must be based on.
        temperature: Sampling temperature for this call.

    Returns:
        ``(p_support, choice)`` where ``choice`` is "YES" or "NO" and
        ``p_support`` is 1.0 for "YES" and 0.0 for "NO". Any reply that cannot
        be parsed into a valid verdict is treated as "NO" (a message is printed).
        Errors raised by the API call itself are not caught here.
    """

    prompt_verify = f"""
        Claim: '{claim}'.
        Evidence: '{evidence}'.
        Based ONLY on the Evidence, does the Evidence strongly support the Claim?
    """

    # Let Gemini enforce the schema for us
    response = client.models.generate_content(
        model=LLM_MODEL,
        contents=prompt_verify,
        config=types.GenerateContentConfig(
            temperature=temperature,
            response_mime_type="application/json",
            response_schema=VerifySupport,  # <- Pydantic model, not raw dict
        ),
    )

    choice = "NO"

    try:
        parsed = getattr(response, "parsed", None)
        support_val = None

        if isinstance(parsed, VerifySupport):
            # A) Best case: parsed is already a VerifySupport instance
            support_val = parsed.support
        elif isinstance(parsed, dict):
            # B) Structured output surfaced as a plain mapping
            support_val = parsed.get("support")
        else:
            # C) Nothing parsed: decode the raw JSON text ourselves
            raw_text = getattr(response, "text", None)
            if raw_text:
                payload = json.loads(raw_text)
                if isinstance(payload, dict):
                    support_val = payload.get("support")

        # Final sanity check
        if support_val not in ("YES", "NO"):
            raise ValueError("Could not find valid 'support' field in structured response")

        choice = support_val

    except Exception as e:
        # Deliberate best-effort: an unreadable reply counts as "not supported".
        print(f"[llm_verify_claim] Failed to parse structured response, defaulting to NO: {e}")
        choice = "NO"

    p_support = 1.0 if choice == "YES" else 0.0
    return p_support, choice


def estimate_semantic_entropy(claim: str, evidence: str) -> Tuple[float, float]:
    """Estimate support and an uncertainty proxy from repeated YES/NO verifications.

    Runs ``llm_verify_claim`` ``R_STOCHASTIC_SAMPLES`` times at temperature 0.7,
    sleeping ``DELAY_SECONDS`` after every call.

    Returns:
        ``(p_support_mean, h_semantic_proxy)`` where

        * ``p_support_mean`` is the fraction of samples that answered "YES";
        * ``h_semantic_proxy`` is ``1 - (share of the majority answer)``: 0.0
          when all samples agree, rising to 0.5 for an even split. It stands in
          for semantic entropy; it is not an entropy in the information-theoretic
          sense.
    """
    sampling_temperature = 0.7

    # One boolean per stochastic pass: True when the verdict was "YES".
    yes_votes = []
    for _ in range(R_STOCHASTIC_SAMPLES):
        _, verdict = llm_verify_claim(claim, evidence, temperature=sampling_temperature)
        yes_votes.append(verdict == "YES")
        time.sleep(DELAY_SECONDS)  # Respect rate limits

    support_rate = sum(yes_votes) / R_STOCHASTIC_SAMPLES

    # Share of the majority answer, whichever of YES/NO it is.
    majority_share = max(support_rate, 1.0 - support_rate)

    return support_rate, 1.0 - majority_share


# ====================================================================
# PHASE 3: RAG-MMR AND RANK FUSION (RRF) LOGIC
# ====================================================================

def mmr_rerank(query_vec, candidates_vectors, k_rag: int, lambda_mmr: Optional[float] = None) -> List[int]:
    """Reranks candidate indices using Maximal Marginal Relevance (MMR).

    At every step the not-yet-selected candidate maximising

        lambda * cos(candidate, query) - (1 - lambda) * max cos(candidate, selected)

    is picked (the second term is 0 while nothing has been selected). Ties go to
    the lowest index.

    Cosine similarities are computed directly with NumPy, so the function does
    not depend on whichever object the notebook-global name ``cosine_similarity``
    is bound to, and it accepts the query as either a ``(d,)`` or ``(1, d)`` array.

    Args:
        query_vec: Query embedding; any array-like that flattens to ``d`` values.
        candidates_vectors: Candidate embeddings, shape ``(n, d)`` (a single
            ``(d,)`` vector is treated as one candidate).
        k_rag: Maximum number of candidates to select.
        lambda_mmr: Relevance/diversity trade-off in [0, 1]. Defaults to the
            module-level ``LAMBDA_MMR`` when omitted.

    Returns:
        Positions (into ``candidates_vectors``) of the selected candidates, in
        selection order. Empty when there are no candidates or ``k_rag <= 0``.

    Raises:
        ValueError: If the query and candidate dimensionalities differ.
    """
    trade_off = LAMBDA_MMR if lambda_mmr is None else lambda_mmr

    pool = np.asarray(candidates_vectors, dtype=np.float64)
    if pool.size == 0 or k_rag <= 0:
        return []
    if pool.ndim == 1:
        pool = pool.reshape(1, -1)
    else:
        # Also flattens inputs such as a list of (1, d) row vectors.
        pool = pool.reshape(pool.shape[0], -1)

    query = np.asarray(query_vec, dtype=np.float64).reshape(-1)
    if query.shape[0] != pool.shape[1]:
        raise ValueError(
            f"mmr_rerank: query has dimension {query.shape[0]} but candidates have {pool.shape[1]}"
        )

    def _unit_rows(matrix: np.ndarray) -> np.ndarray:
        """L2-normalise each row; all-zero rows are left as zeros."""
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0
        return matrix / norms

    pool_unit = _unit_rows(pool)
    query_unit = _unit_rows(query.reshape(1, -1))[0]

    # Computed once up front instead of once per (step, candidate, selected) triple.
    relevance = pool_unit @ query_unit          # cos(candidate, query), shape (n,)
    pairwise = pool_unit @ pool_unit.T          # cos(candidate, candidate), shape (n, n)

    n_candidates = pool.shape[0]
    selected_indices: List[int] = []

    for _ in range(min(k_rag, n_candidates)):
        if selected_indices:
            # Redundancy: highest similarity to anything already selected.
            redundancy = pairwise[:, selected_indices].max(axis=1)
        else:
            redundancy = np.zeros(n_candidates)

        # MMR Formula: lambda * Relevance - (1 - lambda) * Redundancy
        mmr_scores = trade_off * relevance - (1.0 - trade_off) * redundancy
        if selected_indices:
            mmr_scores[selected_indices] = -np.inf  # never pick the same candidate twice

        selected_indices.append(int(np.argmax(mmr_scores)))

    # Return the indices of the selected candidates in the pool
    return selected_indices


def reciprocal_rank_fusion(all_ranked_lists: List[List[int]], k_final: int) -> List[int]:
    """Fuse several ranked lists into one consensus ranking via Reciprocal Rank Fusion.

    Every list adds ``1 / (KAPPA_RRF + rank)`` to the score of each document it
    contains, with ``rank`` counted from 1. Documents are then ordered by total
    score, highest first; equal scores keep first-seen order.

    Args:
        all_ranked_lists: Ranked lists of document indices, best first.
        k_final: Maximum number of document indices to return.

    Returns:
        The ``k_final`` highest-scoring document indices.
    """
    fused_scores = {}
    for ranking in all_ranked_lists:
        position = 0
        for doc_index in ranking:
            position += 1
            contribution = 1.0 / (KAPPA_RRF + position)
            fused_scores[doc_index] = fused_scores.get(doc_index, 0.0) + contribution

    # Stable sort on the accumulated score, best first.
    consensus = sorted(fused_scores, key=fused_scores.get, reverse=True)

    return consensus[:k_final]


# ====================================================================
# PHASE 4: MAIN PIPELINE EXECUTION
# ====================================================================

def run_causal_verification_pipeline(candidate_set: Set[Tuple[int, int]], N_map: Dict[str, str], sentences: List[str]) -> Dict[Tuple[int, int], Dict[str, Any]]:

    """
    Executes the full hypothesis verification and RAG-MMR pipeline for all candidate pairs.

    For every pair ``(i, j)`` in ``candidate_set``:
      1. generate causal hypotheses with the LLM;
      2. for each hypothesis, retrieve the nearest document sentences and
         estimate support / uncertainty over repeated verification calls;
      3. for hypotheses passing both thresholds, retrieve a larger pool and
         rerank it with MMR;
      4. fuse the per-hypothesis rankings with RRF into the final evidence.

    Args:
        candidate_set: Node-index pairs to examine. Pairs containing index 0
            are skipped.
        N_map: Node id (e.g. 'N3') to node display name.
        sentences: Source document split into sentences.

    Returns:
        A dict keyed by ``(i, j)`` with, per pair, ``"verified_causal_hypothesis"``
        (list of hypotheses that passed both thresholds) and ``"evidence_text"``
        (fused evidence sentences joined by newlines, or a placeholder message).
        Pairs that were skipped (index 0, or no hypotheses generated) have no entry.
    """

    # --- Setup ---
    # Convert node indices (0-14) back to N-map keys for naming purposes
    # NOTE(review): positions come from enumerating the sorted keys, so index k maps
    # to 'N<k>' only if the keys are exactly N0, N1, ... with no gaps — confirm.
    idx_to_node_name = {idx: name for idx, name in enumerate(sorted(N_map.keys(), key=lambda x: int(x[1:]) if x != 'N0' else 0))}

    # 1. Prepare Document Index
    idx_doc, VD = setup_document_index(sentences)
    # FAISS pads its result with -1 ids when asked for more neighbours than the
    # index holds; used as Python indices those would silently select the LAST
    # sentence/vector. Every search below is therefore capped at the index size
    # and negative ids are discarded.
    n_sentences = len(sentences)
    final_top_k = 3  # number of evidence sentences kept after rank fusion
    final_output_map: Dict[Tuple[int, int], Dict[str, Any]] = {}

    # --- Main Loop: Iterate through each plausible link (i, j) ---
    for i, j in candidate_set:
        # Skip pairs involving the N0 placeholder (index 0)
        if i == 0 or j == 0:
            continue

        node_i_name = N_map.get(idx_to_node_name.get(i, f'N{i}'), f'Node {i}')
        node_j_name = N_map.get(idx_to_node_name.get(j, f'N{j}'), f'Node {j}')

        print(f"\n--- Processing Pair: {node_i_name} ({i}) -> {node_j_name} ({j}) ---")

        # H_verified storage for this pair
        H_verified_list: List[str] = []
        all_ranked_lists_for_pair: List[List[int]] = []

        # --- 1. Hypothesis Generation ---
        hypotheses = llm_generate_hypotheses(node_i_name, node_j_name)
        if not hypotheses:
            print("Skipped: No hypotheses generated.")
            continue

        # --- 2. Verification and Dual Filtering ---
        for hl in hypotheses:
            # 2a. Embed hypothesis and retrieve evidence
            v_hl = embedder.encode([hl])[0].reshape(1, -1)
            _, I_rag = idx_doc.search(v_hl, min(K_RAG, n_sentences))
            verification_snippets = " ".join([sentences[idx] for idx in I_rag[0] if idx >= 0])

            # 2b. Estimate support and entropy (via R stochastic passes)
            p_support, h_semantic = estimate_semantic_entropy(hl, verification_snippets)

            # 2c. Dual Filtering: Check thresholds
            # Tracked per hypothesis (not via list membership) so that a rejected
            # duplicate of an earlier accepted hypothesis is not reported as KEPT.
            is_verified = p_support > TAU_SUPPORT and h_semantic < TAU_ENTROPY
            if is_verified:
                H_verified_list.append(hl)

                # --- 3. Adaptive Pooling and MMR (For verified hypotheses only) ---
                score_h = p_support # Use p_support as the score for adaptive pooling

                # Adaptive Pool Size: kpool = kbase + (1 - score) * kexpansion
                k_pool = int(K_BASE + (1.0 - score_h) * K_EXPANSION)
                k_pool = max(K_RAG, k_pool) # Ensure pool is at least k_RAG
                k_pool = min(k_pool, n_sentences) # ...but never larger than the index

                # Retrieve large pool
                _, I_pool = idx_doc.search(v_hl, k_pool)
                pool_ids = [int(idx) for idx in I_pool[0] if idx >= 0]
                pool_vectors = VD[pool_ids]

                # MMR Reranking
                final_indices_h = mmr_rerank(v_hl, pool_vectors, k_rag=K_RAG)

                # Store indices (mapped back to global sentence IDs)
                global_indices_h = [pool_ids[idx] for idx in final_indices_h]
                all_ranked_lists_for_pair.append(global_indices_h)

            print(f"  Claim Verified: {p_support:.2f}/{TAU_SUPPORT:.2f} (Support) | {h_semantic:.2f}/{TAU_ENTROPY:.2f} (Entropy) -> {'KEPT' if is_verified else 'REJECTED'}")

        # --- 4. Final Rank Fusion (RRF) ---
        if all_ranked_lists_for_pair:
            final_indices_total = reciprocal_rank_fusion(all_ranked_lists_for_pair, k_final=final_top_k) # Select top 3 snippets
            evidence_text = "\n".join([sentences[idx] for idx in final_indices_total])
        else:
            final_indices_total = []
            evidence_text = "No strong, stable evidence found."

        # --- 5. Final Output Format ---
        final_output_map[(i, j)] = {
            "verified_causal_hypothesis": H_verified_list,
            "evidence_text": evidence_text
        }

        time.sleep(DELAY_SECONDS) # Respect rate limits

    return final_output_map


In [None]:
import numpy as np
import torch
from typing import List, Dict, Tuple, Any, Set
# Assuming all function definitions (setup_document_index, run_causal_verification_pipeline, etc.) are available in the environment.

# --- Helper Function (Required for MMR Reranking) ---
# MMR calculation requires cosine similarity, which we define here.
from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity
# We use a wrapper function for consistency, as the previous logic relied on it
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity between two single vectors, returned as a Python float.

    Both inputs are flattened to one row each, so ``(d,)`` and ``(1, d)`` arrays
    as well as plain sequences of numbers are accepted.

    WARNING: defining this function rebinds the notebook-global name
    ``cosine_similarity``, which an earlier cell imports from
    ``sklearn.metrics.pairwise`` (matrix in, matrix out). After this cell runs,
    any code written against the sklearn signature — e.g. indexing the result
    with ``[0][0]`` — receives this scalar-returning wrapper instead.

    Args:
        a: First vector (array-like).
        b: Second vector (array-like) with the same number of elements as ``a``.

    Returns:
        The cosine similarity of ``a`` and ``b`` as a built-in ``float``.
    """
    # np.asarray lets lists/tuples through; .reshape alone would fail on them.
    a_row = np.asarray(a).reshape(1, -1)
    b_row = np.asarray(b).reshape(1, -1)
    return float(sklearn_cosine_similarity(a_row, b_row)[0][0])


# 3. Candidate Set C1 (Simulated output from GAE/ANN process)
# Using a small, representative sample of plausible pairs (0-indexed)
# NOTE(review): this assignment rebinds the global C1, replacing the candidate set
# built by the earlier candidate-generation cell. Everything run after this cell
# sees these ten hard-coded pairs — confirm that is intended.
C1: Set[Tuple[int, int]] = {
    (4, 13), (13, 11), (12, 8), (12, 10), (10, 14), (3, 2), (12, 13), (6, 3), (2, 3), (1, 10)
}

# 4. Latent Embeddings (Simulated output from GAE)
# This would be the Final_Embeddings_Z tensor, converted to NumPy.
# D_LATENT = 64, N_NODES = 15.
# Uniform random values in [0, 1); no seed is set here, so the matrix differs on every run.
Z_np_sim = np.random.rand(15, 64).astype('float32')

In [None]:
# 1. SETUP: Embeddings and ANN Index
# Get the Document Vector Store (VD) and the FAISS Index (idx_doc)
# This needs to run once.
# NOTE: run_causal_verification_pipeline calls setup_document_index(sentences) itself,
# so the sentences are embedded a second time below; the idx_doc / VD created here are
# only notebook globals and are not passed into the pipeline.
idx_doc, VD = setup_document_index(sentences_list)


# 2. MAIN EXECUTION: Run the Full Verification Pipeline
# The pipeline integrates the LLM calls, RAG, Entropy, and RRF.
# Inputs come from notebook globals: C1 (candidate pairs), N (node id -> name map)
# and sentences_list (document sentences).
final_causal_map = run_causal_verification_pipeline(
    candidate_set=C1, 
    N_map=N, 
    sentences=sentences_list
)

# 3. FINAL OUTPUT
print("\n" + "="*50)
print("FINAL CAUSAL VERIFICATION RESULTS")
print("="*50)
# Use pprint for clean output of the final JSON-like structure
# NOTE(review): earlier cells already call pprint.pprint, so this import presumably
# duplicates one made further up the notebook — confirm.
import pprint
pprint.pprint(final_causal_map)

### Important notice
We are now done to a reasonable level. (I'm skipping the step that derives the K2 pairs from the K1 pairs for now; let's come back to that later.) For now, I'll save the important artifacts and move on to the CoCAD dp and the CausGT=HS model.

Hence, for now, the K1 plausible candidate pairs alone serve as my Eprior.