In [None]:
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage

import spacy

import pandas as pd
import json
import itertools
import re

import networkx as nx
from rapidfuzz import fuzz
from collections import defaultdict, deque

from sentence_transformers import SentenceTransformer, util
import torch

In [None]:
# Load the example record; later cells read its "question", "table" and "links" keys.
# The encoding is pinned because JSON text is UTF-8 and the platform default may differ.
with open("example_input.json", "r", encoding="utf-8") as f:
    example_input = json.load(f)

In [None]:
def table_to_df(data_dict):
    """Flatten a table dict into a DataFrame with a text and a link column per header.

    Args:
        data_dict: Mapping with a ``"header"`` list of two-item entries whose
            first item is the header name, and a ``"data"`` list of rows. Each
            row cell is read as ``cell[0]`` (text) and ``cell[1]`` (a sequence
            of links, possibly empty).

    Returns:
        pandas.DataFrame with, for every header, a ``<header>`` column holding
        the stripped cell text and a ``<header>_link`` column holding the first
        link of the cell (``None`` when the cell has no links). The columns are
        present even when the table has no rows.

    Raises:
        IndexError: If a row has fewer cells than there are headers.
    """
    header_names = [name for name, _ in data_dict["header"]]

    # Declare the schema explicitly so an empty table still has its columns.
    # dict.fromkeys drops repeated names, mirroring how a repeated header
    # overwrites the same record key below.
    columns = list(dict.fromkeys(
        col for name in header_names for col in (name, f"{name}_link")
    ))

    records = []
    for row in data_dict["data"]:
        record = {}
        for i, name in enumerate(header_names):
            cell_text, cell_links = row[i][0], row[i][1]
            record[name] = cell_text.strip()
            # Only the first link of a cell is kept.
            record[f"{name}_link"] = cell_links[0] if cell_links else None
        records.append(record)

    return pd.DataFrame(records, columns=columns)

# Convert the example table to a DataFrame and preview its first rows.
table_df = table_to_df(example_input["table"])
table_df.head()

In [None]:
def prompt_ent_extr_from_q(question, table_id, table_headers):
    """Build the entity-extraction prompt for a question.

    The prompt contains the task description, the expected output format, one
    worked example, and finally the actual inputs. Continuation lines of the
    prompt carry the four-space source indentation as part of the text.

    Args:
        question: Question text inserted after ``Question:``.
        table_id: Table name inserted after ``Table Name:``.
        table_headers: Header listing inserted after ``Table Headers:``.

    Returns:
        The formatted prompt string, ending with ``Output:``.
    """
    prompt = f"""Agent Introduction: You are an agent who is going to be assisting me in a question answering
    task. For this task, I need to first identify the named entities in the question.
    Task: Identify the named entities in the provided question. These entities will serve as key elements
    for extracting pertinent information from the available sources, which include table name and its
    headers.
    Output format:
    Entities: [‘<entity1>’, ‘<entity2>’, .....]
    Use the below example to better understand the task
    Input:
    Question: What was the nickname of the gold medal winner in the men’s heavyweight Greco-
    Roman wrestling event of the 1932 Summer Olympics?
    Table Name: Sweden at the 1932 Summer Olympics
    Table Headers: ["Medal", "Name", "Sport", "Event"]
    Output:
    Entities: [‘nickname’, ‘medal’, ‘gold’, ‘men’s heavyweight’,
    ‘Greco-Roman Wrestling event’, ‘1932 Summer Olympics’]
    Input:
    Question: {question}
    Table Name: {table_id}
    Table Headers: {table_headers}
    Output:"""
    return prompt

In [None]:
# Pull the prompt inputs out of the example record.
question = example_input["question"]
table_id = example_input["table"]["title"]
# Render the header names as a JSON list so they appear with double quotes,
# like the few-shot examples inside the prompts; str() of a list would render
# them with single quotes.
table_headers = json.dumps(
    [header[0] for header in example_input["table"]["header"]],
    ensure_ascii=False,
)

Попробовать запустить без рассуждений

In [None]:
# Chat model used for every prompt in this notebook: qwen3:8b via ChatOllama, temperature 0.
llm = ChatOllama(model="qwen3:8b", temperature=0)

In [None]:
def remove_think_blocks(text):
    """Remove every ``<think>...</think>`` span from ``text`` and trim the result.

    Matching is non-greedy and spans newlines, so each closed block is removed
    on its own. An opening tag without a closing tag is left untouched.

    Args:
        text: Raw model output.

    Returns:
        ``text`` without the think blocks, with leading and trailing
        whitespace stripped.
    """
    think_span = re.compile(r'<think>.*?</think>', re.DOTALL)
    without_reasoning = think_span.sub('', text)
    return without_reasoning.strip()

In [None]:
# Step 1: ask the LLM for the question entities; think blocks are stripped from the raw reply.
entities = remove_think_blocks(llm.invoke([HumanMessage(content=prompt_ent_extr_from_q(question, table_id, table_headers))]).content)

In [None]:
def prompt_relevant_header(question, table_id, table_headers, entities):
    """Build the prompt that asks which column headers are relevant to the question.

    The prompt contains the task description, the expected output format, one
    worked example, and finally the actual inputs. Continuation lines of the
    prompt carry the four-space source indentation as part of the text.

    Args:
        question: Question text inserted after ``Question:``.
        table_id: Table name inserted after ``Table Name:``.
        table_headers: Header listing inserted after ``Table Headers:``.
        entities: Entity listing inserted after ``Entities extracted from question:``.

    Returns:
        The formatted prompt string, ending with ``Output:``.
    """
    prompt = f"""Agent Introduction: You are an agent who is going to be assisting me in a question answering
    task. I have a table as a source of information. I have already extracted the relevant entities from the
    question. For this task, I need to first identify the column headers that are relevant in the question.
    Task: Identify the relevant column headers from the provided list, based on the extracted entities
    from the question. I will also provide the extracted entities from the question and name of the table.
    Output format:
    Relevant headers: [‘<header-1>’, ‘<header-2>’, ....]
    Use the below example to better understand the task
    Input:
    Question: What was the nickname of the gold medal winner in the men’s heavyweight Greco-
    Roman wrestling event of the 1932 Summer Olympics?
    Table Name: Sweden at the 1932 Summer Olympics
    Table Headers: ["Medal", "Name", "Sport", "Event"]
    Entities extracted from question: ["gold medal", "men’s heavyweight", "Greco-Roman Wrestling",
    "1932 Summer Olympics"]
    Output:
    Relevant headers: ["Medal", "Name", "Sport", "Event"]
    Input:
    Question: {question}
    Table Name: {table_id}
    Table Headers: {table_headers}
    Entities extracted from question: {entities}
    Output:"""
    return prompt

In [None]:
# Step 2: ask the LLM which table headers are relevant, given the extracted entities.
relevant_headers = remove_think_blocks(llm.invoke([HumanMessage(content=prompt_relevant_header(question, table_id, table_headers, entities))]).content)

In [None]:
def prompt_entity_header_mapping(question, table_id, entities, relevant_headers):
    """Build the prompt that asks the model to map question entities to headers.

    The prompt contains the task description, the expected output format
    (one ``"<entity>": [...]`` line per entity, with ``"Others"`` when no
    header matches), one worked example, and finally the actual inputs.
    Continuation lines of the prompt carry the four-space source indentation
    as part of the text.

    Args:
        question: Question text inserted after ``Question:``.
        table_id: Table name inserted after ``Table Name:``.
        entities: Entity listing inserted after ``Entities extracted from question:``.
        relevant_headers: Header listing inserted after ``Relevant Headers:``.

    Returns:
        The formatted prompt string, ending with ``Output:``.
    """
    prompt = f"""Agent Introduction: You are an agent who is going to be assisting me in a question answering
    task. I have a table as a source of information. I have already extracted relevant entities from the
    question and relevant column headers from the table.
    Task: Map the entities extracted from the question with the relevant headers and the table name.
    Output format:
    "<entity1>": ["<mapping1>", "<mapping2>"],
    "<entity2>": ["<mapping1>"]
    For each entity extracted from the question, there should be a corresponding <mapping> to an item
    in the ‘Relevant headers’ column. If none of the headers match the entity, the mapping should be
    labeled as "Others".
    Use the below example to better understand the task
    Input:
    Question: What was the nickname of the gold medal winner in the men’s heavyweight Greco-
    Roman wrestling event of the 1932 Summer Olympics?
    Table Name: Sweden at the 1932 Summer Olympics
    Entities extracted from question: ["gold medal", "men’s heavyweight", "Greco-Roman Wrestling",
    "1932 Summer Olympics"]
    Relevant headers: ["Medal", "Name", "Sport", "Event"]
    Output:
    "gold medal": ["Medal"],
    "men’s heavyweight": ["Event"],
    "Greco-Roman Wrestling": ["Sport"],
    "1932 Summer Olympics": ["Others"]
    Input:
    Question: {question}
    Table Name: {table_id}
    Entities extracted from question: {entities}
    Relevant Headers: {relevant_headers}
    Output:"""
    return prompt

In [None]:
# Step 3: ask the LLM to map each extracted entity to the relevant headers (or "Others").
entity_headers_match = remove_think_blocks(llm.invoke([HumanMessage(content=prompt_entity_header_mapping(question, table_id, entities, relevant_headers))]).content)

In [None]:
# Inspect the three raw LLM replies (still unparsed strings at this point).
entities, relevant_headers, entity_headers_match

In [None]:
# Keep only the text between the first "[" and the "]" that follows it.
# find() is used instead of index() so that a reply without brackets, or a
# second run of this cell after the brackets were already removed, leaves the
# string unchanged instead of raising ValueError.
_open = relevant_headers.find("[")
_close = relevant_headers.find("]", _open + 1) if _open != -1 else -1
if _close != -1:
    relevant_headers = relevant_headers[_open + 1:_close]

In [None]:
# Split the comma-separated header listing into clean names. Straight and
# curly quotes are both stripped because the prompt's output format shows
# curly quotes while its example shows straight ones; empty items are dropped.
relevant_headers_list = [
    header
    for header in (
        part.strip().strip("\"'‘’“”").strip()
        for part in relevant_headers.split(",")
    )
    if header
]

In [None]:
# Restrict the table to the relevant headers and their link columns.
# Names returned by the LLM that are not actual columns are skipped instead of
# raising KeyError; build_hybrid_graph already tolerates missing columns.
_wanted_columns = relevant_headers_list + [i + "_link" for i in relevant_headers_list]
subtable_df = table_df[[col for col in _wanted_columns if col in table_df.columns]]

In [None]:
def extract_entities_by_doc(
    passages,
    model="en_core_web_trf",
    keep_duplicates=True,
):
    """Run spaCy NER over each passage and collect the entity strings per document.

    Args:
        passages: Mapping of document id to passage text; a falsy text is
            processed as the empty string.
        model: Name passed to ``spacy.load``.
        keep_duplicates: When False, repeated entity strings within a document
            are reduced to their first occurrence, keeping order.

    Returns:
        Dict mapping each document id to the list of stripped entity texts
        found in its passage.
    """
    nlp = spacy.load(model)

    entities_by_doc = {}
    for doc_id, text in passages.items():
        parsed = nlp(text or "")
        mentions = [span.text.strip() for span in parsed.ents]
        if not keep_duplicates:
            # dict.fromkeys keeps the first occurrence of every mention, in order.
            mentions = list(dict.fromkeys(mentions))
        entities_by_doc[doc_id] = mentions
    return entities_by_doc

In [None]:
# Extract named entities from every entry of example_input["links"] (keyed by link).
doc_entities = extract_entities_by_doc(example_input["links"])

In [None]:
def build_hybrid_graph(
    table_df,
    doc_entities,
    headers,
):
    """Build an undirected graph linking headers, table cells and passage entities.

    Nodes:
        ``("header", col)``  - one per entry of ``headers``.
        ``("cell", r, col)`` - one per row position ``r`` and header ``col``,
            carrying ``text`` and ``link`` attributes.
        ``("ent", text)``    - one per distinct non-empty entity string listed
            in ``doc_entities`` for a cell's link.

    Edges (``kind`` attribute):
        ``"cell-header"`` between a cell and its header, ``"row"`` between
        every pair of cells of the same row, ``"cell-ent"`` between a cell and
        each entity found under its link.

    Args:
        table_df: DataFrame holding the header columns and, optionally, a
            ``<col>_link`` column per header. Missing columns yield empty text
            or no link.
        doc_entities: Mapping of link to a list of entity strings.
        headers: Column names to include in the graph.

    Returns:
        The populated ``networkx.Graph``.
    """
    G = nx.Graph()
    link_suffix = "_link"

    for col in headers:
        G.add_node(("header", col), kind="header", col=col)

    for r in range(len(table_df)):
        row_cells = []
        for col in headers:
            # Rows are addressed by position (.iloc) because r is a 0-based
            # counter; label lookup (.loc) fails on frames whose index is not
            # 0..n-1, e.g. after filtering.
            raw = table_df[col].iloc[r] if col in table_df.columns else ""
            text = "" if pd.isna(raw) else str(raw)

            link_col = f"{col}{link_suffix}"
            link_val = table_df[link_col].iloc[r] if link_col in table_df.columns else None
            if link_val is None or (isinstance(link_val, float) and pd.isna(link_val)):
                link = None
            else:
                # Blank strings count as "no link".
                link = str(link_val).strip() or None

            cell = ("cell", r, col)
            G.add_node(cell, kind="cell", row=r, col=col, text=text, link=link)
            G.add_edge(cell, ("header", col), kind="cell-header")
            row_cells.append(cell)

        # Fully connect the cells of one row.
        for u, v in itertools.combinations(row_cells, 2):
            G.add_edge(u, v, kind="row")

    # Attach the entities of each linked passage to the cell that links to it.
    # The node view is copied to a list because nodes are added while iterating.
    for node, attr in list(G.nodes(data=True)):
        if attr.get("kind") != "cell":
            continue
        link = attr.get("link")
        if not link:
            continue

        for ent in doc_entities.get(link) or []:
            ent_text = str(ent).strip()
            if not ent_text:
                continue
            ent_node = ("ent", ent_text)
            if not G.has_node(ent_node):
                G.add_node(ent_node, kind="ent", text=ent_text)
            G.add_edge(node, ent_node, kind="cell-ent")

    return G


In [None]:
# Build the graph over the relevant-header subtable and the per-link passage entities.
hybrid_graph = build_hybrid_graph(subtable_df, doc_entities, relevant_headers_list)

In [None]:
# Show the raw entity -> header mapping reply before it is parsed below.
entity_headers_match

In [None]:
# Parse the LLM reply of the form
#     "<entity>": ["<header>", ...],
# into {entity: header}. Pairs are located with a regex instead of splitting
# on ",\n" and ":", so colons inside an entity, lines without a mapping,
# single-line replies and surrounding braces no longer break the parse.
_pair_pattern = re.compile(r"([^\n\[\]]+?)\s*:\s*\[([^\]]*)\]")


def _clean_mapping_token(token):
    """Strip whitespace, list/dict punctuation and straight or curly quotes from both ends."""
    return token.strip().strip(",{}").strip().strip("\"'‘’“”").strip()


entity_headers_match_stripped = []
for _raw_entity, _raw_mappings in _pair_pattern.findall(entity_headers_match):
    _mapped = [_clean_mapping_token(m) for m in _raw_mappings.split(",")]
    _mapped = [m for m in _mapped if m]
    # Later cells compare the value with a single header name, so only the
    # first listed mapping is kept when the model returns several.
    entity_headers_match_stripped.append(
        [_clean_mapping_token(_raw_entity), _mapped[0] if _mapped else ""]
    )

entity_headers_dict = {entity: header for entity, header in entity_headers_match_stripped}

entity_headers_dict

In [None]:
# Module-level cache for the embedding model; filled on first use.
_instructor_model = None

def get_instructor_model():
    """Return the shared ``hkunlp/instructor-large`` SentenceTransformer.

    The model is constructed on the first call and stored in the module-level
    ``_instructor_model``; later calls return that same object.
    """
    global _instructor_model
    if _instructor_model is not None:
        return _instructor_model
    _instructor_model = SentenceTransformer('hkunlp/instructor-large')
    return _instructor_model

def semantic_match(query: str, candidates: list, threshold: float = 0.8):
    """Return the candidates whose embedding is similar enough to the query.

    Query and candidates are each encoded as an ``[instruction, text]`` pair
    with the shared model, and compared by cosine similarity.

    Args:
        query: Text to match.
        candidates: Candidate strings; an empty list returns ``[]`` without
            loading the model.
        threshold: Minimum cosine similarity (inclusive) for a match.

    Returns:
        List of ``(candidate, similarity)`` tuples in candidate order, with the
        similarity as a Python float.
    """
    if not candidates:
        return []

    model = get_instructor_model()
    device = model.device

    instruction = "Represent the question entity for matching with table cells or document entities"
    candidate_inputs = [[instruction, cand] for cand in candidates]

    with torch.no_grad():
        query_emb = model.encode([[instruction, query]], convert_to_tensor=True, device=device)
        candidate_embs = model.encode(candidate_inputs, convert_to_tensor=True, device=device)
        # Row 0 holds the similarities of the single query against every candidate.
        similarities = util.cos_sim(query_emb, candidate_embs)[0]

    matches = [
        (candidate, float(score))
        for candidate, score in zip(candidates, similarities)
        if score >= threshold
    ]

    # Drop the tensors and release cached GPU memory before returning.
    del query_emb, candidate_embs, similarities
    torch.cuda.empty_cache()

    return matches

In [None]:
def format_node_for_output(G, node, headers):
    """Render a graph node as a context string, or ``None`` if it should be skipped.

    Args:
        G: Graph whose ``nodes[node]`` yields the node's attribute dict.
        node: Node key to render.
        headers: Column names whose cells may be rendered.

    Returns:
        ``"<text>; <col>"`` for a cell with non-empty text in one of
        ``headers``; the stripped text for an entity node with non-empty text;
        ``None`` for everything else.
    """
    attr = G.nodes[node]
    kind = attr.get("kind")

    if kind == "ent":
        label = attr.get("text", "").strip()
        return label if label else None

    if kind != "cell":
        return None

    cell_text = str(attr.get("text", "")).strip()
    column = attr.get("col")
    if column in headers and cell_text:
        return f"{cell_text}; {column}"
    return None

In [None]:
def prune_and_traverse_hybrid_graph(G, entity_header_mapping, headers, threshold=0.8, max_hops=3):
    """Pick start nodes for the question entities and collect their neighbourhood by hop.

    For every ``(question_entity, mapped_header)`` pair:
      * ``"Others"``: the best semantic match among all entity nodes becomes a
        start node.
      * a header in ``headers``: the best semantic match among the non-empty
        cells of that column becomes a start node; if the column has no match,
        all non-empty cells of the graph are tried instead.
      * any other header value is skipped.
    A breadth-first traversal from the start nodes then records, per hop
    distance, the formatted text of every newly reached node. Start nodes
    themselves are not part of the output.

    Args:
        G: Graph with ``kind``/``text``/``col`` node attributes, as produced by
            ``build_hybrid_graph``.
        entity_header_mapping: Dict of question entity -> header name or ``"Others"``.
        headers: Column names accepted as mapped headers and as cell output.
        threshold: Minimum similarity for a semantic match.
        max_hops: Maximum hop distance to traverse (default 3).

    Returns:
        ``None`` when no start node was found; otherwise a dict with keys
        ``"1-hop"`` .. ``"<max_hops>-hop"`` mapping to lists of formatted node
        strings. List order follows the order of ``entity_header_mapping`` and
        of the graph's neighbour iteration.
    """
    # Index entity nodes and non-empty cell nodes once, in graph order.
    entity_to_node = {}
    all_cells = []                     # [(text, node), ...]
    cells_by_col = defaultdict(list)   # col -> [(text, node), ...]
    for node, attr in G.nodes(data=True):
        kind = attr.get("kind")
        if kind == "ent":
            text = attr.get("text", "").strip()
            if text:
                entity_to_node[text] = node
        elif kind == "cell":
            text = str(attr.get("text", "")).strip()
            if text:
                all_cells.append((text, node))
                cells_by_col[attr.get("col")].append((text, node))
    entity_total_list = list(entity_to_node)

    def best_match(query, texts):
        """Return the highest-scoring text at or above the threshold, or None."""
        if not texts:
            return None
        matches = semantic_match(query, texts, threshold=threshold)
        if not matches:
            return None
        text, score = max(matches, key=lambda match: match[1])
        return text if score >= threshold else None

    def first_node_with_text(cells, wanted):
        """Return the node of the first cell whose text equals ``wanted``."""
        for cell_text, cell_node in cells:
            if cell_text == wanted:
                return cell_node
        return None

    # A dict (insertion-ordered) is used instead of a set so the traversal
    # order, and therefore the order of the returned lists, is reproducible.
    start_nodes = {}

    for question_entity, mapped_header in entity_header_mapping.items():
        q = str(question_entity).strip()
        if not q:
            continue

        if mapped_header == "Others":
            ent_text = best_match(q, entity_total_list)
            node = entity_to_node.get(ent_text) if ent_text is not None else None
            if node and G.has_node(node):
                start_nodes[node] = None
            continue

        if mapped_header not in headers:
            continue

        pool = cells_by_col.get(mapped_header, [])
        best_text = best_match(q, [text for text, _ in pool])
        if best_text is None:
            # Nothing in the mapped column matched: fall back to every cell.
            pool = all_cells
            best_text = best_match(q, [text for text, _ in pool])
        if best_text is not None:
            node = first_node_with_text(pool, best_text)
            if node is not None:
                start_nodes[node] = None

    if not start_nodes:
        return None

    hop_lists = {f"{hop}-hop": [] for hop in range(1, max_hops + 1)}
    visited = set(start_nodes)
    queue = deque((node, 0) for node in start_nodes)

    while queue:
        node, hop = queue.popleft()
        if hop >= max_hops:
            continue

        next_hop = hop + 1
        for neighbor in G.neighbors(node):
            if neighbor in visited:
                continue

            visited.add(neighbor)
            queue.append((neighbor, next_hop))

            formatted = format_node_for_output(G, neighbor, headers)
            if formatted:
                hop_lists[f"{next_hop}-hop"].append(formatted)

    return hop_lists

In [None]:
# Text (non-link) columns of the full table. The check uses "_link", the exact
# suffix table_to_df appends, so a real header that merely ends in "link"
# (for example "Hyperlink") is not dropped.
original_subheaders = [col for col in table_df.columns if not col.endswith("_link")]

In [None]:
# Choose start nodes for the question entities and collect the formatted
# neighbours reachable within three hops. The result is None when no start
# node is found.
hopwise_context = prune_and_traverse_hybrid_graph(
    hybrid_graph,
    entity_headers_dict,
    headers=original_subheaders
)

In [None]:
# Number of formatted nodes first reached at hop distance three.
len(hopwise_context["3-hop"])

In [None]:
def extract_subtable_from_hop(hop_list, table_df):
    """Select the table rows and columns referred to by the cell items of a hop list.

    Items of the form ``"<value>; <column>"`` whose column exists in
    ``table_df`` are treated as cell mentions; all other items are ignored.

    Args:
        hop_list: Strings as produced by ``format_node_for_output``.
        table_df: Table to filter.

    Returns:
        A copy of the rows in which any mentioned column equals its mentioned
        value (compared as stripped text), restricted to the mentioned columns
        and their ``<column>_link`` companions, in the table's column order.
        When nothing is mentioned, an empty frame with all of ``table_df``'s
        columns is returned.
    """
    mentioned_cols = set()
    cell_mentions = []

    for item in hop_list:
        if "; " not in item:
            continue
        # Split on the last separator so values containing "; " stay intact.
        value_part, col_part = item.rsplit("; ", 1)
        if col_part in table_df.columns:
            mentioned_cols.add(col_part)
            cell_mentions.append((value_part.strip(), col_part))

    if not mentioned_cols:
        return pd.DataFrame(columns=table_df.columns)

    cols_to_keep = set(mentioned_cols)
    cols_to_keep.update(
        f"{col}_link" for col in mentioned_cols if f"{col}_link" in table_df.columns
    )
    # Preserve the table's original column order.
    ordered_cols = list(dict.fromkeys(col for col in table_df.columns if col in cols_to_keep))

    mask = pd.Series([False] * len(table_df), index=table_df.index)
    normalized = {}
    for value, col in cell_mentions:
        if col not in normalized:
            column = table_df[col]
            # Blank out missing values BEFORE converting to text (otherwise
            # NaN becomes the literal "nan"), then strip so the comparison
            # matches the stripped value taken from the hop item.
            normalized[col] = (
                column.astype(object).where(column.notna(), "").astype(str).str.strip()
            )
        mask |= (normalized[col] == value)

    return table_df.loc[mask, ordered_cols].copy()

In [None]:
def llm_reader_prompt(table_data, passages, question):
    """Build the reader prompt that asks for a final answer from table and passages.

    The prompt instructs the model to answer in the form
    ``Final Answer: <your answer>``, to answer ``None`` when the context is
    insufficient, and in that case to list relevant passage names. Continuation
    lines of the prompt carry the four-space source indentation as part of the
    text.

    Args:
        table_data: Table context inserted after ``Table Data:``.
        passages: Passage context inserted after ``Passages:``.
        question: Question text inserted after ``Question:``.

    Returns:
        The formatted prompt string.
    """
    prompt = f"""Agent Introduction: Hello! I’m your Hybrid-QA expert agent, here to assist you in answering
    complex questions by leveraging both table data and passage information. Let’s combine these
    sources to generate accurate and comprehensive answers!
    Task: Your task involves a central question that requires information from both a table and passages.
    Here’s the context you’ll need:
    Table Data: {table_data}
    Passages: {passages}
    Question: {question}
    Final Answer: Provide the final answer in the format below. If the answer cannot be answered
    with the given context, provide None.
    Final Answer Format:
    Final Answer: <your answer>
    If the final answer is "None", provide the names of passages that are relevant to the above questions.
    If no passages are relevant give ‘[]’ as Relevant Passages.
    Relevant Passages Format:
    Relevant Passages: [‘<name-of-passage1>’, ‘<name-of-passage2>’, ......]"""
    return prompt

In [None]:
def collect_links(df):
    """Return every non-null value of the frame's ``*_link`` columns.

    Args:
        df: DataFrame whose link columns are named ``<header>_link``.

    Returns:
        List of link values, column by column in column order, with missing
        values dropped. An empty list when the frame has no link columns.
    """
    # Match on the suffix that table_to_df appends rather than on any
    # occurrence of "_link" inside a name.
    link_columns = [
        col for col in df.columns if isinstance(col, str) and col.endswith("_link")
    ]
    if not link_columns:
        # pd.concat raises ValueError on an empty list.
        return []

    all_links = pd.concat([df[col] for col in link_columns], ignore_index=True)
    return all_links.dropna().tolist()

In [None]:
# Build the hop-1 reader context: the matching table rows (rendered as
# markdown) and the entries of example_input["links"] their link cells point to.
hop1_table = extract_subtable_from_hop(hopwise_context["1-hop"], subtable_df)
hop1_table_md = hop1_table.to_markdown()
# Despite the name, hop1_links holds the values looked up under each link, not the link keys.
hop1_links = [example_input["links"][i] for i in collect_links(hop1_table)]

In [None]:
# Inspect the rows selected for hop 1.
hop1_table

In [None]:
# Ask the reader LLM to answer from the hop-1 context; think blocks are stripped from the reply.
hop_1_invoke = remove_think_blocks(llm.invoke([HumanMessage(content=llm_reader_prompt(hop1_table_md, hop1_links, question))]).content)

In [None]:
# Show the reader's hop-1 reply.
hop_1_invoke

In [None]:
def _bracket_contents(text):
    """Return the text between the first "[" and the next "]", or "" if there is no such pair."""
    start = text.find("[")
    end = text.find("]", start + 1) if start != -1 else -1
    return text[start + 1:end] if end != -1 else ""


# Until a later hop is queried, the hop-1 reply is the latest reader reply.
# Initialising hop_2_invoke here keeps the check after the loop (and the
# display cell that follows) from raising NameError when hop 1 already
# answered the question.
hop_2_invoke = hop_1_invoke

# Escalate to hop 2 and then hop 3 while the reader keeps answering "None".
for i in range(2, 4):
    if "None" in hop_1_invoke:
        # A reply without a bracketed list yields "" instead of ValueError.
        relevant_from_hop_1 = _bracket_contents(hop_1_invoke)
        hop2_table = pd.concat([hop1_table, extract_subtable_from_hop(hopwise_context[f"{i}-hop"], subtable_df)], ignore_index=True, join="inner").drop_duplicates()

        hop2_table_md = hop2_table.to_markdown()
        hop2_links = [example_input["links"][link] for link in collect_links(hop2_table)]

        # NOTE(review): the parsed passage names are not added to the prompt
        # inside the loop - confirm whether that is intended.
        if relevant_from_hop_1 != "":
            relevant_from_hop_1 = relevant_from_hop_1.split("\', ")
        hop_2_invoke = remove_think_blocks(llm.invoke([HumanMessage(content=llm_reader_prompt(hop2_table_md, hop2_links, question))]).content)
        # The latest reply becomes the input of the next iteration's check.
        hop_1_invoke = hop_2_invoke
    else:
        print(hop_1_invoke)
        break


# Last resort: every hop answered "None", so query once more with the whole subtable.
if "None" in hop_2_invoke:
    relevant_from_hop_1 = _bracket_contents(hop_1_invoke)
    # NOTE(review): hop2_table / hop2_table_md are rebuilt here but the prompt
    # below receives subtable_df itself - confirm which table was intended.
    hop2_table = pd.concat([hop1_table, extract_subtable_from_hop(hopwise_context["2-hop"], subtable_df)], ignore_index=True)
    hop2_table_md = hop2_table.to_markdown()
    hop2_links = [example_input["links"][link] for link in collect_links(hop1_table)]

    if relevant_from_hop_1 != "":
        relevant_from_hop_1 = relevant_from_hop_1.split("\', ")
        # NOTE(review): this appends the names returned by the model to a list
        # of passage values - verify whether they should be resolved first.
        hop2_links = hop2_links + relevant_from_hop_1

    hop_2_invoke = remove_think_blocks(llm.invoke([HumanMessage(content=llm_reader_prompt(subtable_df, hop2_links, question))]).content)

In [None]:
# Show the last two reader replies.
hop_1_invoke, hop_2_invoke

In [8]:
import os

In [10]:
# Example input path used to derive an output file name in the next cell.
name = "preprocessed_for_hybrid_graphs/test_step2_enriched.json"

In [17]:
# Output path built from the input file's base name without its extension.
# splitext only removes the final extension, so dots elsewhere in the path are
# safe, and the expression no longer nests double quotes inside the f-string
# (which needs Python 3.12+).
f"outputs/{os.path.splitext(os.path.basename(name))[0]}.json"

'outputs/test_step2_enriched.json'