In [2]:
import re
import os
import sys
import json
import langchain
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import tools_condition, ToolNode
from langgraph.graph import START, StateGraph, MessagesState
from langchain_core.messages import SystemMessage, HumanMessage
from schemas.mondial_federated_schema import MONDIAL_ECONOMY, MONDIAL_GEO, MONDIAL_POlITICS, MONDIAL_SOCIAL, MONDIAL_FULL_SCHEMA

load_dotenv()

# Read the plain-text dump that was extracted from the schema PDF
schema_dump_path = Path("../schemas/mondial-RS.txt")
with schema_dump_path.open("r") as file:
    content = file.read()

# Each table description is the text captured between "---" delimiter lines
chunk_pattern = re.compile(r"---\n(.*?)\n---", flags=re.DOTALL)
table_chunks = chunk_pattern.findall(content)
table_chunks[:2]

['Economy: economical information about the countries.\ncountry: the country code\nGDP: gross domestic product (in million $)\nagriculture: percentage of agriculture of the GDP\nservice: percentage of services of the GDP\nindustry: percentage of industry of the GDP\ninflation: inflation rate (per annum)\nunemployment: unemployment rate',
 'Population: information about the population of the countries.\ncountry: the country code\npopulation growth: population growth rate (per annum)\ninfant mortality: infant mortality (per thousand)']

## Acima, temos o esquema relacional de todas as tabelas do Mondial.
## Vamos extrair as tabelas, e criar uma estrutura de dados structured_data que armazene as informações.

In [3]:
import re
import json

# Parse each schema chunk into a table description plus its column descriptions
def process_chunks(table_chunks):
    """Turn raw schema chunks into a ``{table: {description, columns}}`` mapping.

    Each chunk is expected to look like::

        TableName: table description
        column_a: description of column_a
        column_b: description of column_b

    Args:
        table_chunks: Iterable of chunk strings, one per table.

    Returns:
        dict mapping each table name to a dict with two keys:
        ``"description"`` (str) and ``"columns"`` (dict of column name to
        column description). Chunks whose first line has no ``": "``
        separator are skipped. If a table name repeats, the last chunk wins.
    """
    tables = {}

    for chunk in table_chunks:
        # Leading blank lines would otherwise be mistaken for the header line
        lines = chunk.lstrip().split("\n")

        # The header is parsed from the first line only. A chunk without any
        # column lines therefore still yields a table, and a first line that
        # lacks ": " can no longer absorb the next line into the table name.
        table_name, separator, description = lines[0].partition(": ")
        if not separator:
            continue

        # Remaining lines are "column: description"; anything else is ignored
        columns = {}
        for line in lines[1:]:
            column_name, separator, column_description = line.partition(": ")
            if separator:
                columns[column_name.strip()] = column_description.strip()

        tables[table_name.strip()] = {
            "description": description.strip(),
            "columns": columns,
        }

    return tables

# Processar os chunks
structured_data = process_chunks(table_chunks)

# Display the result as indented JSON; ensure_ascii=False keeps non-ASCII
# characters as-is instead of emitting \uXXXX escapes
print(json.dumps(structured_data, indent=2, ensure_ascii=False))

{
  "Economy": {
    "description": "economical information about the countries.",
    "columns": {
      "country": "the country code",
      "GDP": "gross domestic product (in million $)",
      "agriculture": "percentage of agriculture of the GDP",
      "service": "percentage of services of the GDP",
      "industry": "percentage of industry of the GDP",
      "inflation": "inflation rate (per annum)",
      "unemployment": "unemployment rate"
    }
  },
  "Population": {
    "description": "information about the population of the countries.",
    "columns": {
      "country": "the country code",
      "population growth": "population growth rate (per annum)",
      "infant mortality": "infant mortality (per thousand)"
    }
  },
  "CountryLocalName": {
    "description": "information about the local name of the country.",
    "columns": {
      "country": "the country code",
      "localname": "the local name, usually in a local alphabet (UTF-8)"
    }
  },
  "Religion": {
    "de

In [4]:
from schemas.mondial_federated_schema import MONDIAL_ECONOMY, MONDIAL_GEO, MONDIAL_POlITICS, MONDIAL_SOCIAL, MONDIAL_FULL_SCHEMA

def _split_create_statements(schema_ddl):
    """Split a schema DDL string on the literal ``"CREATE TABLE "`` marker."""
    return schema_ddl.split("CREATE TABLE ")


def _table_names(create_statements):
    """Return the first line of every piece after the first one.

    The first piece is whatever precedes the first ``CREATE TABLE`` marker, so
    it is dropped; the first line of each remaining piece is taken as the
    table name.
    """
    return [statement.split("\n")[0] for statement in create_statements[1:]]


# One (pieces, names) pair per federated database; the variable names are kept
# so that other cells relying on them continue to work.
economy_db_tables = _split_create_statements(MONDIAL_ECONOMY)
economy_db_tables_names = _table_names(economy_db_tables)
print("\nTabelas do banco de dados de economia:", economy_db_tables_names)

geo_db_tables = _split_create_statements(MONDIAL_GEO)
geo_db_tables_names = _table_names(geo_db_tables)
print("\nTabelas do banco de dados de geografia:", geo_db_tables_names)

social_db_tables = _split_create_statements(MONDIAL_SOCIAL)
social_db_tables_names = _table_names(social_db_tables)
print("\nTabelas do banco de dados social:", social_db_tables_names)

politics_db_tables = _split_create_statements(MONDIAL_POlITICS)
politics_db_tables_names = _table_names(politics_db_tables)
print("\nTabelas do banco de dados de politica:", politics_db_tables_names)


Tabelas do banco de dados de economia: ['Economy', 'Population']

Tabelas do banco de dados de geografia: ['Country', 'RiverThrough', 'encompasses', 'Continent', 'City', 'Province', 'Mountain', 'Desert', 'Island', 'Lake', 'Sea', 'River', 'Airport', 'geo_Mountain', 'geo_Desert', 'geo_Island', 'geo_River', 'geo_Sea', 'geo_Lake', 'geo_Source', 'geo_Estuary', 'located', 'locatedOn', 'islandIn', 'MountainOnIsland', 'LakeOnIsland', 'RiverOnIsland', 'mergesWith']

Tabelas do banco de dados social: ['Religion', 'EthnicGroup', 'Language', 'Sublanguage', 'Countrylocalname', 'Countryothername', 'Provincelocalname', 'Provinceothername', 'Citylocalname', 'Cityothername', 'Countrypops', 'Provpops', 'Citypops']

Tabelas do banco de dados de politica: ['Politics', 'borders', 'Organization', 'isMember']


## Agora que sabemos quais tabelas pertencem a cada banco de dados da federação, vamos criar um prompt para identificar as conexões `SameAsTable` entre tabelas de bancos de dados diferentes (interbancos).

In [5]:
# Prompt template that is filled in with str.format() once per table.
# Placeholders: {table_name}, {table_description}, {table_columns_info} and
# one {..._db_schema} slot for each of the four federated databases.
# The doubled braces {{ }} around the example are str.format() escapes, so the
# rendered prompt contains a literal JSON object there.
prompt = """
You are an intelligent assistant responsible for identifying `SameAsTable` relationships between tables in different federated databases. Your task is to determine columns in the input table that represent equivalent data or have the same logical meaning as columns in tables from other databases.

!!! Attention !!!
1. Respond **only in JSON format**, with no additional text or explanation.
2. Identify **only SameAsTable relationships**.
3. Prioritize connections between the input table and tables in other databases. Avoid connections within the same database.

Example:
The `Economy` table contains economic information. A relevant `SameAsTable` connection might be:
{{
    "Endpoint Source": "MONDIAL_ECONOMY",
    "Class Source": "Economy",
    "Properties Source": "country",
    "Endpoint Destination": "MONDIAL_POLITICS",
    "Class Destination": "Politics",
    "Properties Destination": "country"
}}

Now, for the table `{table_name}`:
Description: {table_description}
Columns and descriptions:
{table_columns_info}

Schemas of federated databases:
MONDIAL_SOCIAL DATABASE SCHEMA:
{social_db_schema}

MONDIAL_GEO DATABASE SCHEMA:
{geo_db_schema}

MONDIAL_POLITICS DATABASE SCHEMA:
{politics_db_schema}

MONDIAL_ECONOMY DATABASE SCHEMA:
{economy_db_schema}

Question:
Identify all `SameAsTable` relationships for the table `{table_name}`. Return the relationships **only in JSON format**.
"""

In [7]:
from langchain_core.output_parsers import StrOutputParser
from langchain_utils import get_llm

llm = get_llm()

# The chain does not depend on the table being processed, so it is built once
# rather than on every loop iteration.
chain = llm | StrOutputParser()

# How many tables (in `structured_data` order) are sent to the LLM.
# 1 keeps the earlier behaviour of stopping after the first table, which used
# to be an unexplained `break` at the end of the loop body.
# Set to None to process every table.
MAX_TABLES = 1

# Raw LLM responses (strings), one per processed table
results = []

# Slicing with None yields the full list, so MAX_TABLES=None means "no limit"
tables_to_process = list(structured_data.items())[:MAX_TABLES]

for table_name, table_info in tables_to_process:
    # One "column: description" line per column of the current table
    table_columns_info = "\n".join(
        f"{column}: {description}"
        for column, description in table_info["columns"].items()
    )

    prompt_with_schema = prompt.format(
        table_name=table_name,
        table_description=table_info["description"],
        table_columns_info=table_columns_info,
        social_db_schema=MONDIAL_SOCIAL,
        geo_db_schema=MONDIAL_GEO,
        politics_db_schema=MONDIAL_POlITICS,
        economy_db_schema=MONDIAL_ECONOMY,
    )

    result = chain.invoke([HumanMessage(content=prompt_with_schema)])
    results.append(result)

    print(result)
    print(f"Just processed table {table_name}.")

```json
[
    {
        "Endpoint Source": "MONDIAL_ECONOMY",
        "Class Source": "Economy",
        "Properties Source": "country",
        "Endpoint Destination": "MONDIAL_SOCIAL",
        "Class Destination": "Countrypops",
        "Properties Destination": "Country"
    },
    {
        "Endpoint Source": "MONDIAL_ECONOMY",
        "Class Source": "Economy",
        "Properties Source": "country",
        "Endpoint Destination": "MONDIAL_GEO",
        "Class Destination": "Country",
        "Properties Destination": "Code"
    },
    {
        "Endpoint Source": "MONDIAL_ECONOMY",
        "Class Source": "Economy",
        "Properties Source": "country",
        "Endpoint Destination": "MONDIAL_POLITICS",
        "Class Destination": "Politics",
        "Properties Destination": "Country"
    }
]
```
Just processed table Economy.


In [6]:
# Show the raw response strings collected in `results`
results

['```json\n[\n    {\n        "Endpoint Source": "MONDIAL_ECONOMY",\n        "Class Source": "Economy",\n        "Properties Source": "country",\n        "Endpoint Destination": "MONDIAL_SOCIAL",\n        "Class Destination": "Religion",\n        "Properties Destination": "Country"\n    },\n    {\n        "Endpoint Source": "MONDIAL_ECONOMY",\n        "Class Source": "Economy",\n        "Properties Source": "country",\n        "Endpoint Destination": "MONDIAL_SOCIAL",\n        "Class Destination": "EthnicGroup",\n        "Properties Destination": "Country"\n    },\n    {\n        "Endpoint Source": "MONDIAL_ECONOMY",\n        "Class Source": "Economy",\n        "Properties Source": "country",\n        "Endpoint Destination": "MONDIAL_SOCIAL",\n        "Class Destination": "Language",\n        "Properties Destination": "Country"\n    },\n    {\n        "Endpoint Source": "MONDIAL_ECONOMY",\n        "Class Source": "Economy",\n        "Properties Source": "country",\n        "Endpoint Des

## Salva as relações `SameAsTable` em um arquivo JSON, mantendo apenas as relações únicas e evitando entradas duplicadas.

In [7]:
import json

# Extract the JSON array embedded in a result string
def extract_json_from_result(result):
    """Parse the JSON array found between the first "[" and the last "]".

    Any text around the array (for example a Markdown code fence) is ignored.

    Args:
        result: Response text that may contain a JSON array.

    Returns:
        The decoded JSON value, or an empty list when the text holds no
        "[" ... "]" pair in that order.

    Raises:
        json.JSONDecodeError: If the bracketed text is not valid JSON.
    """
    start = result.find("[")
    end = result.rfind("]")
    # Both brackets must be present and in order. The earlier check compared
    # `rfind("]") + 1` with -1, which can never be equal, so a response with
    # "[" but no "]" fell through to json.loads("") and raised.
    if start == -1 or end < start:
        return []
    return json.loads(result[start:end + 1])

# Consolidate the unique relationship entries across all responses
consolidated_results = []
seen = set()

for result in results:
    for entry in extract_json_from_result(result):
        # Canonical dedup key: serialising with sorted keys makes two entries
        # with the same content but different key order compare equal, and it
        # also works when a value is a list or dict (which cannot be hashed).
        entry_key = json.dumps(entry, sort_keys=True, ensure_ascii=False)
        if entry_key not in seen:
            seen.add(entry_key)
            consolidated_results.append(entry)

# Save the consolidated entries to a JSON file. The encoding is set explicitly
# because ensure_ascii=False writes non-ASCII characters verbatim, which would
# otherwise depend on the platform's default encoding.
with open("consolidated_sameastable.json", "w", encoding="utf-8") as json_file:
    json.dump(consolidated_results, json_file, indent=2, ensure_ascii=False)

print("Resultados consolidados salvos em 'consolidated_sameastable.json'.")

Resultados consolidados salvos em 'consolidated_sameastable.json'.
