In [1]:
# Move the working directory up one level so the `src` package imported in the next cell resolves.
# NOTE(review): not idempotent - every re-run of this cell moves one directory further up.
%cd ..

/Users/mitoura/Desktop/Repositories/rag--projects


In [2]:
from src.services import AzureOpenaiService
from src.utils import Settings

# Load the project settings (the log output below shows they are read from a .env file)
# and build the Azure OpenAI service that hands out both models used in this notebook.
sets = Settings()
azure_openai_service = AzureOpenaiService(sets)
llm = azure_openai_service.get_llm()  # chat model, called later through llm.invoke(...)
embeddings = azure_openai_service.get_embeddings()  # embedding model (embed_documents / embed_query)

2025-08-17 10:52:28 - src.utils.settings - INFO - .env founded in: /Users/mitoura/Desktop/Repositories/rag--projects/.env
2025-08-17 10:52:29 - src.utils.settings - INFO - azure_openai_api_key: OK
2025-08-17 10:52:29 - src.utils.settings - INFO - azure_openai_endpoint: OK
2025-08-17 10:52:29 - src.utils.settings - INFO - llm_deployment_model: OK
2025-08-17 10:52:29 - src.utils.settings - INFO - embedding_deployment_model: OK
2025-08-17 10:52:29 - src.utils.settings - INFO - llm_api_version: OK
2025-08-17 10:52:29 - src.utils.settings - INFO - embedding_api_version: OK
2025-08-17 10:52:29 - src.utils.settings - INFO - groq_api_key: OK


In [3]:
# Install the CPU build of FAISS into the kernel's environment (quiet mode).
# NOTE(review): -U always pulls the latest release; consider pinning a version for reproducibility.
%pip install -qU faiss-cpu


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
import numpy as np

# Toy corpus used for the raw-FAISS example below: seven short, unrelated sentences.
corpus = [
    "O gato está dormindo no sofá.",
    "O cachorro está latindo no quintal.",
    "A Lua gira ao redor da Terra.",
    "Estou aprendendo Python para ciência de dados.",
    "O céu está limpo hoje.",
    "Machine learning é uma área fascinante.",
    "Hoje é um bom dia para correr no parque."
] # example texts in Portuguese

In [5]:
# Embed every corpus sentence and store the vectors as one float32 matrix
# (one row per sentence) ready to be added to the FAISS index.
document_vectors = embeddings.embed_documents(corpus)
corpus_embeddings = np.asarray(document_vectors, dtype="float32")

In [6]:
# example creating a FAISS index

import faiss

# The index is created for the embedding width, i.e. the second axis of the matrix.
dimension = corpus_embeddings.shape[1]
# IndexFlatL2: FAISS's flat index that ranks neighbours by L2 distance.
index = faiss.IndexFlatL2(dimension)
index.add(corpus_embeddings) # add corpus embeddings to the index

In [7]:
# Search the index for the sentences closest to a free-text query.

query = "Estou aprendendo inteligência artificial"
k = 3  # top-k

# index.search takes a 2-D float32 batch, so the single query vector is wrapped in a list.
query_embedding = np.asarray([embeddings.embed_query(query)], dtype="float32")
distances, indices = index.search(query_embedding, k)

In [8]:
# Print the hits for the single query (row 0) in rank order.
# FAISS pads the result with id -1 when fewer than k neighbours exist; those slots are
# skipped because corpus[-1] would otherwise silently print the last sentence.
# Note: IndexFlatL2 reports squared L2 distances (lower means more similar).
for rank, (doc_idx, distance) in enumerate(zip(indices[0], distances[0]), start=1):
    if doc_idx < 0:
        continue
    print(f"{rank}. {corpus[doc_idx]} (distância: {distance:.4f})")

1. Estou aprendendo Python para ciência de dados. (distância: 0.1834)
2. Machine learning é uma área fascinante. (distância: 0.2288)
3. O gato está dormindo no sofá. (distância: 0.4105)


In [9]:
# example to generate a local vdb with faiss

from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS


# Same seven sentences as the raw-FAISS example, this time indexed through LangChain.
texts = [
    "O gato está dormindo no sofá.",
    "O cachorro está latindo no quintal.",
    "A Lua gira ao redor da Terra.",
    "Estou aprendendo Python para ciência de dados.",
    "O céu está limpo hoje.",
    "Machine learning é uma área fascinante.",
    "Hoje é um bom dia para correr no parque."
] # example texts in Portuguese

# Wrap each sentence in a Document (no metadata attached).
docs = [Document(page_content=t) for t in texts]

# create a FAISS index in memory, then persist it to the "faiss_index" folder
# (path is relative to the current working directory)
vdb = FAISS.from_documents(docs, embeddings)
vdb.save_local("faiss_index")

In [10]:
# load vdb from local storage (the folder written by save_local above)
# NOTE(security): allow_dangerous_deserialization=True opts in to the loading path LangChain
# flags as dangerous; only use it on index folders you created yourself.
vdb = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

In [11]:
# Query the LangChain vector store and list the three closest documents.
query = "Estou aprendendo inteligência artificial"
resultados = vdb.similarity_search(query, k=3)

# Number the hits starting from 1, best match first.
for rank, hit in enumerate(resultados, start=1):
    print(f"{rank}. {hit.page_content}")

1. Estou aprendendo Python para ciência de dados.
2. Machine learning é uma área fascinante.
3. O gato está dormindo no sofá.


### Using FAISS Service

```
rag--projects
├─ projects
│  └─ rag_faiss
│       └─ rag_service.py
```

In [None]:
from src.services.faiss import FaissService

# 0. Instantiate the project's FAISS service with the embedding model created earlier.
# chunk_size / chunk_overlap are forwarded as-is; their unit depends on the
# splitter used inside FaissService (presumably characters - TODO confirm).
faiss_service = FaissService(
    embeddings=embeddings,
    chunk_size=1200,
    chunk_overlap=500,
)

2025-08-17 10:52:35 - projects.rag_faiss.faiss - INFO - FaissService initialized with chunk_size=1200, chunk_overlap=500


In [None]:
# 1. Extract text from documents

from src.utils.extractors import DocumentExtractor

# Run the extractor over every supported file in the given folder; the result is a
# mapping whose values are consumed in the next step (see docs_raw.values()).
extractor = DocumentExtractor()
docs_raw = extractor.extract_documents(path="documents") # add here your folder path

2025-08-17 10:52:37 - projects.rag_faiss.extractors - INFO - Processing directory: documents
2025-08-17 10:52:37 - projects.rag_faiss.extractors - INFO - Found 1 PDF files
2025-08-17 10:52:37 - projects.rag_faiss.extractors - INFO - Extraction completed: 1 documents processed


In [5]:
from langchain.docstore.document import Document

# 2. Wrap each extracted record in a LangChain Document.
def _to_document(record):
    """Build a Document from one extractor record.

    The record's "texts" entry becomes the page content and its "document"
    entry is stored as the "source" metadata.
    """
    return Document(
        page_content=record["texts"],
        metadata={"source": record["document"]},
    )


documents = [_to_document(record) for record in docs_raw.values()]

In [6]:
# 3. Create a vector database from the documents and persist it at index_path
# (relative to the current working directory). The log below reports how many
# chunks were produced and where they were saved.
vdb = faiss_service.create_local_database(
    documents=documents,
    index_path="faiss_index/my_index"
)

2025-08-17 10:52:43 - projects.rag_faiss.faiss - INFO - Starting local database creation at: faiss_index/my_index
2025-08-17 10:52:46 - projects.rag_faiss.faiss - INFO - Database created with 93 chunks and saved at: faiss_index/my_index


In [7]:
# 4. Alternative: build a temporary database from the same documents (no index_path given).
# NOTE: this rebinds `vdb`, so the cells below query the temporary database
# rather than the local one created in the previous step.

vdb = faiss_service.create_temporary_database(
    documents=documents
)

2025-08-17 10:53:04 - projects.rag_faiss.faiss - INFO - Starting temporary database creation
2025-08-17 10:53:06 - projects.rag_faiss.faiss - INFO - Temporary database created with 93 chunks. ID: 1b0e53dc-2408-4e13-b68d-3f9c4fe89f15


In [19]:
# 5. Retrieve the three chunks most similar to the query and print their raw text.
query = "What is thermodynamic equilibrium?"
results = faiss_service.similarity_search(vdb, query, k=3)

# Chunks are printed back to back with no separator or source information.
for r in results:
    print(r.page_content)

2025-08-16 23:12:15 - projects.rag_faiss.rag_service - INFO - Search completed: 3 results


Eng 2025, 6, x FOR PEER REVIEW 
2 of 23 
 
 
valuable insights into process behavior [2]. This ability to predict process performance 
42
 
prior to physical construction or operational changes drastically reduces costs, accelerates 
43
 
innovation, and mitigates associated risks [3,4]. 
44
 
In this context, modeling thermodynamic equilibrium is crucial, and the develop-
45
 
ment of accessible and accurate computational tools is a field in constant development. 
46
 
The effectiveness of this approach depends directly on the availability and capability of 
47
 
the software used, which has become a diverse ecosystem of commercial and open-source 
48
 
solutions.  
49
 
 
50
 
Current status of thermodynamic equilibrium software 
51
 
The landscape of thermodynamic equilibrium software has evolved significantly 
52
 
over the past decades, with various commercial and open-source solutions becoming avail-
53
 
able to researchers and industry professionals. Traditional commercial soft

In [20]:
# 6. Inspect the vector database; the printed dict below lists its type, id,
# number of vectors, embedding dimension and embedding model.
info = faiss_service.get_database_info(vdb)
print(info)

{'type': 'Local/Loaded', 'id': 'N/A', 'total_vectors': 93, 'dimension': 1536, 'embedding_model': 'AzureOpenAIEmbeddings'}


In [24]:
# 7. Use the chunks retrieved from the vdb as context for the LLM answer.
query = "What is tes-thermo?"
results = faiss_service.similarity_search(vdb, query, k=3)

# Join only the text of each chunk. Interpolating `results` directly would insert the
# repr of a list of Document objects (class wrappers, metadata and escaped "\n")
# into the prompt instead of readable context.
context = "\n\n".join(doc.page_content for doc in results)

prompt = (
    f"This is the user question: {query}\n\n"
    f"Use this as context to generate a response:\n{context}"
)
# llm.invoke returns a message object; .content holds the generated text.
res = llm.invoke(prompt).content

2025-08-16 23:15:46 - projects.rag_faiss.rag_service - INFO - Search completed: 3 results


In [25]:
# Show the answer text produced by the LLM in the previous cell.
print(res)

Tes-thermo is an open-source Python library designed for modeling and analyzing reactive chemical systems under combined chemical and phase equilibrium conditions. It uses a thermodynamic approach based on Gibbs energy minimization to calculate equilibrium compositions in isothermal reactors. The library supports both ideal and non-ideal gas phase behavior, employing methods such as the Virial equation (truncated at the second term) and various equations of state, including the Peng–Robinson equation, for non-ideality calculations. Tes-thermo also allows the use of different polynomial models for heat capacity calculations.

Developed to provide a highly predictive and robust tool, tes-thermo facilitates the simulation, analysis, and optimization of complex reactive processes. It has been applied successfully to processes like supercritical water gasification (SCWG) of methanol and ethanol, demonstrating its reliability in thermodynamic equilibrium studies. The library is available on 