## Load data

In [1]:
import langchain
from langchain_community.document_loaders import Docx2txtLoader
import pickle
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_community.llms import Ollama
import os

In [2]:
loader = Docx2txtLoader("documents/Abreviaciones 1.docx")
data = loader.load()

In [3]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

# As data is of type documents we can directly use split_documents over split_text in order to get the chunks.
docs = text_splitter.split_documents(data)

In [4]:
list(map(lambda x: len(x.page_content), docs))


[892, 893, 956, 983, 947, 976, 733, 860]

In [18]:
ollama_emb = OllamaEmbeddings(
    model="phi3",
    base_url="http://host.docker.internal:11434"
)

vector_index_ollama = FAISS.from_documents(docs, ollama_emb)

In [19]:
# Storing vector index create in local
file_path="vector_index.pkl"
with open(file_path, "wb") as f:
    pickle.dump(vector_index_ollama, f)

In [20]:
if os.path.exists(file_path):
    with open(file_path, "rb") as f:
        vectorIndex = pickle.load(f)

In [21]:
vectorIndex

<langchain_community.vectorstores.faiss.FAISS at 0x7f4de13206b0>

## Retrieve  similar embeddings for a given question

In [23]:
llm = Ollama(
    model="phi3",
    base_url="http://host.docker.internal:11434"
)

In [24]:
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorIndex.as_retriever())
chain



In [32]:
query = "Meaning of FDP. Only the meaning of FDP is required."

langchain.debug=True

result = chain({"question": query, "top_k": 3}, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "Meaning of FDP. Only the meaning of FDP is required.",
  "top_k": 3
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "VAN\n\nValue Added Network\n\nVSP\n\nVariable System Parameter\n\nWPU\n\nWorking Position Unit. The physical unit that contains the CWP, the RDU access, etc…\n\n\n\nAbreviaciones de PANSA:\n\n\n\n\n\nAbreviaciones de NATS:\n\n\n\n\n\n\n\nAbreviaciones de ICAS:\n\nAcronym\n\nMeaning\n\nAFTN\n\nAeronautical Fixed Telecommunication Network\n\nAGDL\n\nAir Ground Data Link\n\nAMLS\n\n(Very Advanced) ATC – Message Loggin System\n\nCMD\n\nControl and M

In [33]:
result["answer"]

'FDP stands for "Formulario de Planificación y Desarrollo Público," which translates to Public Planning and Development Form in English. However, based on the provided context from a document titled "Abreviaciones 1.docx" that relates to Situation Awareness Systems, there is no direct mention of FDP (Flight Data Processing). The acronym TCPV mentioned refers to Flight Control Treatment within the broader scope of the Public Planning and Development Form.'