## RAG for documentation of SHACL, RDF, and SPARQL

In [18]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

pdf_text = extract_text_from_pdf("./documentation_PDf/sparqlPDF.pdf")


In [19]:
def chunk_text(text, max_tokens=500):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        current_length += len(word)
        if current_length <= max_tokens:
            current_chunk.append(word)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word)

    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

chunks = chunk_text(pdf_text)


In [20]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(chunks)


In [21]:
import faiss
import numpy as np

index = faiss.IndexFlatL2(embeddings[0].shape[0])
index.add(np.array(embeddings))


In [22]:
query = "What is sparql"
query_embedding = model.encode([query])
k = 5  # Number of chunks to retrieve
distances, indices = index.search(np.array(query_embedding), k)

retrieved_chunks = [chunks[i] for i in indices[0]]


In [3]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

# Load PDF
loader = PyPDFLoader("./documentation_PDf/shaclPDF.pdf")
documents = loader.load()

# Split the text into chunks (default chunk size: 1000 characters)
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(documents)


In [11]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Initialize SentenceTransformers model
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Create a FAISS vector store with the embeddings
vector_store = FAISS.from_documents(docs, embedding_model)


  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")





In [12]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})


In [14]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Load T5-based model for sequence-to-sequence tasks
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Create a text-to-text generation pipeline
llm_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)


# # Load a free LLM model (e.g., Falcon-7B-Instruct or similar small models)
# model_name = "google/flan-t5-small"  # Lightweight and open-source
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)

# # Create a pipeline for text generation
# llm_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Initialize LangChain LLM with the pipeline
llm = HuggingFacePipeline(pipeline=llm_pipeline)

# Create the RetrievalQA chain
from langchain.chains import RetrievalQA
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu
  llm = HuggingFacePipeline(pipeline=llm_pipeline)


In [26]:

# Step 5: Query the system
query = "What does the PDF say about Validation Result (sh:ValidationResult)?"
response = qa_chain({"query": query})

# Print the result (answer)
print("Answer:")
print(response["result"])

# Print the source documents (optional)
print("\nSource Documents:")
for doc in response["source_documents"]:
    display(f"{doc.metadata.get('page', 'Unknown Page')}: {doc.page_content[:2000]}...\n")

Answer:
Validation results are interpreted based on the rules outlined in the section on SPARQ

Source Documents:


'7: Example validation results\n[ a sh:ValidationReport ;\n sh:conforms false ;\n sh:result\n [ a sh:ValidationResult ;\n  sh:resultSeverity sh:Violation ;\n  sh:focusNode ex:Alice ;\n  sh:resultPath ex:ssn ;\n  sh:value "987-65-432A" ;\n  sh:sourceConstraintComponent sh:RegexConstraintComponent ;\n  sh:sourceShape ... blank node _:b1 on ex:ssn above ... ;\n ] ,\n [ a sh:ValidationResult ;\n  sh:resultSeverity sh:Violation ;\n  sh:focusNode ex:Bob ;\n  sh:resultPath ex:ssn ;\n  sh:sourceConstraintComponent sh:MaxCountConstraintComponent ;\n  sh:sourceShape ... blank node _:b1 on ex:ssn above ... ;\n ] ,\n [ a sh:ValidationResult ;\n  sh:resultSeverity sh:Violation ;\n  sh:focusNode ex:Calvin ;\n  sh:resultPath ex:worksFor ;\n  sh:value ex:UntypedCompany ;\n  sh:sourceConstraintComponent sh:ClassConstraintComponent ;\n  sh:sourceShape ... blank node _:b2 on ex:worksFor above ... ;\n ] ,\n [ a sh:ValidationResult ;\n  sh:resultSeverity sh:Violation ;\n  sh:focusNode ex:Calvin ;\n  sh:res

'23: 3.6.1.2 \nResult (sh:result)\nFor every validation result that is produced by a \nvalidation\n process (except those mentioned in the context of\nconformance checking\n), \nthe SHACL instance of \nsh:ValidationReport\n in the results graph has a value for the\nproperty \nsh:result\n. Each value of \nsh:result\n is a \nSHACL instance\n \nof the class \nsh:ValidationResult\n.\n3.6.1.3 \nSyntax Checking of Shapes Graph (sh:shapesGraphWellFormed)\nSHACL validation engines are not strictly required to check whether the \nshapes graph\n is \nwell-formed\n.\nImplementations that do perform such checks (e.g., when the shapes graph is installed in the system, or before or\nduring the validation) \nshould\n use the property \nsh:shapesGraphWellFormed\n \nto inform the consumer of the validation\nreport about this fact. If a SHACL instance of \nsh:ValidationReport\n in the results graph has \ntrue\n as the \nvalue\n \nfor\nsh:shapesGraphWellFormed\n then the \nprocessor\n was certain that th

'41: Note that there is an important difference between \nsh:property\n and \nsh:node\n: If a value node is violating the\nconstraint, then there is only a single validation result for \nsh:node\n for this value \nnode, with\nsh:NodeConstraintComponent\n as its \nsh:sourceConstraintComponent\n. On the other hand side, there may be any\nnumber of validation results for \nsh:property\n, and these will have the individual \nconstraint components of the\nconstraints\n in the \nproperty shape\n as their values of \nsh:sourceConstraintComponent\n.\nLike with all other validation results, each time a \nproperty shape\n is reached via \nsh:property\n, a validation engine\nmust\n \nproduce \nfresh\n validation result nodes. This includes cases where the same \nfocus node\n is validated against the\nsame \nproperty shape\n although it is reached via different paths in the \nshapes graph\n.\n4.7.3 \nsh:qualifiedValueShape, sh:qualifiedMinCount, sh:qualifiedMaxCount\nsh:qualifiedValueShape\n speci

'22: 3.5 \nConformance Checking\nA \nfocus node\n \nconforms\n to a \nshape\n if and only if the \nset of result of the \nvalidation\n of the \nfocus node\n against the \nshape\nis empty and no \nfailure\n has been reported by it.\nConformance checking\n produces \ntrue\n if and only if a given \nfocus node\n \nconforms\n to a given \nshape\n, and \nfalse\notherwise.\nNote that some \nconstraint components\n of SHACL Core (e.g., those of \nsh:not\n, \nsh:or\n and \nsh:node\n) rely on\nconformance checking. \nIn these cases, the \nvalidation results\n used to determine the outcome of conformance\nchecking are separated from those of the surrounding validation process \nand typically do not end up in the same\nvalidation report (except perhaps as values of \nsh:detail\n).\n3.6 \nValidation Report\nThe \nvalidation report\n is the result of the \nvalidation\n process that reports the \nconformance\n and the set of all\nvalidation results\n. The validation report is described with the SHAC

'24: this may for example include violations of constraints that have been \nevaluated as part of conformance checking via\nsh:node\n.\n3.6.2.7 \nMessage (sh:resultMessage)\nValidation results may have values for the property \nsh:resultMessage\n, for example to communicate additional textual\ndetails to humans. While \nsh:resultMessage\n may have multiple values, there should not \nbe two values with the same\nlanguage tag. These values are produced by a validation engine based on the values of \nsh:message\n of the\nconstraints in the shapes graph, see \nDeclaring Messages for a Shape\n. \nIn cases where a constraint does not have\nany values for \nsh:message\n in the shapes graph the SHACL processor \nmay\n automatically generate other values for\nsh:resultMessage\n.\n3.6.2.8 \nSeverity (sh:resultSeverity)\nEach validation result has exactly one \nvalue\n for the property \nsh:resultSeverity\n, and this value is an \nIRI\n. \nThe value is\nequal to the \nvalue\n of \nsh:severity\n o