In [None]:
# Install the notebook's third-party dependencies into the active kernel environment.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
%pip install jsonformer
%pip install relik
%pip install gliner
%pip install sentencepiece
# transformers is explicitly upgraded rather than just installed.
%pip install --upgrade transformers

In [7]:
from langchain_core.runnables import  RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser
import os
from langchain_community.graphs import Neo4jGraph
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_community.chat_models import ChatOllama
from neo4j import GraphDatabase
# from yfiles_jupyter_graphs import GraphWidget
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars

from dotenv import load_dotenv

from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM, BitsAndBytesConfig, AutoModel
from langchain_huggingface import HuggingFacePipeline
from llm import LLMGraphTransformer

import torch
from typing import Optional, List
from langchain_core.messages import SystemMessage
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    PromptTemplate,
)
import json_repair
from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship
from langchain_core.documents import Document
from jsonformer import Jsonformer
from relik.inference.data.objects import RelikOutput
from gliner import GLiNER
from langchain_huggingface import HuggingFaceEmbeddings

In [3]:
def getDocuments(path):
    """Load every file found under ``path`` into a list of ``Document`` objects.

    The directory tree is walked recursively with ``os.walk``. Each readable
    file becomes one ``Document`` whose ``page_content`` is the file's UTF-8
    text and whose metadata contains:

    - ``path``: the path of the file (root joined with the file name),
    - ``title``: the third line of the file, or ``""`` when the file has
      fewer than three lines,
    - ``file_name``: the bare file name.

    Files that cannot be read (I/O or decoding errors) are reported on stdout
    and skipped.

    Args:
        path: Root directory to scan.

    Returns:
        A list of ``Document`` objects; an empty list when ``path`` does not
        exist or contains no readable file.
    """
    if not os.path.exists(path):
        print("Path do not exist")
        return []
    documents = []
    for root, _dirs, files in os.walk(path):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except Exception as e:
                print(f"Errore nella lettura di {file_path}: {e}")
                # Nothing usable was read, so there is no document to build.
                continue

            # The title is taken from the third line; short files used to
            # raise IndexError here, so fall back to an empty title instead.
            lines = content.split("\n")
            title = lines[2] if len(lines) > 2 else ""

            documents.append(Document(
                page_content=content,
                metadata={"path": file_path,
                        "title": title,
                        "file_name": file}))
    return documents

def cleanMarkup(llm, text):
    """Ask ``llm`` to strip non-informative markup from ``text``.

    ``text`` is substituted into a fixed instruction prompt, the prompt is
    piped into ``llm``, and whatever the resulting chain returns is handed
    back unchanged.
    """
    cleaning_instructions = """
    You are given a markup text. Your task is to remove any unnecessary or non-informative parts, such as:
    - Tags, unless they contain useful content.
    - Repeated phrases or sections.
    - Decorative characters or symbols.
    - Empty lines or spaces.

    Please leave informative links.

    Output only the cleaned text, without any additional explanation or markup.

    {text}
    """

    # Prompt and model are composed into a single runnable and invoked once.
    cleaning_chain = PromptTemplate.from_template(cleaning_instructions) | llm
    return cleaning_chain.invoke({"text": text})

def cleanDocuments(llm, documents, create_files=False, output_directory="cleaned_documents"):
    """Clean the ``page_content`` of every document with ``cleanMarkup``.

    Each document is modified in place: its ``page_content`` is replaced by
    the value ``cleanMarkup(llm, page_content)`` returns. Progress is printed
    for every document.

    Args:
        llm: Model forwarded to ``cleanMarkup``.
        documents: Sequence of objects exposing ``page_content`` and a
            ``metadata`` mapping with a ``'file_name'`` entry (the latter is
            only read when ``create_files`` is true).
        create_files: When true, each cleaned text is also written to
            ``output_directory`` under the document's ``file_name``.
        output_directory: Destination directory for the cleaned files; it is
            created if missing. Defaults to ``"cleaned_documents"``.

    Returns:
        The same ``documents`` object that was passed in.
    """
    if create_files:
        os.makedirs(output_directory, exist_ok=True)
    total = len(documents)
    for position, document in enumerate(documents, start=1):
        print(f"Cleaning the document number: {position}/{total}")
        document.page_content = cleanMarkup(llm, document.page_content)
        if create_files:
            output_path = os.path.join(output_directory, document.metadata['file_name'])
            # Written explicitly as UTF-8 because getDocuments reads files back
            # as UTF-8; the platform default encoding may differ.
            with open(output_path, "w", encoding="utf-8") as file:
                file.write(document.page_content)
    return documents

In [4]:
class Entities(BaseModel):
    """Identifying information about entities."""

    # Required list of entity names (``...`` marks the field as mandatory).
    names: list[str] = Field(
        default=...,
        description="All the person, organization, or business entities that appear in the input text",
    )
    
class UnstructuredRelation(BaseModel):
    # One extracted (head, relation, tail) triplet with a type for each end.
    # Documented with comments only, so no class docstring is introduced.

    # Source side of the relation.
    head: str = Field(
        description="extracted head entity like Microsoft, Apple, John. Must use human-readable unique identifier."
    )
    head_type: str = Field(
        description="type of the extracted head entity like Person, Company, etc"
    )

    # Label of the edge linking head to tail.
    relation: str = Field(
        description="relation between the head and the tail entities"
    )

    # Target side of the relation.
    tail: str = Field(
        description="extracted tail entity like Microsoft, Apple, John. Must use human-readable unique identifier."
    )
    tail_type: str = Field(
        description="type of the extracted tail entity like Person, Company, etc"
    )

def _buildTextGenerationPipeline(model_id, hf_token):
    """Load a 4-bit quantised causal LM and wrap it in a text-generation pipeline.

    Shared by ``getHuggingFaceModel`` and ``getHuggingFacePipe`` so the loading
    and generation settings are defined in exactly one place.

    Args:
        model_id: Hugging Face model identifier, e.g.
            ``"microsoft/Phi-3-mini-4k-instruct"`` or
            ``"microsoft/Phi-3.5-mini-instruct"``.
        hf_token: Hugging Face access token passed to both the tokenizer and
            the model loader.

    Returns:
        A ``(tokenizer, model, pipe)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        device_map="cuda",
        trust_remote_code=True,  # Added for Phi-3-mini-128k
        token=hf_token,
        # attn_implementation="flash_attention_2", # if you have an ampere GPU (RTX3090 OK, T4(Colab) NON OK)
    )
    pipe = pipeline("text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=1024,
                    top_k=50,
                    temperature=0.1,
                    return_full_text=False)
    return tokenizer, model, pipe


def getHuggingFaceModel(model_id, hf_token):
    """Load a model and expose it both as raw components and as a LangChain LLM.

    Args:
        model_id: Hugging Face model identifier.
        hf_token: Hugging Face access token.

    Returns:
        A ``(tokenizer, model, pipe, llm)`` tuple where ``llm`` is a
        ``HuggingFacePipeline`` wrapping ``pipe``.
    """
    tokenizer, model, pipe = _buildTextGenerationPipeline(model_id, hf_token)
    # IMPORTANT: return_full_text=False is repeated in pipeline_kwargs so the
    # wrapper also asks the pipeline for the generated text only.
    llm = HuggingFacePipeline(pipeline=pipe,
                            pipeline_kwargs={"return_full_text": False})
    return tokenizer, model, pipe, llm


def getHuggingFacePipe(model_id, hf_token):
    """Load a model and return only its text-generation pipeline.

    Args:
        model_id: Hugging Face model identifier.
        hf_token: Hugging Face access token.

    Returns:
        The ``transformers`` text-generation pipeline.
    """
    _tokenizer, _model, pipe = _buildTextGenerationPipeline(model_id, hf_token)
    return pipe

def extractNodesAndRelationships(graph_document):
    """Collect the distinct node types and relationship types of graph documents.

    Args:
        graph_document: Iterable of graph documents, each exposing ``nodes``
            and ``relationships`` whose items have a ``type`` attribute.

    Returns:
        A ``(node_types, relationships)`` tuple of sets; both are empty when
        ``graph_document`` is empty.
    """
    node_types = set()
    relationships = set()
    # Iterate over the argument itself; the body previously read the
    # notebook-global ``graph_documents`` and ignored what the caller passed.
    for d in graph_document:
        node_types.update(n.type for n in d.nodes)
        relationships.update(r.type for r in d.relationships)
    return node_types, relationships

def getCUDAMemoryInfo():
    """Print memory figures for CUDA device 0 after releasing cached memory.

    Runs the garbage collector and empties the CUDA cache first, then prints,
    one value per line and in this order: the device's total memory, the
    reserved memory, the allocated memory, and reserved minus allocated.

    Returns:
        None.
    """
    # Imported here because the notebook's import cell does not import gc;
    # without it the first statement raised NameError.
    import gc

    gc.collect()
    torch.cuda.empty_cache()
    t = torch.cuda.get_device_properties(0).total_memory
    r = torch.cuda.memory_reserved(0)
    a = torch.cuda.memory_allocated(0)
    f = r-a  # free inside reserved
    print(t)
    print(r)
    print(a)
    print(f)

In [5]:
def generate_full_text_query(input: str) -> str:
    """Turn free text into a fuzzy full-text query string.

    The text is passed through ``remove_lucene_chars`` and split on
    whitespace; every resulting word gets a ``~2`` suffix and the words are
    joined with ``" AND "``. The generated query is printed before being
    returned. An input that yields no words produces ``""``.
    """
    tokens = list(filter(None, remove_lucene_chars(input).split()))
    if len(tokens) == 0:
        return ""
    fuzzy_terms = (f"{token}~2" for token in tokens)
    full_text_query = " AND ".join(fuzzy_terms)
    print(f"Generated Query: {full_text_query}")
    return full_text_query.strip()

# Fulltext index query
def graph_retriever(question: str, graph, gliner_model, entities) -> str:
    """
    Collects the neighborhood of entities mentioned
    in the question

    Entity mentions are extracted from ``question`` with
    ``gliner_model.predict_entities(question, entities)``. For each mention the
    'keyword' full-text index is queried (top 2 nodes) and up to 50 incoming
    and outgoing relationships whose type is not MENTIONS are rendered as
    ``"<source id> - <TYPE> -> <target id>"`` lines.

    Args:
        question: Natural-language question to analyse.
        graph: Object exposing ``query(cypher, params)`` that returns an
            iterable of rows with an ``'output'`` key.
        gliner_model: Model exposing ``predict_entities(text, labels)``.
        entities: Labels handed to ``predict_entities``.

    Returns:
        All relationship lines joined with newlines; ``""`` when nothing
        was found.

    Raises:
        Exception: the error of the last attempt is re-raised once a query
            has failed 5 times in a row.
    """
    mentions = [entity["text"] for entity in gliner_model.predict_entities(question, entities)]
    print(mentions)

    max_attempts = 5
    collected = []
    for mention in mentions:
        full_text_query = generate_full_text_query(mention)
        if not full_text_query:
            # The mention contained nothing searchable; there is no query to run.
            continue
        for attempt in range(1, max_attempts + 1):
            try:
                response = graph.query(
                    """CALL db.index.fulltext.queryNodes('keyword', $query, {limit:2})
                    YIELD node,score
                    CALL {
                      WITH node
                      MATCH (node)-[r:!MENTIONS]->(neighbor)
                      RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
                      UNION ALL
                      WITH node
                      MATCH (node)<-[r:!MENTIONS]-(neighbor)
                      RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
                    }
                    RETURN output LIMIT 50
                    """,
                    {"query": full_text_query},
                )
                # Materialise first so a malformed row cannot leave partial
                # results behind before the retry.
                lines = [el['output'] for el in response]
                collected.extend(lines)
                # Stop retrying on success; without this the loop never ended
                # and kept re-appending the same results.
                break
            except Exception as e:
                print(f"Attempt {attempt} failed due to {type(e).__name__}: {str(e)}")

                if attempt == max_attempts:
                    print("Max attempts reached. Unable to run the query.")
                    raise
    # Joining once at the end keeps the results of consecutive entities on
    # separate lines.
    return "\n".join(collected)

def graph_retriever_2(question: str, graph, gliner_model, entities) -> str:
    """
    Diagnostic variant of the graph retriever.

    Extracts entity mentions from ``question`` with
    ``gliner_model.predict_entities(question, entities)``, looks each one up
    in the 'keyword' full-text index (top 2 nodes) and prints the id, title,
    file name and score of every hit. Each query is attempted up to 5 times;
    the last error is re-raised after the fifth failure.

    The returned string concatenates the ``'output'`` values of any rows that
    carry that key; rows without it contribute nothing.
    """
    predictions = gliner_model.predict_entities(question, entities)
    mentions = [prediction["text"] for prediction in predictions]
    print(mentions)

    max_attempts = 5
    pieces = []
    for mention in mentions:
        for attempt in range(1, max_attempts + 1):
            try:
                rows = graph.query(
                    """CALL db.index.fulltext.queryNodes('keyword', $query, {limit:2})
                    YIELD node, score
                    RETURN node, score
                    """,
                    {"query": generate_full_text_query(mention)},
                )

                print("Nodes and Scores:")
                for row in rows:
                    hit = row['node']
                    hit_score = row['score']
                    print(
                        f"Node ID: {hit.get('id')}, Title: {hit.get('title')}, "
                        f"Name: {hit.get('file_name')}, Score: {hit_score}"
                    )

                outputs = [row['output'] for row in rows if 'output' in row]
                pieces.append("\n".join(outputs))
            except Exception as error:
                print(f"Attempt {attempt} failed due to {type(error).__name__}: {str(error)}")

                if attempt == max_attempts:
                    print("Max attempts reached. Unable to run the query.")
                    raise
            else:
                # The query and the reporting both succeeded: no retry needed.
                break
    return "".join(pieces)

def full_retriever(question: str, graph_store=None, ner_model=None, entity_labels=None, retriever=None):
    """Combine graph-neighbourhood and vector-search context for a question.

    ``graph_retriever`` needs a graph, a NER model and entity labels in
    addition to the question; the previous body passed only the question and
    therefore always failed with a TypeError. Every dependency can now be
    given explicitly; anything left as ``None`` is looked up among the
    notebook globals.

    Args:
        question: Natural-language question.
        graph_store: Graph to query; defaults to the global ``graph``.
        ner_model: Entity extraction model; defaults to the global
            ``gliner_model``.
        entity_labels: Labels for entity extraction; defaults to the global
            ``node_types``.
        retriever: Object whose ``invoke(question)`` returns items with a
            ``page_content`` attribute; defaults to the global
            ``vector_retriever``.

    Returns:
        A single string containing the graph data followed by the vector
        data, with the vector documents joined by ``"#Document "``.

    Raises:
        KeyError: if a dependency is omitted and the matching notebook
            global has not been defined yet.
    """
    # Resolved at call time so the function can be defined before the cells
    # that create these objects have run.
    notebook_state = globals()
    if graph_store is None:
        graph_store = notebook_state["graph"]
    if ner_model is None:
        ner_model = notebook_state["gliner_model"]
    if entity_labels is None:
        entity_labels = notebook_state["node_types"]
    if retriever is None:
        retriever = notebook_state["vector_retriever"]

    graph_data = graph_retriever(question, graph_store, ner_model, entity_labels)
    vector_data = [el.page_content for el in retriever.invoke(question)]
    final_data = f"""Graph data:
    {graph_data}
    vector data:
    {"#Document ". join(vector_data)}
        """
    return final_data

In [6]:
# Load environment variables via python-dotenv before any cell reads them
# (HF_TOKEN is read with os.getenv when the LLM is created).
load_dotenv()

True

In [24]:
# LLM: Llama 3.1 8B Instruct loaded through getHuggingFaceModel (4-bit quantised, on CUDA).
# The Hugging Face token comes from the HF_TOKEN environment variable.
tokenizer, model, pipe, llm = getHuggingFaceModel(model_id="meta-llama/Meta-Llama-3.1-8B-Instruct", hf_token = os.getenv("HF_TOKEN"))
# Graph extractor built on that LLM; note it is imported from a module named `llm`.
llm_transformer = LLMGraphTransformer(llm=llm)

# Embedding model used later for the Neo4j vector index.
model_name = "nomic-ai/nomic-embed-text-v1.5"
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"trust_remote_code": True}
)

# NER model used by the graph retrievers to find entity mentions in a question.
gliner_model = GLiNER.from_pretrained("urchade/gliner_multi")

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.34s/it]


[2024-09-11 17:22:58,969] [INFO] [sentence_transformers.SentenceTransformer.__init__:113] [PID:19406] Load pretrained SentenceTransformer: nomic-ai/nomic-embed-text-v1.5
[2024-09-11 17:23:02,703] [INFO] [sentence_transformers.SentenceTransformer.__init__:219] [PID:19406] Use pytorch device_name: cuda


Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 40524.68it/s]


In [9]:
# Alternative single-file loader, kept for reference:
# loader = TextLoader(file_path="dummytext.txt")
# docs = loader.load()
# Documents from the "crawl" directory; not referenced again in the visible cells.
crawl_documents = getDocuments("crawl")

# Cleaning with the LLM is disabled; previously cleaned files are loaded from disk instead.
# NOTE(review): the disabled call passes `documents`, which is not defined yet at this
# point - presumably it should receive `crawl_documents`; confirm before re-enabling.
# cleaned_documents = cleanDocuments(llm, documents)
cleaned_documents = getDocuments("cleaned_documents") # swap for the cleanDocuments call above to re-clean

# Document splitting (disabled): would split the cleaned documents into 750-character
# chunks with a 20% overlap.
# chuck_size = 750
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=chuck_size, chunk_overlap=chuck_size*0.2)
# documents = text_splitter.split_documents(documents=cleaned_documents)
# print(f"Went from {len(cleaned_documents)} documents to {len(documents)} splitted documents")

# Work on a hand-picked subset of the cleaned documents, selected by file name.
selected_file_names = ["155.txt", "60.txt", "42.txt", "99.txt", "124.txt", "117.txt", "207.txt", "8.txt", "77.txt"]
selected_documents = [d for d in cleaned_documents if d.metadata["file_name"] in selected_file_names]
documents = selected_documents

In [44]:
# JSON Schema describing the structured output requested from Jsonformer:
# an object with a required "triplets" array of five-field triplet objects.
JSON_object_schema = {
    "type": "object",
    "properties": {
        "triplets": {
            "type": "array",
            # `minItems` is an array keyword, so it sits next to "type": "array";
            # it used to be nested inside the item object, where it does not
            # constrain the number of triplets.
            # NOTE(review): confirm that Jsonformer enforces minItems - the recorded
            # run of this cell returned a single triplet.
            "minItems": 5,
            "items": {
                "type": "object",
                "properties": {
                    "head": { "type": "string" },
                    "head_type": { "type": "string" },
                    "relation": { "type": "string" },
                    "tail": { "type": "string" },
                    "tail_type": { "type": "string" }
                },
                "required": ["head", "head_type", "relation", "tail", "tail_type"]
            }
        }
    },
    "required": ["triplets"]
}


# Extraction prompt for a single document.
# NOTE(review): documents[1] is hard-coded and raises IndexError when fewer than two
# documents were selected. The prompt text also contains "moreknowledge" (missing
# space); it is left untouched here so the model input stays exactly as recorded.
prompt = f"""
Extract at least 5 triplets from the following text in the format of head node, head node type, relation, tail node, and tail node type. 
Ensure the extracted triplets follow this JSON schema structure:

1. Each triplet must include:
   - head: the subject entity
   - head_type: type/category of the head entity
   - relation: the relationship between the head and tail
   - tail: the object entity
   - tail_type: type/category of the tail entity

2. The output must strictly follow this format in the 'triplets' array, without any additional information.

Now, generate the 5 or moreknowledge graph triplets based on the provided text, formatted according to this schema:

{documents[1].page_content}
"""

# Generate output that follows the schema, using the raw model and tokenizer.
jsonformer = Jsonformer(model, tokenizer, JSON_object_schema, prompt)
generated_data = jsonformer()

In [47]:
# Show the triplets Jsonformer generated (None if the "triplets" key is missing).
generated_data.get("triplets")

[{'head': 'Degree Programme',
  'head_type': 'Academic Programme',
  'relation': 'sets',
  'tail': 'education objectives',
  'tail_type': 'Learning Outcomes'}]

In [10]:
# Extract graph documents (nodes and relationships) with the LLM transformer;
# the slice caps the input at the first 50 documents.
graph_documents = llm_transformer.convert_to_graph_documents(documents[:50])



155.txt
42.txt
99.txt
124.txt
117.txt
60.txt
207.txt
8.txt
77.txt


In [20]:
# Inspect the extracted graph: print each document's node count followed by its nodes.
# The filter for invalid nodes (empty or missing id/type) is disabled below, so nothing
# is removed here.
# NOTE(review): if it is re-enabled, calling d.nodes.remove(n) while iterating over
# d.nodes skips elements; build a filtered list instead.
for d in graph_documents:
    print(len(d.nodes))
    for n in d.nodes:
        print(n)
        # if n.id == "" or n.id is None or n.type == "" or n.type is None:
        #     print(n)
        #     d.nodes.remove(n)

0
0
0
0
0
0
0
0
0


In [21]:
# Connection settings are read from the environment (populated by load_dotenv() in an
# earlier cell) so that no secret is stored in the notebook.
# SECURITY: a database password used to be hard-coded on this line; it has been exposed
# to anyone with access to the notebook or its history and should be rotated.
# NEO4J_PASSWORD is required (KeyError if unset); URI and user keep their previous values
# as defaults.
graph = Neo4jGraph(
    url=os.getenv("NEO4J_URI", "neo4j+s://bbef2ff2.databases.neo4j.io"),
    username=os.getenv("NEO4J_USERNAME", "neo4j"),
    password=os.environ["NEO4J_PASSWORD"],
)
# Persist the extracted graph documents.
# NOTE(review): include_source=True presumably links each source document to its
# entities via MENTIONS relationships (the retriever queries exclude that type) - confirm.
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)

In [None]:
# Build a hybrid-search vector index over the "Document" nodes: their "text" property is
# the text source and the "embedding" property is where vectors are stored.
# NOTE(review): no url/username/password is passed here - presumably the connection is
# taken from NEO4J_* environment variables; confirm they are set.
vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)
# Retriever interface over the index (exposes .invoke(question)).
vector_retriever = vector_index.as_retriever()

In [None]:
# Distinct node types and relationship types found in the extracted graph documents;
# node_types is later passed to the graph retriever as the NER label set.
node_types, relationships = extractNodesAndRelationships(graph_documents)

In [None]:
# Display the collected node types.
node_types

In [None]:
# General-knowledge sample questions mentioning people, organisations and works,
# keyed by a running number. Not referenced by any other visible cell.
questions = {
    1: "Who is the head of the Computer Science department at Stanford University?",
    2: "What do you know about Elon Musk and SpaceX?",
    3: "Could you tell me who is the CEO of Tesla and what the name of the founder of SpaceX is?",
    4: "I am looking for information about the current President of the United States and the Prime Minister of the United Kingdom.",
    5: "Who wrote the book 'To Kill a Mockingbird' and what is the name of the author of '1984'?",
    6: "Can you provide the names of the actors who starred in the film 'Inception' and who directed it?",
    7: "I want to know who the Chancellor of Germany is and what the name of the leader of the French Republic is.",
    8: "Please give me the names of the people who won the Nobel Prize in Literature in 2023 and 2022.",
    9: "What are the names of the characters portrayed by Keanu Reeves in the Matrix series and who is the director of the film?",
    10: "Can you identify the name of the founder of Microsoft and the person who is the current Secretary-General of the United Nations?",
    11: "Tell me who the lead vocalist of the band Coldplay is and the name of the band that performed 'Bohemian Rhapsody'.",
    12: "Who are the authors of the books 'The Catcher in the Rye' and 'Pride and Prejudice'?",
    13: "Where is the Università di Bologna?"
}

# Questions a university student might ask, keyed by a running number.
# Only used by the commented-out entity-extraction loop in the next cell.
student_questions = {
    1: "Where can I find the academic calendar and important deadlines?",
    2: "How do I register for classes, and what is the process for adding or dropping courses?",
    3: "What are the requirements for my major, and where can I find my degree plan?",
    4: "Where is the library, and how do I access online resources or research databases?",
    5: "How can I contact my academic advisor, and when should I meet with them?",
    6: "What is the best way to get involved in student organizations or extracurricular activities?",
    7: "Where can I find information about on-campus housing and meal plans?",
    8: "How do I get my student ID card, and what is it used for?",
    9: "What health and wellness services are available on campus?",
    10: "Where do I go if I need help with my coursework or tutoring services?",
    11: "How do I apply for financial aid, and where can I check my student account or pay tuition?",
    12: "What are the university’s policies on academic integrity and plagiarism?",
    13: "Where is the career center, and how can it help me with internships or job placements?",
    14: "What campus safety resources are available, and how can I contact campus security?",
    15: "How do I access the university’s transportation system or find parking on campus?"
}


In [None]:
# for q in student_questions.values():
#     entities = model.predict_entities(q, labels)
    
#     print("\n" + q)
#     for entity in entities:
#         print(entity["text"], "=>", entity["label"])

In [None]:
# Run the diagnostic graph retriever on a sample question, using the extracted node
# types as NER labels. The function prints the matched nodes itself; the string it
# returns is printed after the "Output: " prefix.
print("Output: " + graph_retriever_2("Does Unibo has a Youtube account?", graph, gliner_model, node_types))