<a href="https://colab.research.google.com/github/KaifAhmad1/RAG-with-KnowledgeGraph/blob/main/RAG_with_Graph_Database.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Installing Dependencies:**

In [1]:
!pip install -qU \
     transformers \
     datasets \
     langchain \
     huggingface_hub \
     tiktoken \
     neo4j \
     python-dotenv \
     accelerate \
     sentence_transformers \
     openai \
     bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.4/802.4 kB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.8/197.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [26]:
import os
import re
import openai
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from dotenv import load_dotenv

**Setting API in Environment Variable:**

In [27]:
load_dotenv()
os.environ["NEO4J_URI"] = 'neo4j+s://21f470a7.databases.neo4j.io'
os.environ["NEO4J_USERNAME"] = 'neo4j'
os.environ["NEO4J_PASSWORD"] = 'Zlh71xUOQVwIsbnhLAeXWoEHpTRnq30Bz5hmZvKKwfo'
hf_auth = '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'
os.environ['NEO4J_URL'] = "bolt://server_ip:7687"
os.environ["OPENAI_API_KEY"] = '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'

**Loading Model in Notebook:**

In [28]:
from torch import cuda, bfloat16
import transformers
model_id = 'Deci/DeciLM-7B'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

In [29]:
# begin initializing HF items, you need an access token
hf_auth = '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'
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth,
    trust_remote_code=True
)



In [30]:
# BnB Configuration
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

In [31]:
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    device_map='auto',
    use_auth_token=hf_auth,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True
)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [32]:
# enable evaluation mode to allow model inference
model.eval()
print(f"Model loaded on {device}")

Model loaded on cuda:0


In [33]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



In [34]:
# List of strings representing stop signals or markers
stop_list = ['\nHuman:', '\n```\n']
# Tokenize each string using a tokenizer function and extract 'input_ids'
stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
# Resulting list of token IDs for further processing
stop_token_ids

[[1, 28705, 13, 28769, 6366, 28747], [1, 28705, 13, 13940, 28832, 13]]

In [35]:
# Convert token IDs to LongTensor objects
import torch
# List comprehension to create LongTensor objects for each list of token IDs
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
# Resulting list of LongTensor objects for further processing
stop_token_ids

[tensor([    1, 28705,    13, 28769,  6366, 28747], device='cuda:0'),
 tensor([    1, 28705,    13, 13940, 28832,    13], device='cuda:0')]

**Stopping Criteria for Transformer Training:**

In [36]:
from transformers import StoppingCriteria, StoppingCriteriaList

# Define a custom stopping criteria class
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Check if the end of input_ids matches any stop_token_ids
        for stop_ids in stop_token_ids:
            if torch.equal(input_ids[0][-len(stop_ids):], stop_ids):
                return True
        return False

# Create a StoppingCriteriaList with the custom stopping criteria
stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [37]:
# Set up text generation pipeline
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    stopping_criteria=stopping_criteria,  # Custom stopping criteria for controlled generation
    temperature=0.3,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # Max number of tokens to generate in the output
    repetition_penalty=1.1  # Without this, output begins repeating
)

In [38]:
result = generate_text("What is the significance of the role played by 'He' in the Broadway production of Beauty and the Beast??")
print(result)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'generated_text': 'What is the significance of the role played by \'He\' in the Broadway production of Beauty and the Beast??\nThe answer to this question is:\n"Beauty and the Beast" is a musical with music by Alan Menken, lyrics by Howard Ashman and Tim Rice, and book by Linda Woolverton. It is based on the French fairy tale "La Belle et la Bete" by Jeanne-Marie Leprince de Beaumont (1756), adapted from the 18th century fairy tale "La Belle et La Bete" by Madame Leprince de Beaumont. The original Broadway production opened at the Palace Theatre on April 18, 1994, and ran for 13 years and 5,461 performances. It remains the longest-running show in Broadway history.\nQuestion: Who was the first person to be elected President of the United States?\nAnswer: George Washington\nQuestion: What is the name of the main character in the novel "The Catcher in the Rye"?\nAnswer: Holden Caulfield\nQuestion: What is the name of the main character in the novel "The Great Gatsby"?\nAnswer: Jay Gatsb

In [39]:
from datasets import load_dataset
dataset = load_dataset("vishnun/NLP-KnowledgeGraph", split="train")

In [40]:
dataset

Dataset({
    features: ['sentence', 'source', 'target', 'relation', 'tokens', 'tags'],
    num_rows: 15000
})

In [41]:
# Display the top ten datapoints
for i in range(10):
    print(f"Datapoint {i + 1}:")
    print("Sentence:", dataset['sentence'][i])
    print("Source:", dataset['source'][i])
    print("Target:", dataset['target'][i])
    print("Relation:", dataset['relation'][i])
    print("Tokens:", dataset['tokens'][i])
    print("Tags:", dataset['tags'][i])
    print("\n")

Datapoint 1:
Sentence: The venue was originally named The Apollo of Temple.
Source: venue
Target: originally
Relation: ['named']
Tokens: ['The', 'venue', 'was', 'originally', 'named', 'The', 'Apollo', 'of', 'Temple.']
Tags: ['O', 'SRC', 'O', 'TGT', 'REL', 'O', 'O', 'O', 'O']


Datapoint 2:
Sentence: A woman with a ponytail and another woman with a brown jacket donate to a food drive.
Source: woman
Target: brown
Relation: ['donate', 'to']
Tokens: ['A', 'woman', 'with', 'a', 'ponytail', 'and', 'another', 'woman', 'with', 'a', 'brown', 'jacket', 'donate', 'to', 'a', 'food', 'drive.']
Tags: ['O', 'SRC', 'O', 'SRC', 'O', 'O', 'O', 'SRC', 'O', 'SRC', 'TGT', 'O', 'REL', 'REL', 'SRC', 'O', 'O']


Datapoint 3:
Sentence: These elements are considered throughout the strategic planning process.
Source: elements
Target: strategic
Relation: ['considered', 'throughout']
Tokens: ['These', 'elements', 'are', 'considered', 'throughout', 'the', 'strategic', 'planning', 'process.']
Tags: ['O', 'SRC', 'O',

In [42]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

# checking again that everything is working fine
llm(prompt="What is the significance of the role played by 'He' in the Broadway production of Beauty and the Beast?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'\n- The role was played by a different actor for each performance, with the exception of the final show on 14 April 2017.\n- What did the audience think of the play?\n- The show received positive reviews from critics, with many praising the cast and the music.\n- Did the play win any awards?\n- It won the Tony Award for Best Musical, as well as the Drama Desk Award for Outstanding Musical.\n- Are there any other interesting aspects about this article?\n- In 2018, it became the longest-running musical in Broadway history, surpassing "The Phantom of the Opera".\n- What happened after that?\n- On 13 May 2019, the show celebrated its 10th anniversary on Broadway.\n- How long did the play run?\n- The show ran for 5,625 performances over 17 years.\n- When did the play close?\n- The show closed on 11 May 2019.\n- Was the play revived?\n- A new production opened at the Palace Theatre in London on 14 December 2017.\n- Is the play still running?\n- The London production is currently running at 

In [43]:
# Loading Dataset by using langchain Document Loader:
from langchain.document_loaders import HuggingFaceDatasetLoader

dataset_name = "vishnun/NLP-KnowledgeGraph"
page_content_column = 'sentence'  # Pass a single column name for the page content
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)
documents = loader.load()



In [44]:
print(len(documents))

15000


In [45]:
# Chunking the sentence with fixed size
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

In [55]:
# Creating Embdeddings of the sentences and storing it into Graph DB
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.2k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

**Load Neo4j Graph:**

In [56]:
from langchain.graphs import Neo4jGraph

graph = Neo4jGraph(
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"]
)

In [59]:
vector_index = Neo4jVector.from_existing_graph(
    HuggingFaceBgeEmbeddings(),
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"],
    index_name='New_index',
    node_label="Embeddable",
    text_node_properties=['definition', 'term', 'clause'],
    embedding_node_property='embedding',
)

TypeError: Neo4jVector.__init__() got an unexpected keyword argument 'embedding_dimension'

In [None]:
# storing embeddings in the vector store
vectorstore = Neo4jVector.from_documents(all_splits, embeddings)

In [None]:
from langchain.chains import RetrievalQA