<a href="https://colab.research.google.com/github/KaifAhmad1/RAG-with-KnowledgeGraph/blob/main/RAG_with_Graph_Database.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Installing Dependencies:**

In [None]:
!pip install -qU \
     transformers \
     datasets \
     langchain \
     huggingface_hub \
     tiktoken \
     neo4j \
     python-dotenv \
     accelerate \
     sentence_transformers

In [None]:
import os
import re
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from dotenv import load_dotenv

**Setting API in Environment Variable:**

In [None]:
load_dotenv()
os.environ["NEO4J_URI"] = 'neo4j+s://07e87ccd.databases.neo4j.io'
os.environ["NEO4J_USERNAME"] = 'neo4j'
os.environ["NEO4J_PASSWORD"] = 'KmTNiJfRGOwj-O3jN5EIEARHrngbBSgn_YTL8N6zW08'
hf_auth = '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'

**Loading Model in Notebook:**

In [None]:
from torch import cuda, bfloat16
import transformers
model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

In [None]:
# begin initializing HF items, you need an access token
hf_auth = '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'
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



In [None]:
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    device_map='auto',
    use_auth_token=hf_auth,
    low_cpu_mem_usage=True
)



In [None]:
# enable evaluation mode to allow model inference
model.eval()
print(f"Model loaded on {device}")

Model loaded on cpu


In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



In [None]:
# List of strings representing stop signals or markers
stop_list = ['\nHuman:', '\n```\n']
# Tokenize each string using a tokenizer function and extract 'input_ids'
stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
# Resulting list of token IDs for further processing
stop_token_ids

[[1, 29871, 13, 29950, 7889, 29901], [1, 29871, 13, 28956, 13]]

In [None]:
# Convert token IDs to LongTensor objects
import torch
# List comprehension to create LongTensor objects for each list of token IDs
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
# Resulting list of LongTensor objects for further processing
stop_token_ids

[tensor([    1, 29871,    13, 29950,  7889, 29901]),
 tensor([    1, 29871,    13, 28956,    13])]

**Stopping Criteria for Transformer Training:**

In [None]:
from transformers import StoppingCriteria, StoppingCriteriaList

# Define a custom stopping criteria class
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Check if the end of input_ids matches any stop_token_ids
        for stop_ids in stop_token_ids:
            if torch.equal(input_ids[0][-len(stop_ids):], stop_ids):
                return True
        return False

# Create a StoppingCriteriaList with the custom stopping criteria
stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [None]:
# Set up text generation pipeline
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    stopping_criteria=stopping_criteria,  # Custom stopping criteria for controlled generation
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # Max number of tokens to generate in the output
    repetition_penalty=1.1  # Without this, output begins repeating
)

In [None]:
result = generate_text("What is the significance of the role played by 'He' in the Broadway production of Beauty and the Beast??")
print(result)



[{'generated_text': "What is the significance of the role played by 'He' in the Broadway production of Beauty and the Beast??"}]


In [None]:
from datasets import load_dataset
dataset = load_dataset("vishnun/NLP-KnowledgeGraph", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
dataset

Dataset({
    features: ['sentence', 'source', 'target', 'relation', 'tokens', 'tags'],
    num_rows: 15000
})

In [None]:
# Display the top ten datapoints
for i in range(10):
    print(f"Datapoint {i + 1}:")
    print("Sentence:", dataset['sentence'][i])
    print("Source:", dataset['source'][i])
    print("Target:", dataset['target'][i])
    print("Relation:", dataset['relation'][i])
    print("Tokens:", dataset['tokens'][i])
    print("Tags:", dataset['tags'][i])
    print("\n")

Datapoint 1:
Sentence: The venue was originally named The Apollo of Temple.
Source: venue
Target: originally
Relation: ['named']
Tokens: ['The', 'venue', 'was', 'originally', 'named', 'The', 'Apollo', 'of', 'Temple.']
Tags: ['O', 'SRC', 'O', 'TGT', 'REL', 'O', 'O', 'O', 'O']


Datapoint 2:
Sentence: A woman with a ponytail and another woman with a brown jacket donate to a food drive.
Source: woman
Target: brown
Relation: ['donate', 'to']
Tokens: ['A', 'woman', 'with', 'a', 'ponytail', 'and', 'another', 'woman', 'with', 'a', 'brown', 'jacket', 'donate', 'to', 'a', 'food', 'drive.']
Tags: ['O', 'SRC', 'O', 'SRC', 'O', 'O', 'O', 'SRC', 'O', 'SRC', 'TGT', 'O', 'REL', 'REL', 'SRC', 'O', 'O']


Datapoint 3:
Sentence: These elements are considered throughout the strategic planning process.
Source: elements
Target: strategic
Relation: ['considered', 'throughout']
Tokens: ['These', 'elements', 'are', 'considered', 'throughout', 'the', 'strategic', 'planning', 'process.']
Tags: ['O', 'SRC', 'O',

In [None]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

# checking again that everything is working fine
llm(prompt="What is the significance of the role played by 'He' in the Broadway production of Beauty and the Beast?")

  warn_deprecated(


''

In [None]:
# Loading Dataset by using langchain Document Loader:
from langchain.document_loaders import HuggingFaceDatasetLoader

dataset_name = "vishnun/NLP-KnowledgeGraph"
page_content_column = 'sentence'  # Pass a single column name for the page content
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)
documents = loader.load()



In [None]:
print(len(documents))

15000


In [None]:
# Chunking the sentence with fixed size
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

In [None]:
# Creating Embdeddings of the sentences and storing it into Graph DB
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Neo4jVector

model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# storing embeddings in the vector store
vectorstore = Neo4jVector.from_documents(all_splits, embeddings)

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

**Initializing Graph DB:**

In [None]:
# Instantiate Neo4j vector from documents
neo4j_vector = Neo4jVector.from_documents(
    documents,
    HuggingFaceBgeEmbeddings(),
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"]
)

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

KeyboardInterrupt: 