# Importing Modules

In [1]:
from transformers import pipeline
import torch
import numpy as np
import random
import os
from llama_index.core.graph_stores import SimpleGraphStore
from pprint import pprint
from typing import List, Dict

  from .autonotebook import tqdm as notebook_tqdm


# Config

In [2]:
class Config:
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    seed = 42

# Loading Rebel Model

In [3]:
triplet_extractor = pipeline(
    'text2text-generation', 
    model='Babelscape/rebel-large', 
    tokenizer='Babelscape/rebel-large',
)

# Helper functions

In [4]:
def extract_triplets(text):
    """
    
    Function to extract triplets from a text chunk
    
    """
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
    return triplets

def set_seed(cls, seed: int = Config.seed):
    """
    
    Function to set the seed for the entire notebook
    
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(Config.seed)

# Running a sample example

In [5]:
text = "Punta Cana is a resort town in the municipality of Higuey, in La Altagracia Province, the eastern most province of the Dominican Republic"
extracted_text = triplet_extractor.tokenizer.batch_decode(
    [
        triplet_extractor(
            text, 
            return_tensors=True, 
            return_text=False
        )[0]["generated_token_ids"]
    ]
)

In [6]:
extracted_text[0]

'<s><triplet> Punta Cana <subj> La Altagracia Province <obj> located in the administrative territorial entity <subj> Dominican Republic <obj> country <triplet> Higuey <subj> La Altagracia Province <obj> located in the administrative territorial entity <subj> Dominican Republic <obj> country <triplet> La Altagracia Province <subj> Dominican Republic <obj> country <triplet> Dominican Republic <subj> La Altagracia Province <obj> contains administrative territorial entity</s>'

In [7]:
extracted_triplets = extract_triplets(extracted_text[0])
print(extracted_triplets)

[{'head': 'Punta Cana', 'type': 'located in the administrative territorial entity', 'tail': 'La Altagracia Province'}, {'head': 'Punta Cana', 'type': 'country', 'tail': 'Dominican Republic'}, {'head': 'Higuey', 'type': 'located in the administrative territorial entity', 'tail': 'La Altagracia Province'}, {'head': 'Higuey', 'type': 'country', 'tail': 'Dominican Republic'}, {'head': 'La Altagracia Province', 'type': 'country', 'tail': 'Dominican Republic'}, {'head': 'Dominican Republic', 'type': 'contains administrative territorial entity', 'tail': 'La Altagracia Province'}]


In [8]:
for et in extracted_triplets:
    print(et)

{'head': 'Punta Cana', 'type': 'located in the administrative territorial entity', 'tail': 'La Altagracia Province'}
{'head': 'Punta Cana', 'type': 'country', 'tail': 'Dominican Republic'}
{'head': 'Higuey', 'type': 'located in the administrative territorial entity', 'tail': 'La Altagracia Province'}
{'head': 'Higuey', 'type': 'country', 'tail': 'Dominican Republic'}
{'head': 'La Altagracia Province', 'type': 'country', 'tail': 'Dominican Republic'}
{'head': 'Dominican Republic', 'type': 'contains administrative territorial entity', 'tail': 'La Altagracia Province'}


# Loading Embedding Model

In [9]:
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer('BAAI/bge-reranker-base')

No sentence-transformers model found with name BAAI/bge-reranker-base. Creating a new one with MEAN pooling.
Some weights of XLMRobertaModel were not initialized from the model checkpoint at BAAI/bge-reranker-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
triplets = []
for et in extracted_triplets:
    triplets.append(str(tuple(et.values())))
triplets

["('Punta Cana', 'located in the administrative territorial entity', 'La Altagracia Province')",
 "('Punta Cana', 'country', 'Dominican Republic')",
 "('Higuey', 'located in the administrative territorial entity', 'La Altagracia Province')",
 "('Higuey', 'country', 'Dominican Republic')",
 "('La Altagracia Province', 'country', 'Dominican Republic')",
 "('Dominican Republic', 'contains administrative territorial entity', 'La Altagracia Province')"]

In [11]:
queries = ['What is Punta Cana?', 'Where is Punta Cana located?']
instruction = "Generate a representation for this sentence for use in retrieving related articles."

q_embeddings = embedder.encode([instruction+q for q in queries], normalize_embeddings=True)
p_embeddings = embedder.encode(triplets, normalize_embeddings=True)
scores = q_embeddings @ p_embeddings.T
scores

array([[0.99166465, 0.9725458 , 0.9970554 , 0.850217  , 0.94347656,
        0.95927215],
       [0.98890257, 0.96781373, 0.99530137, 0.8407945 , 0.9369699 ,
        0.9536396 ]], dtype=float32)

In [12]:
for idx, query in enumerate(queries):
    scores_mapper = dict(sorted(dict(zip(triplets, scores[idx].tolist())).items(), key = lambda x: x[1]))
    print(query)
    print("="*50)
    pprint(scores_mapper)

What is Punta Cana?
{"('Dominican Republic', 'contains administrative territorial entity', 'La Altagracia Province')": 0.9592721462249756,
 "('Higuey', 'country', 'Dominican Republic')": 0.8502169847488403,
 "('Higuey', 'located in the administrative territorial entity', 'La Altagracia Province')": 0.9970554113388062,
 "('La Altagracia Province', 'country', 'Dominican Republic')": 0.9434765577316284,
 "('Punta Cana', 'country', 'Dominican Republic')": 0.9725458025932312,
 "('Punta Cana', 'located in the administrative territorial entity', 'La Altagracia Province')": 0.9916646480560303}
Where is Punta Cana located?
{"('Dominican Republic', 'contains administrative territorial entity', 'La Altagracia Province')": 0.9536396265029907,
 "('Higuey', 'country', 'Dominican Republic')": 0.8407945036888123,
 "('Higuey', 'located in the administrative territorial entity', 'La Altagracia Province')": 0.995301365852356,
 "('La Altagracia Province', 'country', 'Dominican Republic')": 0.9369698762893

# Building KG

In [13]:
from llama_index.core import KnowledgeGraphIndex
from llama_index.core.graph_stores import SimpleGraphStore
from llama_index.core import StorageContext
from llama_index.core import SimpleDirectoryReader

In [14]:
document_path = "Data/"

def metadata_generator(filename: str) -> Dict:
    """
    
    Callback function to generate metadata from the textchunks
    
    """
    document_name = filename.split("/")[-1].split(".")[0].strip()
    return {
        "document_name": document_name
    }

document_loader = SimpleDirectoryReader(
    input_dir = document_path,
    filename_as_id = True,
    file_metadata = metadata_generator
)
documents = document_loader.load_data()
len(documents)

8

In [15]:
documents

[Document(id_='/mnt/d/side-projects/TextCraft-SynthoSeeker/service/Notebooks/Data/1_bit_net_paper.pdf_part_0', embedding=None, metadata={'page_label': '1', 'file_name': '/mnt/d/side-projects/TextCraft-SynthoSeeker/service/Notebooks/Data/1_bit_net_paper.pdf', 'document_name': '1_bit_net_paper'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='The Era of 1-bit LLMs:\nAll Large Language Models are in 1.58 Bits\nShuming Ma∗Hongyu Wang∗Lingxiao Ma Lei Wang Wenhui Wang\nShaohan Huang Li Dong Ruiping Wang Jilong Xue Furu Wei⋄\nhttps://aka.ms/GeneralAI\nAbstract\nRecent research, such as BitNet [ WMD+23], is paving the way for a new era of 1-\nbit Large Language Models (LLMs). In this work, we introduce a 1-bit LLM variant,\nnamely BitNet b1.58 , in which every 

In [16]:
!pip install bitsandbytes accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [18]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.service_context import ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import PromptTemplate
from huggingface_hub import login
    

login("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")

# Transform a string into input zephyr-specific input
def completion_to_prompt(completion):
    return f"<|system|>\n</s>\n<|user|>\n{completion}</s>\n<|assistant|>\n"


# Transform a list of chat messages into zephyr-specific input
def messages_to_prompt(messages):
    prompt = ""
    for message in messages:
        if message.role == "system":
            prompt += f"<|system|>\n{message.content}</s>\n"
        elif message.role == "user":
            prompt += f"<|user|>\n{message.content}</s>\n"
        elif message.role == "assistant":
            prompt += f"<|assistant|>\n{message.content}</s>\n"

    # ensure we start with a system prompt, insert blank if needed
    if not prompt.startswith("<|system|>\n"):
        prompt = "<|system|>\n</s>\n" + prompt

    # add final assistant prompt
    prompt = prompt + "<|assistant|>\n"

    return prompt


embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-reranker-base"
)
# llm = HuggingFaceLLM(
#     model_name="google/gemma-2b",
#     tokenizer_name="google/gemma-2b",
#     context_window=3900,
#     max_new_tokens=256,
#     generate_kwargs={"temperature": 0.0, "top_k": 50, "top_p": 0.95},
#     messages_to_prompt=messages_to_prompt,
#     completion_to_prompt=completion_to_prompt,
#     device_map="auto",
# )

graph_store = SimpleGraphStore()
storage_context = StorageContext.from_defaults(graph_store=graph_store)
service_context = ServiceContext.from_defaults(
    # llm=llm, 
    embed_model=embed_model
)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/hotson/.cache/huggingface/token
Login successful


Some weights of XLMRobertaModel were not initialized from the model checkpoint at BAAI/bge-reranker-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  service_context = ServiceContext.from_defaults(


ValueError: 
******
Could not load OpenAI model. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

To disable the LLM entirely, set llm=None.
******

In [None]:
from llama_index.core import PromptTemplate


# Transform a string into input zephyr-specific input
def completion_to_prompt(completion):
    return f"<|system|>\n</s>\n<|user|>\n{completion}</s>\n<|assistant|>\n"


# Transform a list of chat messages into zephyr-specific input
def messages_to_prompt(messages):
    prompt = ""
    for message in messages:
        if message.role == "system":
            prompt += f"<|system|>\n{message.content}</s>\n"
        elif message.role == "user":
            prompt += f"<|user|>\n{message.content}</s>\n"
        elif message.role == "assistant":
            prompt += f"<|assistant|>\n{message.content}</s>\n"

    # ensure we start with a system prompt, insert blank if needed
    if not prompt.startswith("<|system|>\n"):
        prompt = "<|system|>\n</s>\n" + prompt

    # add final assistant prompt
    prompt = prompt + "<|assistant|>\n"

    return prompt


import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings

Settings.llm = HuggingFaceLLM(
    model_name="google/gemma-2b",
    tokenizer_name="google/gemma-2b",
    context_window=3900,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    device_map="auto",
)

ModuleNotFoundError: No module named 'llama_index.llms.huggingface'

In [None]:
from llama_index.core import Settings

# Settings.llm = None

index = KnowledgeGraphIndex.from_documents(
    documents = documents,
    max_triplets_per_chunk = 256,
    kg_triplet_extract_fn = extract_triplets,
    storage_context = storage_context,
    include_embeddings = True
)

In [None]:
index.as_retriever().retrieve("1 bit")

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk_amsld******************lsmd. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}