In [2]:
import os
import random
import json
import hashlib
from datasets import load_dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm.auto import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [None]:
# Data preparation

In [3]:
# Both splits are requested, but only `test_data` is used in this cell.
validation_data, test_data = load_dataset("suolyer/pile_wikipedia", split=['validation', 'test'])

# Accumulator that create_chunk_dataset() appends to; reset whenever this cell runs.
data = []

# A dedicated, seeded RNG makes "Restart Kernel -> Run All" sample the same
# articles every time without touching the global `random` state.
SAMPLE_SEED = 42
NUM_SAMPLED_ARTICLES = 10
sample_rng = random.Random(SAMPLE_SEED)
# min() keeps random.sample from raising when the split has fewer rows than requested.
random_rows = sample_rng.sample(
    range(len(test_data)), min(NUM_SAMPLED_ARTICLES, len(test_data))
)
build_data = [test_data[val]['text'] for val in random_rows]

# Module-level MD5 object and BERT tokenizer; `tokenizer` is what bert_len() reads.
m = hashlib.md5()
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def bert_len(text):
    """Return the number of token ids the global `tokenizer` yields for `text`.

    The count is simply ``len(tokenizer.encode(text))``; whatever `encode`
    adds (NOTE(review): probably special tokens - confirm) is included.
    The function reads whichever object the module-level name `tokenizer`
    is bound to at call time.
    """
    return len(tokenizer.encode(text))

def create_chunk_dataset(content):
    """Split one document into chunks and append them to the global `data` list.

    Every chunk is stored as ``{'id': '<uid>-<i>', 'text': <chunk>}`` where
    ``<uid>`` is the first 12 hex characters of the MD5 digest of `content`
    and ``<i>`` is the chunk's index within the document.

    Args:
        content: Full text of a single document (str).

    Returns:
        None. `data` is extended in place.
    """
    # Hash this document on its own. Calling update() on a shared hashlib
    # object would fold in every document processed earlier, so the id would
    # depend on processing order instead of on the document itself.
    uid = hashlib.md5(content.encode('utf-8')).hexdigest()[:12]

    # Chunk length is measured with bert_len (tokens), not characters.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=40,
        length_function=bert_len,
        separators=['\n\n', '\n', ' ', ''],
    )
    for i, chunk in enumerate(text_splitter.split_text(content)):
        data.append({'id': f'{uid}-{i}', 'text': chunk})

# Chunk every sampled article; results accumulate in `data`.
for dt in build_data:
    create_chunk_dataset(dt)

filename = './kg/data/knowledge graphs/rebel_llamaindex/wiki_chunks.json'
# open() does not create parent folders, so make sure the target directory exists.
os.makedirs(os.path.dirname(filename), exist_ok=True)
# save: one JSON object per line (JSON Lines layout, despite the .json suffix)
with open(filename, 'w', encoding='utf-8') as outfile:
    for x in data:
        outfile.write(json.dumps(x) + '\n')

In [None]:
# REBEL

In [4]:
def extract_triplets(text):
    """Parse a linearised REBEL-style string into a list of triple dicts.

    The text is scanned token by token (whitespace split) after removing the
    ``<s>``, ``<pad>`` and ``</s>`` markers:

    * ``<triplet>`` starts collecting a new head,
    * ``<subj>`` starts collecting a tail,
    * ``<obj>`` starts collecting a relation type.

    Whenever ``<triplet>`` or ``<subj>`` is met while a relation is pending,
    the current head/type/tail are emitted as they stand. At the end of the
    text a final triple is emitted only if all three parts are non-empty.

    Args:
        text: Decoded model output containing the marker tokens.

    Returns:
        List of ``{'head': str, 'type': str, 'tail': str}`` dicts, in order
        of appearance.
    """
    found = []
    # Token buffers keyed by parser state: 't' -> head, 's' -> tail, 'o' -> relation.
    buffers = {'t': [], 's': [], 'o': []}
    state = 'x'  # no marker seen yet; plain tokens are ignored in this state

    def emit():
        found.append({
            'head': ' '.join(buffers['t']),
            'type': ' '.join(buffers['o']),
            'tail': ' '.join(buffers['s']),
        })

    cleaned = text.strip()
    for marker in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(marker, "")

    for piece in cleaned.split():
        if piece == "<triplet>":
            state = 't'
            if buffers['o']:
                emit()
                buffers['o'] = []
            buffers['t'] = []
        elif piece == "<subj>":
            state = 's'
            # The pending relation is emitted but intentionally not cleared here.
            if buffers['o']:
                emit()
            buffers['s'] = []
        elif piece == "<obj>":
            state = 'o'
            buffers['o'] = []
        elif state in buffers:
            buffers[state].append(piece)

    if all(buffers.values()):
        emit()
    return found

# Load the REBEL tokenizer and seq2seq model.
# NOTE: this rebinds the global name `tokenizer`, which bert_len() also reads.
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

# Keyword arguments forwarded to model.generate() by generate_triples().
gen_kwargs = dict(
    max_length=256,
    length_penalty=0,
    num_beams=3,
    num_return_sequences=1,
)

# (head, type, tail) tuples collected by generate_triples().
triples = list()

def generate_triples(texts):
    """Run the REBEL model on a batch of texts and collect the decoded triples.

    Args:
        texts: Batch of input strings (the caller passes a list). They are
            tokenized together with padding, truncated to 512 tokens.

    Returns:
        None. Every triple found is appended to the module-level list
        `triples` as a ``(head, type, tail)`` tuple.
    """
    encoded = tokenizer(texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
    device = model.device
    output_ids = model.generate(
        encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
        **gen_kwargs
    )
    # Special tokens are kept: extract_triplets() parses the <triplet>/<subj>/<obj> markers.
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=False)
    for sentence in decoded:
        triples.extend(
            (t['head'], t['type'], t['tail']) for t in extract_triplets(sentence)
        )

# Feed the chunks to REBEL two at a time. Slicing naturally yields a single
# item for an odd final chunk, so no exception handling is needed to detect
# the end of the list (a bare `except:` here would also hide real errors and
# swallow KeyboardInterrupt).
BATCH_SIZE = 2
for i in tqdm(range(0, len(data), BATCH_SIZE)):
    texts = [item['text'] for item in data[i:i + BATCH_SIZE]]
    generate_triples(texts)

# De-duplicate; sorting gives a stable order across runs (set order is arbitrary).
distinct_triples = sorted(set(triples))

# save
triples_path = './kg/data/knowledge graphs/rebel_llamaindex/rebel_triples.json'
os.makedirs(os.path.dirname(triples_path), exist_ok=True)
with open(triples_path, 'w', encoding='utf-8') as file:
    json.dump(distinct_triples, file)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/19 [00:00<?, ?it/s]

In [5]:
# Preview the first five de-duplicated triples. `distinct_triples` is only
# defined once the REBEL extraction cell above has run to completion.
distinct_triples[:5]

NameError: name 'distinct_triples' is not defined

In [None]:
# LlamaIndex KnowledgeGraphIndex

In [4]:
from dotenv import load_dotenv
load_dotenv()
# Secrets such as OPENAI_API_KEY must come from the environment (presumably a
# local .env file picked up by load_dotenv() above) and never be written here.
# SECURITY: a literal API key used to be pasted on this line; treat it as
# leaked and revoke it.

import logging
import sys

# Send log records of level INFO and above to stdout.
logging.basicConfig(
    stream=sys.stdout, level=logging.INFO
)  

from llama_index import (
    KnowledgeGraphIndex,
    LLMPredictor,
    ServiceContext,
    SimpleDirectoryReader,
)
from llama_index.storage.storage_context import StorageContext
from llama_index.graph_stores import NebulaGraphStore
#from llama_index.llms import OpenAI
# pip install llama-cpp-python
from llama_index.llms import LlamaCPP

from IPython.display import Markdown, display

# Llama-2 13B chat weights (Q4_0 quantization) in GGUF format, fetched from Hugging Face.
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

llm = LlamaCPP(
    # URL of the GGUF model file; it is downloaded automatically
    model_url=model_url,
    # optionally, set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 1},
    verbose=True,
)

# `chunk_size` replaces the deprecated `chunk_size_limit` keyword.
service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512)

# To set up NebulaGraph locally, begin by establishing a connection using its default credentials
# Install go --> sudo snap install go --classic
# Install nebula-console from https://github.com/vesoft-inc/nebula-console#from-source-code
# ./nebula-console -addr 127.0.0.1 -port 9669 -u root -p nebula
# CREATE SPACE llamaindex(vid_type=FIXED_STRING(256), partition_num=1, replica_factor=1);
# ADD HOSTS 127.0.0.1:9779;
# :sleep 10;
# USE llamaindex;
# CREATE TAG entity(name string);
# CREATE EDGE relationship(relationship string);
# CREATE TAG INDEX entity_index ON entity(name(256));

# Fall back to the local default connection settings only when the variables
# are not already set (e.g. by load_dotenv() or the shell). Plain assignment
# would silently overwrite real credentials with these defaults.
os.environ.setdefault("NEBULA_USER", "root")
os.environ.setdefault("NEBULA_PASSWORD", "nebula")
os.environ.setdefault("NEBULA_ADDRESS", "127.0.0.1:9669")

# Names match the space / tag / edge created by the console commands above.
space_name = "llamaindex"
edge_types, rel_prop_names = ["relationship"], ["relationship"]
tags = ["entity"]

graph_store = NebulaGraphStore(
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
)
storage_context = StorageContext.from_defaults(graph_store=graph_store)

## Read the documents from the notebook's data directory with LlamaIndex's
## SimpleDirectoryReader, then build the Knowledge Graph index (kg_index) from them.
print(os.getcwd())
reader = SimpleDirectoryReader(input_dir=os.getcwd() + "/kg/data/knowledge graphs/rebel_llamaindex/")
documents = reader.load_data()
print(type(documents))  # load_data() returns a list

# NebulaGraph schema names, forwarded unchanged to the index constructor.
nebula_schema = dict(
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
)

kg_index = KnowledgeGraphIndex.from_documents(
    documents,
    storage_context=storage_context,
    service_context=service_context,
    max_triplets_per_chunk=5,
    include_embeddings=True,
    **nebula_schema,
)

from llama_index.retrievers import KnowledgeGraphRAGRetriever
from llama_index.query_engine import RetrieverQueryEngine

# Retriever over the graph store held by `storage_context`, driven by the local LLM.
graph_rag_retriever = KnowledgeGraphRAGRetriever(
    llm=llm,
    service_context=service_context,
    storage_context=storage_context,
    verbose=True,
)

# Query engine built on top of that retriever.
query_engine = RetrieverQueryEngine.from_args(
    graph_rag_retriever,
    service_context=service_context,
)

Downloading url https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf to path /tmp/llama_index/models/llama-2-13b-chat.Q4_0.gguf
total size (MB): 7365.83


7025it [06:10, 18.96it/s]                                                       
llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /tmp/llama_index/models/llama-2-13b-chat.Q4_0.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_0     [  5120,  5120,     1,     1 ]
lla



llama_new_context_with_model: compute buffer total size = 348.18 MB
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 


/home/jin/grap_rag
<class 'list'>
(Neal Dunn, is member of, political party)
(Otaru Music Box Museum, is located in the administrative territorial entity of, Otaru)
(Northwood Mall, is located in the administrative territorial entity of, Tallahassee)
(Theobald of Marly, holds position of, abbot)
(LeRoy Collins, holds position of, Governor of Florida)
(Kita Kita, is genre of, romantic comedy)
(census of 2010, is point in time of, 2010)
(Florida A&M University, is parent organization of, State University System of Florida)
(Fred Hawson, is employer of, ABS-CBN News)
(Tallahassee Police Department, was incepted in, 1826)
(Tallahassee, is located in the administrative territorial entity of, Leon County)
(Kita Kita, has cast member, Alessandra de Rossi)
(Kita Kita, was distributed by, Viva Films)
(Legacy (Gerald Wilson album), was published on, 2011



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =   169.99 ms /   256 runs   (    0.66 ms per token,  1505.94 tokens per second)
llama_print_timings: prompt eval time = 65719.22 ms /   702 tokens (   93.62 ms per token,    10.68 tokens per second)
llama_print_timings:        eval time = 38661.27 ms /   255 runs   (  151.61 ms per token,     6.60 tokens per second)
llama_print_timings:       total time = 105072.47 ms
Llama.generate: prefix-match hit


(State University System of Florida, is, subsidiary)
(Economic Offences Wing, is, part of)
(Renhuai City, contains, administrative territorial entity)
(Mike Gordon, date of death, June 25, 2005)
(Tonyo, place of death, Sapporo)
(Kita Kita, director, Joyce Bernal)



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    60.94 ms /    91 runs   (    0.67 ms per token,  1493.22 tokens per second)
llama_print_timings: prompt eval time = 47118.08 ms /   503 tokens (   93.67 ms per token,    10.68 tokens per second)
llama_print_timings:        eval time = 13438.06 ms /    90 runs   (  149.31 ms per token,     6.70 tokens per second)
llama_print_timings:       total time = 60794.24 ms
Llama.generate: prefix-match hit


(Kita Kita, is, romantic comedy film)
(Lea, is, visually impaired)
(Tonyo, is, charmer)
(Nobu, is, fianc\u00e9 of Lea)



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    38.80 ms /    59 runs   (    0.66 ms per token,  1520.81 tokens per second)
llama_print_timings: prompt eval time = 49237.73 ms /   525 tokens (   93.79 ms per token,    10.66 tokens per second)
llama_print_timings:        eval time =  8611.73 ms /    58 runs   (  148.48 ms per token,     6.73 tokens per second)
llama_print_timings:       total time = 57999.91 ms
Llama.generate: prefix-match hit


(Lea, suffers from, temporary blindness)
(Tonyo, is, Lea's neighbor)
(Tonyo, introduces himself to, Lea)
(Tonyo, makes an effort to, cook for, and cheer up, Lea)
(Lea, eventually falls in love with, Tonyo)}



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    51.30 ms /    76 runs   (    0.68 ms per token,  1481.48 tokens per second)
llama_print_timings: prompt eval time = 47678.47 ms /   507 tokens (   94.04 ms per token,    10.63 tokens per second)
llama_print_timings:        eval time = 12431.28 ms /    75 runs   (  165.75 ms per token,     6.03 tokens per second)
llama_print_timings:       total time = 60307.48 ms
Llama.generate: prefix-match hit


(Alessandra de Rossi, is, Filipino tour guide)
(Empoy Marquez, is, Tonyo)
(Tonyo, has heart problem)}



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    25.60 ms /    39 runs   (    0.66 ms per token,  1523.32 tokens per second)
llama_print_timings: prompt eval time = 32202.21 ms /   337 tokens (   95.56 ms per token,    10.47 tokens per second)
llama_print_timings:        eval time =  5644.84 ms /    38 runs   (  148.55 ms per token,     6.73 tokens per second)
llama_print_timings:       total time = 37945.56 ms
Llama.generate: prefix-match hit


(Bernardo, said that she wanted Kita Kita to focus on the concept of, falling in love even if you don't see the person)
(Kita Kita, has a story that focuses on the life of Overseas Filipino Workers)
(Spring Films, made Kita Kita with a budget of \u20b110 million)}



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    54.97 ms /    83 runs   (    0.66 ms per token,  1509.94 tokens per second)
llama_print_timings: prompt eval time = 45084.59 ms /   481 tokens (   93.73 ms per token,    10.67 tokens per second)
llama_print_timings:        eval time = 12065.52 ms /    82 runs   (  147.14 ms per token,     6.80 tokens per second)
llama_print_timings:       total time = 57363.85 ms
Llama.generate: prefix-match hit


(Pascual, did voice role as, narrator)
(Filming, was done in, Hokkaido)
(Boy Y\u00f1iguez, was the cinematographer of, Kita Kita)}



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    35.99 ms /    54 runs   (    0.67 ms per token,  1500.50 tokens per second)
llama_print_timings: prompt eval time = 43581.70 ms /   468 tokens (   93.12 ms per token,    10.74 tokens per second)
llama_print_timings:        eval time =  7860.30 ms /    53 runs   (  148.31 ms per token,     6.74 tokens per second)
llama_print_timings:       total time = 51578.82 ms
Llama.generate: prefix-match hit


(Kita Kita, premiered at, Osaka Asian Film Festival)
(Kita Kita, vied for, Grand Prix and Most Promising Talent awards)
(Osaka Asian Film Festival, ran from, March 3\u201312, 2017)"}

Here are the five knowledge triplets extracted from the text:

1. (Kita Kita, premiered at, Osaka Asian Film Festival)
2. (Kita Kita, vied for, Grand Prix and Most Promising Talent awards)
3. (Osaka Asian Film Festival, ran from, March 3\u201312, 2017)
4. (Empoy Marquez, and Alessandra de Rossi, had a romantic conversation while eating ramen)
5. (Philz, is, coffee shop)



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =   130.78 ms /   197 runs   (    0.66 ms per token,  1506.37 tokens per second)
llama_print_timings: prompt eval time = 47419.21 ms /   506 tokens (   93.71 ms per token,    10.67 tokens per second)
llama_print_timings:        eval time = 29250.30 ms /   196 runs   (  149.24 ms per token,     6.70 tokens per second)
llama_print_timings:       total time = 77187.86 ms
Llama.generate: prefix-match hit


(Kita Kita, received positive reviews from, critics)
(Kita Kita, was given an A grade by, Cinema Evaluation Board)
(Kita Kita, pursued a novelty of creating captivating characters out of unexpected leads)
(Kita Kita, is buoyant without being too eager)}



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    48.72 ms /    73 runs   (    0.67 ms per token,  1498.23 tokens per second)
llama_print_timings: prompt eval time = 50520.15 ms /   535 tokens (   94.43 ms per token,    10.59 tokens per second)
llama_print_timings:        eval time = 10804.79 ms /    72 runs   (  150.07 ms per token,     6.66 tokens per second)
llama_print_timings:       total time = 61513.55 ms
Llama.generate: prefix-match hit


(Fred Hawson, scored, Kita Kita)
(Esquire Philippines, opined, stalker's fantasy)
(Empoy Marquez, character, sinister)
(De Rossi's character, blindness, throughout)
---------------------
Please provide the knowledge triplets for the given text.



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    48.89 ms /    73 runs   (    0.67 ms per token,  1493.21 tokens per second)
llama_print_timings: prompt eval time = 41413.75 ms /   442 tokens (   93.70 ms per token,    10.67 tokens per second)
llama_print_timings:        eval time = 10785.11 ms /    72 runs   (  149.79 ms per token,     6.68 tokens per second)
llama_print_timings:       total time = 52386.61 ms
Llama.generate: prefix-match hit


(Leon County, is located in, Florida)
(Leon County, was named after, Juan Ponce de Leon)
(Tallahassee, is the county seat of, Leon County)
(Tallahassee, is home to, Florida State University and Florida A&M University)
(Seminole Wars, were attempts to remove, Seminole and Creek peoples)}

Please extract up to 5 knowledge triplets from the given text.



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    67.72 ms /   102 runs   (    0.66 ms per token,  1506.18 tokens per second)
llama_print_timings: prompt eval time = 50544.99 ms /   537 tokens (   94.12 ms per token,    10.62 tokens per second)
llama_print_timings:        eval time = 14958.62 ms /   101 runs   (  148.11 ms per token,     6.75 tokens per second)
llama_print_timings:       total time = 65769.16 ms
Llama.generate: prefix-match hit


(Leon County, has a total area of, 1,289 square miles)
(Leon County, is located in, Florida)
(Leon County, has rolling hills, part of north Florida's Red Hills Region)
(Leon County, encompasses basement rock composed of basalts of the Triassic and Jurassic)
(Leon County, has carbonate rock created from dying foraminifera, bryozoa, mollusks, and corals)}



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    76.36 ms /   113 runs   (    0.68 ms per token,  1479.77 tokens per second)
llama_print_timings: prompt eval time = 44928.31 ms /   479 tokens (   93.80 ms per token,    10.66 tokens per second)
llama_print_timings:        eval time = 16707.55 ms /   112 runs   (  149.17 ms per token,     6.70 tokens per second)
llama_print_timings:       total time = 61928.45 ms
Llama.generate: prefix-match hit


(Terraces, are in, Leon County)
(Shorelines, are in, Leon County)
(Pleistocene, created the topography of, Leon County)
(Leon County, has paleontological sites)}

Please extract up to 5 knowledge triplets from the given text.



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    46.17 ms /    70 runs   (    0.66 ms per token,  1516.07 tokens per second)
llama_print_timings: prompt eval time = 35563.01 ms /   379 tokens (   93.83 ms per token,    10.66 tokens per second)
llama_print_timings:        eval time = 10032.21 ms /    69 runs   (  145.39 ms per token,     6.88 tokens per second)
llama_print_timings:       total time = 45774.49 ms
Llama.generate: prefix-match hit


(Demographics, is, race)
(Demographics, is, age)
(Demographics, is, household)"}

Here are five knowledge triplets extracted from the text:

1. (Demographics, is, race)
2. (Demographics, is, age)
3. (Demographics, is, household)
4. (race, is, Black or African American)
5. (age, is, 24.2%)



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    76.17 ms /   114 runs   (    0.67 ms per token,  1496.73 tokens per second)
llama_print_timings: prompt eval time = 49376.12 ms /   526 tokens (   93.87 ms per token,    10.65 tokens per second)
llama_print_timings:        eval time = 16908.38 ms /   113 runs   (  149.63 ms per token,     6.68 tokens per second)
llama_print_timings:       total time = 66587.83 ms
Llama.generate: prefix-match hit


(Leon County, has highest education level, Florida)
(Leon County, has highest education level, Alachua County)
(Education, is highest in, Leon County)
(Income, for a household, is $37,517)
(Income, for a family, is $52,962)}

Here are five knowledge triplets extracted from the given text:

1. (Leon County, has highest education level, Florida)
2. (Leon County, has highest education level, Alachua County)
3. (Education, is highest in, Leon County)
4. (Income, for a household, is $37,517)
5. (Income, for a family, is $52,962)



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =   119.77 ms /   179 runs   (    0.67 ms per token,  1494.59 tokens per second)
llama_print_timings: prompt eval time = 52845.43 ms /   559 tokens (   94.54 ms per token,    10.58 tokens per second)
llama_print_timings:        eval time = 26927.68 ms /   178 runs   (  151.28 ms per token,     6.61 tokens per second)
llama_print_timings:       total time = 80244.59 ms
Llama.generate: prefix-match hit


(Leon County, has highest voter turnout in, Florida)
(Leon County, has, 85% voter turnout)
(Leon County, has, 112,572 Democrats)
(Leon County, has, 58,083 Republicans)
(Leon County, has, 44,007 with other affiliations)}

Here are five knowledge triplets extracted from the text:

1. (Leon County, has highest voter turnout in, Florida)
2. (Leon County, has, 85% voter turnout)
3. (Leon County, has, 112,572 Democrats)
4. (Leon County, has, 58,083 Republicans)
5. (Leon County, has, 44,007 with other affiliations)



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =   136.24 ms /   204 runs   (    0.67 ms per token,  1497.31 tokens per second)
llama_print_timings: prompt eval time = 48528.80 ms /   516 tokens (   94.05 ms per token,    10.63 tokens per second)
llama_print_timings:        eval time = 30473.69 ms /   203 runs   (  150.12 ms per token,     6.66 tokens per second)
llama_print_timings:       total time = 79551.34 ms
Llama.generate: prefix-match hit


(Leon County, has gone to the polls four times to vote on consolidation of Tallahassee and Leon County governments into one jurisdiction)
(Tallahassee Fire Department, is shared with Leon County)
(Florida State University, is an American public space-grant and sea-grant research university)



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    49.17 ms /    74 runs   (    0.66 ms per token,  1505.11 tokens per second)
llama_print_timings: prompt eval time = 47503.99 ms /   509 tokens (   93.33 ms per token,    10.71 tokens per second)
llama_print_timings:        eval time = 10943.43 ms /    73 runs   (  149.91 ms per token,     6.67 tokens per second)
llama_print_timings:       total time = 58636.22 ms
Llama.generate: prefix-match hit


(Florida State, is located on, Tallahassee)
(Florida State, has nearly, 42,000 students)
(Florida State, is classified as, Research University with Very High Research)
(Florida State, comprises, 16 separate colleges)
(Florida State, operates, The John & Mable Ringling Museum of Art)



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    61.52 ms /    92 runs   (    0.67 ms per token,  1495.57 tokens per second)
llama_print_timings: prompt eval time = 48004.55 ms /   503 tokens (   95.44 ms per token,    10.48 tokens per second)
llama_print_timings:        eval time = 13445.02 ms /    91 runs   (  147.75 ms per token,     6.77 tokens per second)
llama_print_timings:       total time = 61684.87 ms
Llama.generate: prefix-match hit


(Tallahassee, is, community college)
(Tallahassee, is, member of, Florida College System)"
---------------------
Please extract up to 5 knowledge triplets from the text provided.



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    30.53 ms /    46 runs   (    0.66 ms per token,  1506.57 tokens per second)
llama_print_timings: prompt eval time = 53287.05 ms /   558 tokens (   95.50 ms per token,    10.47 tokens per second)
llama_print_timings:        eval time =  6745.33 ms /    45 runs   (  149.90 ms per token,     6.67 tokens per second)
llama_print_timings:       total time = 60154.90 ms
Llama.generate: prefix-match hit


(Tallahassee Community College, is accredited by, Florida Department of Education)
(Tallahassee Community College, founded in, 1966)
(Tallahassee Community College, offers, Bachelor's of Science)
(Tallahassee Community College, offers, Associate of Arts)
(Tallahassee Community College, offers, Associate of Science)



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    58.55 ms /    88 runs   (    0.67 ms per token,  1502.94 tokens per second)
llama_print_timings: prompt eval time = 45246.54 ms /   482 tokens (   93.87 ms per token,    10.65 tokens per second)
llama_print_timings:        eval time = 12871.17 ms /    87 runs   (  147.94 ms per token,     6.76 tokens per second)
llama_print_timings:       total time = 58346.55 ms
Llama.generate: prefix-match hit


(Leon County Public Library, renamed in, 1993)
(LeRoy Collins, honored by, Leon County Public Library)
(Carnegie Library of Tallahassee, provided library services to, black community)
(Florida A&M University, library built on campus of)
(Black Archives Research Center and Museum, founding home of)



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    55.75 ms /    84 runs   (    0.66 ms per token,  1506.81 tokens per second)
llama_print_timings: prompt eval time = 44759.40 ms /   477 tokens (   93.84 ms per token,    10.66 tokens per second)
llama_print_timings:        eval time = 12240.83 ms /    83 runs   (  147.48 ms per token,     6.78 tokens per second)
llama_print_timings:       total time = 57215.12 ms
Llama.generate: prefix-match hit


(Leon County Public Library System, is, public library system)
(Jefferson County, is, county)
(Wakulla County, is, county)}

Please extract up to 5 knowledge triplets from the text.



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    34.67 ms /    52 runs   (    0.67 ms per token,  1499.77 tokens per second)
llama_print_timings: prompt eval time = 49108.87 ms /   522 tokens (   94.08 ms per token,    10.63 tokens per second)
llama_print_timings:        eval time =  7633.51 ms /    51 runs   (  149.68 ms per token,     6.68 tokens per second)
llama_print_timings:       total time = 56877.06 ms
Llama.generate: prefix-match hit


(Transportation, is, Airports)
(Airports, contains, Tallahassee Commercial Airport)
(Airports, contains, Tallahassee International Airport)
(Transportation, is, Major highways)
(Major highways, contains, Interstate 10)
(Major highways, contains, U.S. Highway 27)



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    58.28 ms /    88 runs   (    0.66 ms per token,  1509.95 tokens per second)
llama_print_timings: prompt eval time = 53572.86 ms /   569 tokens (   94.15 ms per token,    10.62 tokens per second)
llama_print_timings:        eval time = 12862.27 ms /    87 runs   (  147.84 ms per token,     6.76 tokens per second)
llama_print_timings:       total time = 66663.16 ms
Llama.generate: prefix-match hit


(Highway 27, is, U.S. Highway 90)
(U.S. Highway 90, is, U.S. Highway 319)
(State Road 20, is, State Road 61)
(State Road 61, is, State Road 155)
(State Road 155, is, State Road 263)



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    62.06 ms /    93 runs   (    0.67 ms per token,  1498.57 tokens per second)
llama_print_timings: prompt eval time = 48734.93 ms /   518 tokens (   94.08 ms per token,    10.63 tokens per second)
llama_print_timings:        eval time = 13815.91 ms /    92 runs   (  150.17 ms per token,     6.66 tokens per second)
llama_print_timings:       total time = 62792.43 ms
Llama.generate: prefix-match hit


(Cricket Club of India, is located on, Dinsha Wacha Road)
(Cricket Club of India, has a history dating back to, 1933)
(Cricket Club of India, is considered one of the most prestigious clubs in, India)}



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    42.24 ms /    63 runs   (    0.67 ms per token,  1491.34 tokens per second)
llama_print_timings: prompt eval time = 50374.41 ms /   535 tokens (   94.16 ms per token,    10.62 tokens per second)
llama_print_timings:        eval time =  9178.60 ms /    62 runs   (  148.04 ms per token,     6.75 tokens per second)
llama_print_timings:       total time = 59717.26 ms
Llama.generate: prefix-match hit


(Cricket Club of India, is, affiliated member of BCCI)
(Cricket Club of India, does not conduct cricket in, Maharashtra)
(Brabourne Stadium, hosted, ICC Women's World Cup)"}



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    39.03 ms /    58 runs   (    0.67 ms per token,  1485.96 tokens per second)
llama_print_timings: prompt eval time = 53182.25 ms /   563 tokens (   94.46 ms per token,    10.59 tokens per second)
llama_print_timings:        eval time =  8499.25 ms /    57 runs   (  149.11 ms per token,     6.71 tokens per second)
llama_print_timings:       total time = 61834.52 ms
Llama.generate: prefix-match hit


(Gerald Wilson Orchestra, recorded in, 2011)
(Legacy, is an album by, Gerald Wilson Orchestra)
(Legacy, was released on, Mack Avenue label)
(Legacy, received 4\u00bd stars from, AllMusic)
(Legacy, was reviewed by, Ken Dryden)



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    52.54 ms /    79 runs   (    0.67 ms per token,  1503.59 tokens per second)
llama_print_timings: prompt eval time = 52708.40 ms /   558 tokens (   94.46 ms per token,    10.59 tokens per second)
llama_print_timings:        eval time = 11656.90 ms /    78 runs   (  149.45 ms per token,     6.69 tokens per second)
llama_print_timings:       total time = 64570.80 ms
Llama.generate: prefix-match hit


(Gerald Wilson, composed by, Variation on a Theme by Igor Stravinsky)
(Anthony Wilson, composed by, Virgo)
(Gerald Wilson, composed by, Variations on Clair de Lune)
(Gerald Wilson, composed by, Variation on a Theme by Giacomo Puccini)
(Eric Otis, composed by, September Sky)



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    61.47 ms /    91 runs   (    0.68 ms per token,  1480.42 tokens per second)
llama_print_timings: prompt eval time = 23423.69 ms /   253 tokens (   92.58 ms per token,    10.80 tokens per second)
llama_print_timings:        eval time = 13158.33 ms /    90 runs   (  146.20 ms per token,     6.84 tokens per second)
llama_print_timings:       total time = 36820.53 ms
Llama.generate: prefix-match hit


(Darkroom manipulation, is, traditional method)
(Darkroom manipulation, includes, dodging, burning, and masking)
(Jerry Uelsmann, is, master of darkroom manipulation)}



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    33.19 ms /    50 runs   (    0.66 ms per token,  1506.30 tokens per second)
llama_print_timings: prompt eval time = 50600.23 ms /   536 tokens (   94.40 ms per token,    10.59 tokens per second)
llama_print_timings:        eval time =  7302.56 ms /    49 runs   (  149.03 ms per token,     6.71 tokens per second)
llama_print_timings:       total time = 58032.24 ms
Llama.generate: prefix-match hit


(Jerry Uelsmann, uses, digital tools)
(Jerry Uelsmann, is known for, using multiple negatives)
(Jerry Uelsmann, started with, photography in high school)
(Jerry Uelsmann, uses, enlargers)
(Jerry Uelsmann, creates images, with realism and motion)}



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    54.99 ms /    83 runs   (    0.66 ms per token,  1509.37 tokens per second)
llama_print_timings: prompt eval time = 45099.98 ms /   480 tokens (   93.96 ms per token,    10.64 tokens per second)
llama_print_timings:        eval time = 12186.38 ms /    82 runs   (  148.61 ms per token,     6.73 tokens per second)
llama_print_timings:       total time = 57499.13 ms
Llama.generate: prefix-match hit


(Wanker, is a pejorative term)
(Wanker, may refer to)
(Wanker Records, an independent record label)}

Please extract up to 5 knowledge triplets from the text provided.



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    33.76 ms /    51 runs   (    0.66 ms per token,  1510.57 tokens per second)
llama_print_timings: prompt eval time = 43970.65 ms /   466 tokens (   94.36 ms per token,    10.60 tokens per second)
llama_print_timings:        eval time =  7723.92 ms /    50 runs   (  154.48 ms per token,     6.47 tokens per second)
llama_print_timings:       total time = 51825.98 ms
Llama.generate: prefix-match hit


(California State Legislature, 2005-2006 session, Major legislation)
(Don Perata, is President Pro Tem of, California State Senate)
(Gloria Romero, is Majority Leader of, California State Senate)
(Dick Ackerman, is Minority Leader of, California State Senate)
(Tom Harman, won the special election for the 35th Senate District seat)
---------------------
Please provide at least 5 knowledge triplets from the given text.



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    77.56 ms /   117 runs   (    0.66 ms per token,  1508.61 tokens per second)
llama_print_timings: prompt eval time = 37507.91 ms /   404 tokens (   92.84 ms per token,    10.77 tokens per second)
llama_print_timings:        eval time = 16984.03 ms /   116 runs   (  146.41 ms per token,     6.83 tokens per second)
llama_print_timings:       total time = 54795.42 ms
Llama.generate: prefix-match hit


(Sanhe, is a town under the administration of, Renhuai)
(Sanhe, is located in, northern Guizhou)
(Sanhe, is located east of the border with, Sichuan)}
---------------------
Please provide at least 5 knowledge triplets for the given text.



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    43.61 ms /    66 runs   (    0.66 ms per token,  1513.28 tokens per second)
llama_print_timings: prompt eval time = 51535.18 ms /   546 tokens (   94.39 ms per token,    10.59 tokens per second)
llama_print_timings:        eval time =  9680.00 ms /    65 runs   (  148.92 ms per token,     6.71 tokens per second)
llama_print_timings:       total time = 61385.52 ms
Llama.generate: prefix-match hit


(RAF Towyn, is, former Royal Air Force airfield)
(RAF Towyn, located, west of Machynlleth)
(RAF Towyn, closed, 1945)
(RAF Towyn, history, Anti-Aircraft Co-operation unit)
(RAF Towyn, units, No. 631 Squadron)



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    55.13 ms /    82 runs   (    0.67 ms per token,  1487.45 tokens per second)
llama_print_timings: prompt eval time = 39077.15 ms /   417 tokens (   93.71 ms per token,    10.67 tokens per second)
llama_print_timings:        eval time = 11839.63 ms /    81 runs   (  146.17 ms per token,     6.84 tokens per second)
llama_print_timings:       total time = 51132.30 ms
Llama.generate: prefix-match hit


(Theobald, is, saint)
(Theobald, was born at, Marly)
(Theobald, served as a knight at the court of, Philip Augustus)
(Theobald, entered the Cistercian monastery of, Vaux-de-Cernay)
(Theobald, was elected prior in, 1230)
(Theobald, was held in high esteem by, Saint Louis)



llama_print_timings:        load time = 47631.79 ms
llama_print_timings:      sample time =    66.73 ms /   101 runs   (    0.66 ms per token,  1513.52 tokens per second)
llama_print_timings: prompt eval time = 34064.94 ms /   364 tokens (   93.58 ms per token,    10.69 tokens per second)
llama_print_timings:        eval time = 14590.89 ms /   100 runs   (  145.91 ms per token,     6.85 tokens per second)
llama_print_timings:       total time = 48913.75 ms


In [29]:
# Ask the query engine (created in an earlier cell, not shown here) a natural-language question
# and keep the returned response object.
# NOTE(review): in the recorded run below, the graph store query returned no rows for
# 'Northwood Mall' ({'p.entity.name': []}), so the final answer was not backed by graph data —
# verify it before relying on it.
response = query_engine.query("where Northwood Mall located?")

[1;3;33mGraph Store Query:
```
MATCH (p:`entity`)-[:relationship]->(m:`entity`) WHERE m.`entity`.`name` == 'Northwood Mall'
RETURN p.`entity`.`name`;
```
[0m[1;3;33mGraph Store Response:
{'p.entity.name': []}
[0m[1;3;32mFinal Response: 

The Northwood Mall is located in the city of Omaha, Nebraska, in the United States.
[0m

In [None]:
# Step 4. REBEL + LlamaIndex KnowledgeGraphIndex

In [None]:
from transformers import pipeline

# NebulaGraph settings shared by the graph store and the index build below.
space_name = "rebel_llamaindex"
edge_types = ["relationship"]
rel_prop_names = ["relationship"]
tags = ["entity"]

# Graph store bound to the space above, wrapped in a storage context for the index.
graph_store = NebulaGraphStore(
    space_name=space_name, edge_types=edge_types, rel_prop_names=rel_prop_names, tags=tags
)
storage_context = StorageContext.from_defaults(graph_store=graph_store)

# REBEL text2text pipeline.
# NOTE(review): presumably used by `extract_triplets` (defined in an earlier cell) — confirm.
triplet_extractor = pipeline(
    "text2text-generation",
    model="Babelscape/rebel-large",
    tokenizer="Babelscape/rebel-large",
)

# Build the knowledge-graph index from `documents`, using the custom triplet extraction
# function instead of the default extraction path.
rebel_kg_index = KnowledgeGraphIndex.from_documents(
    documents,
    kg_triplet_extract_fn=extract_triplets,
    storage_context=storage_context,
    max_triplets_per_chunk=5,
    service_context=service_context,
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
    include_embeddings=True,
)

In [None]:
## Visualize the knowledge graph

In [18]:
from pyvis.network import Network

In [20]:
# Render the knowledge graph as an interactive pyvis network saved to "example.html".
# `kg_index` is not defined in this cell; it is expected to exist from an earlier cell.
# NOTE(review): the preceding section builds `rebel_kg_index` — confirm `kg_index` is the
# index intended here.
# NOTE(review): the recorded output below logs a NebulaGraph connection error
# ("TSocket read 0 bytes") for this cell — the graph store session may need to be recreated.
g = kg_index.get_networkx_graph()
# directed=True requests a directed graph; notebook=True and cdn_resources="in_line" are
# presumably the pyvis settings for inline notebook rendering — confirm against pyvis docs.
net = Network(notebook=True, cdn_resources="in_line", directed=True)
# Load the NetworkX graph into the pyvis network.
net.from_nx(g)
net.show("example.html")

ERROR:nebula3.logger:Execute failed: TSocket read 0 bytes
ERROR:llama_index.graph_stores.nebulagraph:Connection issue, try to recreate session pool. Query: WITH map{`true`: '-[', `false`: '<-['} AS arrow_l,     map{`true`: ']->', `false`: ']-'} AS arrow_r,     map{`relationship`: "relationship"} AS edge_type_map MATCH p=(start)-[e:`relationship`*..1]-()   WHERE id(start) IN $subjs WITH start, id(start) AS vid, nodes(p) AS nodes, e AS rels,  length(p) AS rel_count, arrow_l, arrow_r, edge_type_map WITH   REDUCE(s = vid + '{', key IN [key_ in ['', 'name']     WHERE properties(start)[key_] IS NOT NULL]  | s + key + ': ' +       COALESCE(TOSTRING(properties(start)[key]), 'null') + ', ')      + '}'    AS subj,  [item in [i IN RANGE(0, rel_count - 1)|[nodes[i], nodes[i + 1],      rels[i], typeid(rels[i]) > 0, type(rels[i]) ]] | [    arrow_l[tostring(item[3])] +      item[4] + ':' +      REDUCE(s = '{', key IN SPLIT(edge_type_map[item[4]], ',') |         s + key + ': ' + COALESCE(TOSTRING(prop