In [None]:
# Verify weaviate-client is installed and the database is live and ready
import weaviate

# Address of the Weaviate instance this notebook talks to.
WEAVIATE_URL = "http://localhost:8080"

client = weaviate.Client(WEAVIATE_URL)
# Fail fast with a readable message instead of a bare AssertionError.
assert client.is_live(), f"Weaviate at {WEAVIATE_URL} is not live"
assert client.is_ready(), f"Weaviate at {WEAVIATE_URL} is not ready"
# Last expression of the cell: the returned metadata is shown as the cell output.
client.get_meta()

In [None]:
# !!!! DESTRUCTIVE: deletes the "Wiki_Node" class, i.e. the data in Weaviate.
# NOTE(review): presumably this is the class created for class_prefix "Wiki"
# in the indexing cell further down — confirm before running.
wiki_node_class = "Wiki_Node"
client.schema.delete_class(wiki_node_class)

In [None]:
import html2text
# Shared HTML-to-text converter used by wiki_to_text(); configured to ignore
# both images and links.
text_maker = html2text.HTML2Text()
text_maker.ignore_images = True
text_maker.ignore_links = True

def wiki_to_text(ancestors, title, labels, body):
    """Render one cached wiki page as a single plain-text string.

    Args:
        ancestors: Ancestors field of the page; the "Ancestors:" line is
            omitted when this is empty/falsy.
        title: Page title, always emitted as the first line.
        labels: Labels field of the page; the "Labels:" line is omitted when
            this is empty/falsy.
        body: Page body, passed through the module-level ``text_maker``
            (``text_maker.handle``) before being appended.

    Returns:
        str: "Title: ..." line, optional "Ancestors: ..." and "Labels: ..."
        lines, followed by the converted body.
    """
    body_text = text_maker.handle(body)
    text = f"Title: {title}\n"
    if ancestors:
        text += f"Ancestors: {ancestors}\n"
    if labels:
        # Emit the labels themselves; this line previously repeated the
        # ancestors value by mistake.
        text += f"Labels: {labels}\n"
    text += f"{body_text}"
    return text


In [None]:
# Walk cached Wiki pages
import os
import re

n = 0  # number of cached files visited so far
for root, dirs, files in os.walk("../data/wiki"):
    for name in files:
        filepath = os.path.join(root, name)
        with open(filepath) as f:
            # The first four lines are header fields, read in this order.
            link, ancestors, title, labels = (f.readline().rstrip() for _ in range(4))
            # Everything after the header is the body; collapse runs of newlines.
            body = re.sub('[\n]+', '\n', f.read())
        text = wiki_to_text(ancestors, title, labels, body)
        if n == 0:
            # Only the very first file is printed as a preview.
            print(name, link, text, sep="\n")
        n += 1


In [None]:
# Create Documents from cached wiki files
from llama_index.legacy import Document

documents = []
for root, dirs, files in os.walk("../data/wiki"):
    for name in files:
        filepath = os.path.join(root, name)
        with open(filepath) as f:
            # Header fields, one per line, in this order.
            link = f.readline().rstrip()
            ancestors = f.readline().rstrip()
            title = f.readline().rstrip()
            labels = f.readline().rstrip()
            # The rest of the file is the body; collapse runs of newlines.
            body = re.sub('[\n]+', '\n', "".join(f.readlines()))
        text = wiki_to_text(ancestors, title, labels, body)
        # Pass the text by keyword: pydantic-based Document classes do not
        # accept positional arguments, while keyword construction works
        # either way.
        # NOTE(review): doc_id is the bare file name, so two files with the
        # same name in different subdirectories would share an id — confirm
        # file names are unique across ../data/wiki.
        doc = Document(text=text, doc_id=name, extra_info={"title": title, "link": link})
        documents.append(doc)

print(f"Loaded {len(documents)} documents")

In [None]:
# Spot-check a single loaded document; raises IndexError if fewer than 41
# documents were loaded.
documents[40]

In [None]:
from llama_index.legacy import LLMPredictor, PromptHelper, ServiceContext
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from llama_index.legacy import LangchainEmbedding

# Prompt sizing values, handed positionally to PromptHelper below.
max_input_size = 4096
num_output = 256
max_chunk_overlap = 20
# NOTE(review): the meaning of the third positional PromptHelper argument
# differs between llama_index releases — confirm that 20 is valid for the
# installed version.
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)

# Chat model wrapped in an LLMPredictor, and OpenAI embeddings wrapped in the
# LangChain embedding adapter.
llm = ChatOpenAI(temperature=0.2, model_name="gpt-3.5-turbo-0301")
llm_predictor = LLMPredictor(llm=llm)
embed_model = LangchainEmbedding(OpenAIEmbeddings())

# Bundle the pieces above; later cells pass this when building the index.
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    embed_model=embed_model,
    prompt_helper=prompt_helper,
)

In [None]:
# Calculate embedding for all of the documents and save them into Weaviate
from llama_index.legacy import GPTVectorStoreIndex
from llama_index.vector_stores import WeaviateVectorStore
from llama_index.storage.storage_context import StorageContext

# Vector store backed by the Weaviate client created at the top of the notebook.
class_prefix = "Wiki"
vector_store = WeaviateVectorStore(
    weaviate_client=client,
    class_prefix=class_prefix,
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Building the index persists the vector_store into Weaviate.
index = GPTVectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    service_context=service_context,
)

# The docstore and index_store are persisted separately, to a local directory;
# this is currently required although in theory Weaviate should be able to
# handle these as well.
storage_context.persist(persist_dir='../storage/index')

In [None]:
def get_unique_nodes(nodes):
    """Keep only the first node seen for each source document.

    Args:
        nodes: Iterable of objects exposing ``.node.ref_doc_id``.

    Returns:
        list: The input nodes in their original order, with every node whose
        ``ref_doc_id`` was already seen dropped.
    """
    # dicts keep insertion order, and setdefault never replaces an existing
    # entry, so the first node per ref_doc_id wins.
    first_per_doc = {}
    for scored in nodes:
        first_per_doc.setdefault(scored.node.ref_doc_id, scored)
    return list(first_per_doc.values())
        
def print_response(response):
    """Print the answer text, then the title and link of each distinct source.

    Args:
        response: Object exposing ``.response`` (the answer) and
            ``.source_nodes``; each source node must expose
            ``.node.extra_info`` with ``'title'`` and ``'link'`` keys.
    """
    print(response.response)
    for source in get_unique_nodes(response.source_nodes):
        info = source.node.extra_info
        print(f"{info['title']}")
        # The link is printed on its own tab-indented line under the title.
        print(f"\t{info['link']}")
        
def query(question, n=5):
    """Answer ``question`` against the module-level ``index`` and print the result.

    Args:
        question: Question text handed to the query engine.
        n: Passed as ``similarity_top_k`` when the engine is created.
    """
    # A new engine is built from `index` on every call.
    print_response(index.as_query_engine(similarity_top_k=n).query(question))
        

In [None]:
from llama_index.retrievers import VectorIndexRetriever
from llama_index.vector_stores.types import VectorStoreQueryMode
from llama_index.legacy import ResponseSynthesizer
from llama_index.query_engine import RetrieverQueryEngine

# configure retriever: hybrid query mode over the index built earlier
# NOTE(review): alpha presumably balances the keyword and vector parts of the
# hybrid search — confirm against the vector store documentation.
retriever = VectorIndexRetriever(
    index,
    similarity_top_k=5,
    vector_store_query_mode=VectorStoreQueryMode.HYBRID,
    alpha=0.7,
)

# configure response synthesizer
synth = ResponseSynthesizer.from_args()

# construct query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=synth,
)

# Run the question through the hybrid engine configured in this cell. Calling
# query() here would build a separate default engine via
# index.as_query_engine() and leave this retriever unused.
print_response(query_engine.query("On what physical server is Nextflow Tower installed, and where will it be moving to?"))

In [None]:
# Ask via query(), which builds its own engine with
# index.as_query_engine(similarity_top_k=5) and prints the answer and sources.
query("What interest groups does Scientific Computing sponsor?")