In [None]:
!pip -q install langchain sentence_transformers InstructorEmbedding pandas faiss-gpu jq

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import JSONLoader

In [None]:
# InstructorEmbedding
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings
from pprint import pprint

In [None]:
# Load the source JSON: the jq expression picks the `context` value of every
# entry in the top-level `qas` array, one document per match.
loader = JSONLoader(
    file_path="./val_webmd_squad_v2_consec_parsed.json",
    jq_schema=".qas[].context",
    text_content=False,
)
documents = loader.load()

In [None]:
# pprint(documents)

In [None]:
# Break the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

In [None]:
# Spot-check one of the chunks produced by the splitter (index 12).
texts[12]

Document(page_content="including prescription and over-the-counter drugs, herbal remedies, supplements, and vitamins. Facts about your symptoms. When did they start? Did they come on slow or fast? Does it happen every time you want to have sex? Is it random? Is it only under certain circumstances? Key personal information. Are you going through a stressful time? Have there been any major changes at home or work? Are you drinking heavily, or using cocaine, cigarettes, or opioids? Think about asking your partner to come along. Your partner can fill in details that you may forget or may not have thought of. You'll want answers to these questions before you leave: What's causing my ED? Are my symptoms long-term or temporary? Can my ED be treated? What are the treatment options? What if they don't work? Will I need to see a specialist? What will my insurance cover? Can I make lifestyle changes that will help? Where I can get more information? Worried about discussing your problem? The best 

In [None]:
# Show the raw text of the first chunk.
texts[0].page_content

"If it's temporary and only happens occasionally, problems getting or keeping an erection aren't cause for concern. There could be any number of reasons. It could be fatigue, stress, drinking alcohol, or even side effects of a medicine you just started taking. But some men have a more frequent, longer lasting problem called erectile dysfunction ( ED). It's more common in older men, but aging isn't the cause. In nearly 75% of ED cases, there's a physical cause. That means it's time to see your doctor. There are three main reasons you shouldn't try to deal with erectile dysfunction on your own: It can be treated: Sometimes, it's as simple as taking a pill your doctor prescribes. There are drugs just for ED. Other options your doctor can help you explore include: Injections Suppositories Surgical penile implants Special devices, like vacuum pumps, which boost blood flow to the penis It can be linked to more serious health conditions like: High blood pressure Hardening of the arteries"

In [None]:
# get embeddings

import pickle
import faiss
from langchain.vectorstores import FAISS
import pandas as pd
from sentence_transformers import SentenceTransformer

In [None]:
# Sentence-embedding model shared by the indexing and query helpers below.
encoder = SentenceTransformer("paraphrase-mpnet-base-v2")

# Plain chunk strings, in the same order as `texts`; search results are
# mapped back to text through positions in this list.
text_corpus = [chunk.page_content for chunk in texts]

def store_embeddings(docs, embeddings, store_name, path):
    """Embed ``docs`` with the module-level ``encoder`` and pickle a FAISS index.

    The index is written to ``{path}/faiss_{store_name}.pkl``. Rows are added
    in the order of ``docs``, so row ``i`` of the index corresponds to
    ``docs[i]``.

    Args:
        docs: Items to index. Each item is either an object exposing a
            ``page_content`` attribute or a plain string. ``None`` falls back
            to the module-level ``text_corpus`` (the previous behaviour).
        embeddings: Unused. Kept so existing call sites keep working; vectors
            always come from ``encoder`` so they stay comparable with the
            query vectors produced in ``getContext``.
        store_name: Suffix used in the pickle file name.
        path: Directory that receives the pickle; created when missing.

    Raises:
        ValueError: If there is nothing to index.
    """
    import os  # local import keeps this cell runnable on its own

    # Index what the caller passed in rather than silently reading a global.
    source = text_corpus if docs is None else docs
    corpus = [getattr(doc, "page_content", doc) for doc in source]
    if not corpus:
        raise ValueError("store_embeddings() needs at least one document to index")

    # create vectors from text
    vectors = encoder.encode(corpus)

    # build faiss index from vectors; vectors are L2-normalised in place
    # before being added
    vector_dimension = len(vectors[0])
    index = faiss.IndexFlatL2(vector_dimension)
    faiss.normalize_L2(vectors)
    index.add(vectors)

    # The target directory may not exist yet on a fresh run.
    os.makedirs(path, exist_ok=True)
    with open(f"{path}/faiss_{store_name}.pkl", "wb") as f:
        pickle.dump(index, f)

def load_embeddings(store_name, path):
    """Unpickle and return the object stored at ``{path}/faiss_{store_name}.pkl``.

    Args:
        store_name: Suffix used in the pickle file name.
        path: Directory containing the pickle file.

    Returns:
        Whatever object was pickled into that file.
    """
    pickle_file = f"{path}/faiss_{store_name}.pkl"
    # NOTE: pickle.load can execute arbitrary code from the file; only point
    # this at files this notebook wrote itself.
    with open(pickle_file, "rb") as handle:
        return pickle.load(handle)

def getContext(query):
    """Return the stored text chunk that is the top search hit for ``query``.

    The query is embedded with the module-level ``encoder``, L2-normalised,
    and searched against the index loaded from
    ``./embedStore/faiss_instructEmbeddings.pkl``. The winning row number is
    used as a position in the module-level ``text_corpus``.

    Args:
        query: Question or search string.

    Returns:
        The entry of ``text_corpus`` at the position of the best hit.

    Raises:
        ValueError: If the stored index contains no vectors.
    """
    # Encode a one-element batch: a bare string would come back as a 1-D
    # vector, while normalize_L2 and index.search need a 2-D matrix.
    search_vector = encoder.encode([query])
    faiss.normalize_L2(search_vector)

    # retrieve stored vectors (re-read from disk on every call)
    index = load_embeddings(store_name='instructEmbeddings', path="./embedStore")

    # An empty index yields label -1, which would silently pick the last
    # corpus entry; fail loudly instead.
    if index.ntotal == 0:
        raise ValueError("the stored FAISS index is empty")

    # Only the single best hit is returned, so ask for exactly one neighbour.
    _, ann = index.search(search_vector, k=1)
    # NOTE(review): assumes the index was built from the current text_corpus
    # in the same order — confirm the pickle on disk is not stale.
    return text_corpus[ann[0][0]]

In [None]:
# hugging face instructor encodings

# from langchain.embeddings import HuggingFaceInstructEmbeddings

# instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl", model_kwargs={"device": "cuda"})

In [None]:
# store_embeddings(texts, instructor_embeddings, store_name='instructEmbeddings', path="./embedStore")
# Build and persist the index from the split chunks; no embeddings object is
# supplied (None).
store_embeddings(
    docs=texts,
    embeddings=None,
    store_name='instructEmbeddings',
    path="./embedStore",
)

KeyboardInterrupt: ignored

In [None]:
# db_instructEmbedd = load_embeddings(store_name='instructEmbeddings', path="./embedStore")
# Retrieve the top search hit for a sample query.
getContext(query="cancer pneumonia")

In [None]:
# db_instructEmbedd = FAISS.from_documents(texts, instructor_embeddings)
# retriever = db_instructEmbedd.as_retriever(search_kwargs={"k": 1})

# to get context, call retriever.get_relevant_documents(query)
# where query is the question string