In [3]:
import pandas as pd

# Read the tab-separated label file into a DataFrame.
df = pd.read_csv('./biolincc_deduplabelsclean.tsv', sep='\t')

In [4]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,label_clean,UID
0,!ADVERSE EVENT DRUG INTERVENTION OPTION!,1
1,!ADVERSE EVENT DRUG THERAPY INTERVENTION!,2
2,!ADVERSE EVENT NEURO CVA!,3
3,!ADVERSE EVENT NEURO ENCEPHALOPATHY!,4
4,!ADVERSE EVENT NEURO SEIZURE!,5


In [5]:
# Summary statistics for the numeric column(s) of the frame.
df.describe()

Unnamed: 0,UID
count,259880.0
mean,129940.5
std,75021.038316
min,1.0
25%,64970.75
50%,129940.5
75%,194910.25
max,259880.0


In [6]:
# Show a single row tuple so the field names produced by itertuples() are
# visible; restricting to head(1) means nothing is printed for an empty frame.
for first_row in df.head(1).itertuples():
    print(first_row)

Pandas(Index=0, label_clean=' !ADVERSE EVENT DRUG INTERVENTION OPTION! ', UID=1)


In [9]:
from langchain.docstore.document import Document

docs = []

# itertuples() yields namedtuples such as
#   Pandas(Index=0, label_clean=' !ADVERSE EVENT DRUG INTERVENTION OPTION! ', UID=1)
# so the fields are read by name instead of by position.
for row in df.itertuples():
    # The raw label is padded with whitespace; strip it once and reuse it.
    label = row.label_clean.strip()
    doc = Document(
        page_content=label,
        metadata={
            "uid": row.UID,      # later used as the vector ID
            "index": row.Index,  # DataFrame row index
            "text": label,       # same text as page_content, kept in metadata
        },
    )
    docs.append(doc)

print(docs[0])


page_content='!ADVERSE EVENT DRUG INTERVENTION OPTION!' metadata={'uid': 1, 'index': 0, 'text': '!ADVERSE EVENT DRUG INTERVENTION OPTION!'}


## Embedding and Vector DB Setup

Initialize our embedding model:

In [10]:
import os
from getpass import getpass
# from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings

model_name = "text-embedding-ada-002"

# Prefer the key from the environment; otherwise prompt for it interactively
# (keys are issued at platform.openai.com).
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    OPENAI_API_KEY = getpass("OpenAI API Key: ")

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY,
    disallowed_special=(),
)

Now we create our vector DB to store our vectors. For this we need a [free Pinecone API key](https://app.pinecone.io) — the key is available under "API Keys" in the left navbar of the Pinecone dashboard.

In [11]:
from pinecone import Pinecone

# Pinecone API key: read from the environment when set, otherwise prompt
# (keys are issued at app.pinecone.io).
api_key = os.getenv("PINECONE_API_KEY")
if not api_key:
    api_key = getpass("Enter your Pinecone API key: ")

# client used for all index management calls below
pc = Pinecone(api_key=api_key)

  from tqdm.autonotebook import tqdm


Now we set up our index specification, which lets us define the cloud provider and region where we want to deploy our index. You can find a list of all [available providers and regions here](https://docs.pinecone.io/docs/projects).

In [12]:
from pinecone import ServerlessSpec

# Serverless deployment target for the index: cloud provider and region.
spec = ServerlessSpec(cloud="aws", region="us-west-2")

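When creating the index, we set `dimension` equal to the dimensionality of Ada-002 (`1536`) and choose a `metric` that is compatible with Ada-002. Ada-002 embeddings are normalized to unit length, so `cosine`, `dotproduct`, and `euclidean` all produce the same nearest-neighbour ranking; the cell below uses `euclidean`. We also pass our `spec` to the index initialization.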

In [15]:
import time

index_name = "biolincc-labels-001"

# collect the names of the indexes that pc.list_indexes() reports
existing_indexes = []
for index_info in pc.list_indexes():
    existing_indexes.append(index_info["name"])

# create the index only when it is missing (expected on the very first run)
if index_name not in existing_indexes:
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of ada 002
        metric='euclidean',
        spec=spec
    )
    # poll once per second until the new index reports itself as ready
    while True:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# open a handle to the index, pause briefly, then display its stats
index = pc.Index(index_name)
time.sleep(1)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

Populate our index:

In [13]:
# Number of documents that will be embedded and upserted.
len(docs)

259880

In [8]:
# if you want to speed things up to follow along
#docs = docs[:5000]

In [16]:
from tqdm.auto import tqdm

batch_size = 100

# Walk through docs in windows of batch_size, embedding and upserting each one.
for start in tqdm(range(0, len(docs), batch_size)):
    # clamp the window so the final batch ends at the last document
    stop = min(len(docs), start + batch_size)
    chunk = docs[start:stop]
    # embed the page text of every document in this window
    vectors = embed.embed_documents(texts=[d.page_content for d in chunk])
    # each upserted item is (id, embedding, metadata); the id is the UID as a string
    index.upsert(
        vectors=zip(
            [f"{doc.metadata['uid']}" for doc in chunk],
            vectors,
            [d.metadata for d in chunk],
        )
    )

100%|██████████| 2599/2599 [1:37:23<00:00,  2.25s/it]
