In [16]:
import sys
sys.path.append("..")

import json

import numpy as np
import pandas as pd
import torch

from qdrant_client import QdrantClient
from qdrant_client.http import models

from tqdm.notebook import tqdm

from config import QDRANT_HOST, QDRANT_PORT, QDRANT_API_KEY, OPENAI_API_KEY, DATA, COLLECTION_NAME


## Connect to Qdrant and create collection


In [17]:
# Connect to the Qdrant instance described by the settings imported from `config`.
client = QdrantClient(
    host=QDRANT_HOST,
    port=QDRANT_PORT,
    api_key=QDRANT_API_KEY,
)

# Collection setup is kept commented out in this cell.
# client.recreate_collection(
#     collection_name=COLLECTION_NAME,
#     vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
# )


## Load Data into DataFrame


In [None]:
# Read the pre-processed articles. The encoding is passed explicitly because
# `open` otherwise uses the platform's locale encoding, which can mis-decode
# non-ASCII characters in the article text on some systems.
with open(f"{DATA}/processed/articles.json", "r", encoding="utf-8") as file:
    articles_json = json.load(file)

# One tuple per paper, in the column order used for the DataFrame below.
# Plain indexing keeps the KeyError for records that lack one of the fields.
rows = [
    (paper["title"], paper["link"], paper["abstract"], paper["body"])
    for paper in tqdm(articles_json)
]

df = pd.DataFrame(data=rows, columns=["title", "link", "abstract", "body"])

# Keep only articles whose body has more than 15 whitespace-separated tokens.
df = df[df["body"].str.split().str.len() > 15]


## Instantiate Model

In [18]:
from sentence_transformers import SentenceTransformer

# Load the embedding model on the first available backend:
# CUDA if present, otherwise Apple MPS, otherwise the CPU.
model = SentenceTransformer(
    "msmarco-MiniLM-L-6-v3",
    device=(
        "cuda"
        if torch.cuda.is_available()
        else ("mps" if torch.backends.mps.is_available() else "cpu")
    ),
)

## Vectorize docs


In [None]:
# Embed every article body. All texts are handed to `encode` in a single call
# so the model can process them in batches instead of one call per document;
# `show_progress_bar` replaces the manual tqdm loop.
bodies = df["body"].to_list()

# `list(...)` keeps `vectors` a list holding one 1-D array per document, which
# is the shape the later cells rely on (slicing, `len`, `vec.tolist()`).
vectors = list(model.encode(bodies, show_progress_bar=True, convert_to_numpy=True))

In [7]:
# Sanity check: number of embeddings currently held in `vectors`.
print(len(vectors))

1103


In [None]:
# Number of batches the corpus is cut into.
nsplit = 100
# Rows per batch, rounded down. Rows at positions >= nsplit * batch_load_step
# are therefore not covered by any of the batches built from these two values.
batch_load_step = df.shape[0] // nsplit

# Debug output: show the embeddings that belong to each batch.
for index in range(nsplit):
    lower = batch_load_step * index
    upper = lower + batch_load_step
    print(vectors[lower:upper])

In [4]:
# Ask the Qdrant server which collections it currently holds.
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='papers')])

In [72]:
# Drop the collection. The name is taken from `COLLECTION_NAME` (imported from
# `config`) rather than a string literal, so this cell always targets the same
# collection that the upsert cell writes to.
client.delete_collection(collection_name=COLLECTION_NAME)

True

In [73]:
# (Re)create an empty collection using cosine distance. The name is taken from
# `COLLECTION_NAME` (imported from `config`) rather than a string literal, so
# the collection created here is the one the upsert cell writes to.
# NOTE: `size` has to match the length of the vectors produced by `model`.
client.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=models.VectorParams(
        size=384,
        distance=models.Distance.COSINE,
    ),
)

True

## Load data into the Qdrant search engine

In [None]:
# Debug output: list the titles that fall into each upload batch.
for index in range(nsplit):
    first = batch_load_step * index
    batch_titles = df["title"].iloc[first:first + batch_load_step]
    for title in batch_titles:
        print(title)

In [None]:
# Debug output: print every embedding as a plain Python list.
for embedding in vectors:
    as_list = list(embedding)
    print(as_list)

In [None]:
# Upload all documents to Qdrant in batches.
#
# Point ids are the positional row numbers of `df`; each payload carries the
# article fields and each vector is the embedding at the same position.
#
# Iterating over fixed `nsplit` batches of `batch_load_step` rows would skip the
# trailing `len(df) % nsplit` rows (and upload nothing at all when there are
# fewer rows than `nsplit`), so the loop instead walks to the end of the data
# and lets the last batch be shorter.

# Payloads and vectors are paired purely by position, so they must line up.
assert len(vectors) == len(df), "`vectors` and `df` are out of sync; re-run the vectorize cell"

total_points = len(df)
# `batch_load_step` is 0 when there are fewer rows than `nsplit`.
upload_batch_size = max(batch_load_step, 1)

for start in tqdm(range(0, total_points, upload_batch_size)):
    stop = min(start + upload_batch_size, total_points)
    batch_rows = df.iloc[start:stop]
    client.upsert(
        collection_name=COLLECTION_NAME,
        points=models.Batch(
            ids=list(range(start, stop)),
            payloads=batch_rows[["body", "abstract", "title", "link"]].to_dict(orient="records"),
            # Convert the NumPy arrays to plain lists of floats for the request body.
            vectors=[vec.tolist() for vec in vectors[start:stop]],
        ),
    )

In [19]:
# Natural-language question used to search the collection.
query = "how is artificial intelligence used in medical applications?"
# Persist `query` with IPython's %store so it can be restored later via `%store -r`.
%store query

Stored 'query' (str)


In [20]:
similar_docs = client.search(
        collection_name="papers",
        query_vector=model.encode(query),
        limit=6,
        append_payload=True,
    )

%store similar_docs

Stored 'similar_docs' (list)


In [21]:
# Show each hit as "[score] title".
for hit in similar_docs:
    score_tag = f"[{hit.score}] "
    print(score_tag + hit.payload["title"])

[0.6329287] Research article
Physicians’ attitudes and knowledge toward artificial intelligence in medicine: Benefits and drawbacks
[0.5812633] Knowledge, attitudes and practices towards artificial intelligence (AI) among radiologists in Saudi Arabia
[0.5647462] Patient views on the implementation of artificial intelligence in radiotherapy
[0.55050915] Case of the Season: Artificial Intelligence in Clinical Practice—Large Vessel Occlusion Triage in Stroke Imaging
[0.5367967] Applications of Artificial Intelligence in the Radiology Roundtrip: Process Streamlining, Workflow Optimization, and Beyond
[0.5233896] Research article
An architectural approach to modeling artificial general intelligence
