In [1]:
import sys
sys.path.append("..")

import json

import numpy as np
import pandas as pd
import torch

from qdrant_client import QdrantClient
from qdrant_client.http import models

from tqdm.notebook import tqdm

from config import QDRANT_HOST, QDRANT_PORT, QDRANT_API_KEY, OPENAI_API_KEY, DATA, COLLECTION_NAME


## Connect to Qdrant and create collection


In [12]:
qdrantClient = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, api_key=QDRANT_API_KEY)
qdrantClient.recreate_collection(
    collection_name="paper_test_extracted",
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
)


True

## Load Data into DataFrame


In [3]:
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client.scientific_articles
articles_collection = db.articles

articles_json = list(articles_collection.find({"sentences": {"$exists": True}}))


rows = []
for paper in tqdm(articles_json):
    for sentence in paper['sentences']:
        rows.append(
            (
                paper["title"],
                paper["link"],
                sentence
            )
        )

df = pd.DataFrame(data=rows, columns=["title", "link", "sentence"])

  0%|          | 0/115 [00:00<?, ?it/s]

In [6]:
df[0:10]

Unnamed: 0,title,link,sentence
0,Smart wound dressing for advanced wound manage...,https://www.sciencedirect.com/science/article/...,The skin plays an important role in detecting...
1,Smart wound dressing for advanced wound manage...,https://www.sciencedirect.com/science/article/...,The all-in-one smart wound dressings integrat...
2,Smart wound dressing for advanced wound manage...,https://www.sciencedirect.com/science/article/...,Local vasodilatation and angiogenesis can incr...
3,Smart wound dressing for advanced wound manage...,https://www.sciencedirect.com/science/article/...,Oxygen is necessary for wound healing as it s...
4,Smart wound dressing for advanced wound manage...,https://www.sciencedirect.com/science/article/...,The color change of the pH sensors was acquir...
5,Smart wound dressing for advanced wound manage...,https://www.sciencedirect.com/science/article/...,A novel bacterial self-reporting platform was...
6,Smart wound dressing for advanced wound manage...,https://www.sciencedirect.com/science/article/...,The pH data in their study could not be wirel...
7,Smart wound dressing for advanced wound manage...,https://www.sciencedirect.com/science/article/...,The reported smart dressing could monitor the...
8,Smart wound dressing for advanced wound manage...,https://www.sciencedirect.com/science/article/...,The long-term continuous monitoring of wound ...
9,Smart wound dressing for advanced wound manage...,https://www.sciencedirect.com/science/article/...,W wound dressing can release drugs or deliver ...


# Instanciate Model

In [7]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "msmarco-MiniLM-L-6-v3",
    device="cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu",
)

## Vectorize docs


In [8]:
vectors = []

for doc in tqdm(df["sentence"].to_list()):
    vectors.append(model.encode(doc))

  0%|          | 0/797 [00:00<?, ?it/s]

In [9]:
print(len(vectors))

797


In [None]:
nsplit = 100
batch_load_step = int(df.shape[0]/nsplit)

for index in list(range(nsplit)):
    print(vectors[batch_load_step*index:batch_load_step*(index+1)])

In [13]:
qdrantClient.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='papers'), CollectionDescription(name='paper_test_extracted')])

In [72]:
qdrantClient.delete_collection(collection_name="papers")

True

In [73]:
client.recreate_collection(
    collection_name="papers",
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
)

True

## Load data on Qdrant search engine

In [None]:
for index in list(range(nsplit)):
    for _, row in df.iloc[list(range(batch_load_step*index, batch_load_step*(index+1)))].iterrows():
        print(row['title'])

In [None]:
for vec in vectors:
    print((list(vec)))

In [18]:
for index in tqdm(list(range(nsplit))):
    qdrantClient.upsert(
        collection_name="paper_test_extracted",
        points=models.Batch(
            ids=list(range(batch_load_step*index, batch_load_step*(index+1))),
            payloads=[
                {
                    "link": row["link"],
                    "title": row["title"],
                    "sentence": row["sentence"],
                }
                for _, row in df.iloc[list(range(batch_load_step*index, batch_load_step*(index+1)))].iterrows()
            ],
            vectors= [vec.tolist() for vec in vectors[batch_load_step*index:batch_load_step*(index+1)]],
        ),
    )

  0%|          | 0/100 [00:00<?, ?it/s]

In [35]:
query = "methods for artificial intelligence image processing"
%store query

Stored 'query' (str)


In [36]:
similar_docs = qdrantClient.search(
        collection_name="paper_test_extracted",
        query_vector=model.encode(query),
        limit=20,
        append_payload=True,
    )

%store similar_docs

Stored 'similar_docs' (list)


In [37]:
for doc in similar_docs: 
    print("[" + str(doc.score) + '] ' + doc.payload['sentence'])

[0.5954349]  Researchers are actively using AI techniques for data leakage prevention, intelligent e-mail protection, malicious domain blocking and reporting to ensure data confidentiality, integrity and availability. Data leakage prevention deals with the detection and protection of data breaches, exfiltration or the unwanted destruction of data. AI techniques are used for monitoring data access, data movement and user activity
[0.5064325]  The use of artificial intelligence (AI) in medical imaging is becoming increasingly popular as the technology improves. In the United States, under the regulatory authority of the Food and Drug Administration (FDA) every clinical AI software system is regulated as a medical device
[0.49403715]  The effect-visualizations of each parameter by SHAP, an AI model analysis tool, on the ANN114 data-driven model, can be explained by the relatively slower electrochemical reaction rate. The multi-objective optimization process was conducted by PSO with a com