In [None]:
!pip install faiss-cpu transformers datasets sentence-transformers langchain langchain-community langchain-openai


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.2.14-py3-none-any.whl.metadata (2.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-com

In [None]:
import os
from uuid import uuid4
import faiss
from datasets import load_dataset
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.embeddings import SentenceTransformerEmbeddings
import numpy as np
import pandas as pd


In [None]:
# Build a FAISS vector store over the ICD-10-CM short descriptions.
# Each description is embedded exactly once (in batches) and the precomputed
# vectors are inserted directly, instead of being re-embedded on insertion.
dataset = load_dataset("TachyHealth/International_Classification_Diseases_Clinical_Modification_icd10cm_order_April_2024", split="train")

model = SentenceTransformer(
    "dunzhang/stella_en_400M_v5",
    trust_remote_code=True,
    device="cuda",
    config_kwargs={"use_memory_efficient_attention": False, "unpad_inputs": False}
)

# Prepare LangChain documents for the whole dataset: the description is the
# searchable text, the ICD code travels along as metadata.
documents = [
    Document(page_content=row['Short Description'], metadata={'code': row['Code']})
    for row in dataset
]
uuids = [str(uuid4()) for _ in documents]

print("=== Соберем доки для эмбедера ===")

# Batched encoding: one call over all texts instead of one model call per row.
texts = [doc.page_content for doc in documents]
embeddings = np.asarray(model.encode(texts, show_progress_bar=True), dtype='float32')
if embeddings.ndim != 2:
    raise ValueError("No embeddings were produced; is the dataset empty?")

# Take the dimensionality from the model output rather than hard-coding it,
# so switching the embedding model cannot silently desynchronise the index.
dimension = embeddings.shape[1]

# Exact (flat) L2 index on GPU 0. A flat index needs no training step.
# NOTE(review): StandardGpuResources / index_cpu_to_gpu require a GPU-enabled
# FAISS build — confirm the installed package provides them.
res = faiss.StandardGpuResources()  # Initialize GPU resources
index = faiss.IndexFlatL2(dimension)
gpu_index = faiss.index_cpu_to_gpu(res, 0, index)

# Initialise the FAISS vector store around the (still empty) GPU index.
vector_store2 = FAISS(
    embedding_function=model.encode,
    index=gpu_index,
    docstore=InMemoryDocstore({}),
    index_to_docstore_id={}
)
print("=== Добавим доки ===")
# Insert the precomputed vectors together with their texts/metadata/ids.
# The result is bound to a name so the cell does not dump every UUID as output.
added_ids = vector_store2.add_embeddings(
    text_embeddings=zip(texts, embeddings),
    metadatas=[doc.metadata for doc in documents],
    ids=uuids,
)
assert vector_store2.index.ntotal == len(documents), "index size does not match document count"

Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


=== Соберем доки для эмбедера ===




=== Добавим доки ===


['0ed49bbf-a1c3-4000-b47d-11815ddb9772',
 'c7c13f02-8832-411b-bbd9-bc109c0c019c',
 '362630ce-00ad-4cb5-ae4a-dbdc000671ba',
 'ac974a39-7a0e-4e59-97eb-f94e7f3cb5ca',
 '465fc405-fb9b-4d8b-807a-def0204e6436',
 '8d97d869-31a9-4c59-8f4b-330cc4479875',
 'd2c4f624-4174-4ea4-9f83-989251eb8494',
 'b3397d8f-3fde-4de2-aadb-66442935ff6d',
 'bd3e9072-0c8b-4f44-9526-966c9e8dd372',
 '855c97aa-64bb-4293-8c64-ee8daad8358a',
 'b3ce8e90-a8ca-4669-9954-9b4b252712c6',
 '643e5719-19af-4035-84aa-341466b0bc8c',
 'a86ec7b3-3a2c-458b-beaa-b26b72846e8b',
 'ed934c82-8813-47ad-a39a-384ba7d87f6f',
 '1cae3bf7-a0af-4a89-8c0d-5a4196e6820c',
 '81715769-9881-4ed0-a931-b3f536fe0a20',
 '087776d6-2248-4ac4-a87b-5932208b6c5d',
 '7f654568-4085-4797-9638-ae29be20edd4',
 'd1626948-de15-410c-8751-87d8193cb7ab',
 'abe0cf2f-4037-47ed-85e6-38000eb1803e',
 'b02de86c-ea0e-47f4-8526-256df890916c',
 '011bee93-91eb-4cf5-a1f1-dee22affe421',
 '6451ef0e-e170-40e4-8185-6e337342ab26',
 'f12a60ee-679d-493c-b298-83c2f1e88629',
 '92e3db2f-5de6-

In [None]:
# Copy the GPU index back to host memory and swap it into the vector store
# in one step; the store is then written to the local "faiss_index" directory.
vector_store2.index = index_cpu = faiss.index_gpu_to_cpu(gpu_index)
vector_store2.save_local("faiss_index")

In [None]:
# Also write the bare CPU index to its own file, using FAISS's native writer.
flat_index_path = "flat.index"
faiss.write_index(index_cpu, flat_index_path)

In [None]:
# Reload the saved vector store on CPU and run a sample nearest-neighbour query.
model = SentenceTransformer(
    "dunzhang/stella_en_400M_v5",
    trust_remote_code=True,
    device="cpu",
    config_kwargs={"use_memory_efficient_attention": False, "unpad_inputs": False}
)

# Load from the same directory the store was saved to ("faiss_index"), and pass
# model.encode — the same embedding callable used when the store was built — so
# text-based searches on `db` embed queries the same way.
# SECURITY: allow_dangerous_deserialization=True unpickles the docstore; this is
# acceptable only because the files were produced by this notebook. Never load
# an index from an untrusted source this way.
db = FAISS.load_local("faiss_index", model.encode, allow_dangerous_deserialization=True)

query = "What is the ICD-code for Cholera?"
# Convert the query to embedding before searching
query_embedding = model.encode(query)

# Each hit is a (Document, score) pair; only the document is reported.
res = db.similarity_search_with_score_by_vector(query_embedding, k=1)
for doc, _score in res:
    print(f"For {doc.page_content} code is: {doc.metadata['code']}")





Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


For Cholera due to Vibrio cholerae 01, biovar cholerae code is: A000
