In [2]:
!pip install -q langchain
!pip install -q jq
!pip install -q chromadb
!pip install -q sentence_transformers

In [3]:
import os, json
from typing import Any, List, Mapping, Optional

import torch
import torch.nn as nn

import pandas as pd

from transformers import AutoTokenizer, RobertaModel, RobertaConfig

from langchain import PromptTemplate, LLMChain
from langchain.chat_models import ChatOpenAI

from langchain.chat_models import ChatOpenAI

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM

from langchain.document_loaders import csv_loader
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

In [5]:
from langchain.document_loaders import JSONLoader
from tqdm.auto import tqdm

In [6]:
def get_text_chunks(text, chunk_size=1000, chunk_overlap=50):
    """Split documents into overlapping chunks.

    Args:
        text: The documents to split. Despite the name this is handed to
            ``RecursiveCharacterTextSplitter.split_documents``, so it must be
            whatever that method accepts (the notebook passes the result of
            ``loader.load()``), not a raw string.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 1000,
            the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 50,
            the value that used to be hard-coded.

    Returns:
        The value returned by ``split_documents`` for ``text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(text)

In [7]:
# Root folders of the training and validation splits.
# Both keep a trailing slash so that appending a sub-folder name yields a valid path.
val_path = "/content/drive/MyDrive/Colab_Notebooks/NLP/bio_data/Validation/"
train_path = "/content/drive/MyDrive/Colab_Notebooks/NLP/bio_data/Training/"

In [8]:
# Names of the entries found directly inside each split's root folder.
train_folder, val_folder = os.listdir(train_path), os.listdir(val_path)

In [9]:
def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy selected fields of a JSON record into a document's metadata.

    Used as the ``metadata_func`` callback of ``JSONLoader``.

    Args:
        record: The parsed JSON record. The fields read are ``relation_info``
            (only its first element's ``subjectText`` / ``objectText``),
            ``publisher`` and ``journal``.
        metadata: The metadata dict to extend. It is updated in place.

    Returns:
        The same ``metadata`` dict with ``subject``, ``object``, ``publisher``
        and ``journal`` set. A field that is absent from ``record`` (including
        a missing or empty ``relation_info``) is stored as an empty string
        instead of raising, so a single incomplete file does not abort a long
        loading loop.
    """
    # Fall back to one empty relation when the list is missing, None or empty.
    relations = record.get("relation_info") or [{}]
    first_relation = relations[0]

    metadata["subject"] = first_relation.get("subjectText", "")
    metadata["object"] = first_relation.get("objectText", "")
    metadata["publisher"] = record.get("publisher", "")
    metadata["journal"] = record.get("journal", "")

    return metadata

In [9]:
# Load every training JSON file.
# train_list gets one entry per file; each entry is the list returned by loader.load().
train_list = []

for i in sorted(train_folder):
    folder_path = os.path.join(train_path, i)
    # Skip stray files sitting next to the sub-folders: os.listdir would raise on them.
    if not os.path.isdir(folder_path):
        continue

    train_name = os.listdir(folder_path)

    # Sorted order keeps the loading order stable across runs; desc labels each bar.
    for j in tqdm(sorted(train_name), desc=i):
        path = os.path.join(folder_path, j)
        loader = JSONLoader(
            file_path=path,
            jq_schema='.',
            content_key="text",
            metadata_func=metadata_func
        )
        train_list.append(loader.load())

  0%|          | 0/3200 [00:00<?, ?it/s]

  0%|          | 0/3200 [00:00<?, ?it/s]

  0%|          | 0/3200 [00:00<?, ?it/s]

  0%|          | 0/3200 [00:00<?, ?it/s]

  0%|          | 0/3200 [00:00<?, ?it/s]

In [10]:
# Load every validation JSON file.
# val_list gets one entry per file; each entry is the list returned by loader.load().
val_list = []

for i in sorted(val_folder):
    folder_path = os.path.join(val_path, i)
    # Skip stray files sitting next to the sub-folders: os.listdir would raise on them.
    if not os.path.isdir(folder_path):
        continue

    val_name = os.listdir(folder_path)

    # Sorted order keeps the loading order stable across runs; desc labels each bar.
    for j in tqdm(sorted(val_name), desc=i):
        path = os.path.join(folder_path, j)
        loader = JSONLoader(
            file_path=path,
            jq_schema='.',
            content_key="text",
            metadata_func=metadata_func
        )
        val_list.append(loader.load())

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

In [11]:
# Page content of the first document loaded from each validation file.
output = [docs[0].page_content for docs in val_list]
# Show the first one as the cell output.
output[0]

'The Contribution of Chemoattractant GPCRs, Formylpeptide Receptors, to Inflammation and Cancer A hallmark of inflammatory responses is leukocyte mobilization, which is mediated by pathogen and host released chemotactic factors that activate Gi-protein-coupled seven-transmembrane receptors (GPCRs) on host cell surface. Formylpeptide receptors (FPRs, Fprs in mice) are members of the chemoattractant GPCR family, shown to be critical in myeloid cell trafficking during infection, inflammation, immune responses, and cancer progression. Accumulating evidence demonstrates that both human FPRs and murine Fprs are involved in a number of patho-physiological processes because of their expression on a wide variety of cell types in addition to myeloid cells. The unique capacity of FPRs (Fprs) to interact with numerous structurally unrelated chemotactic ligands enables these receptors to participate in orchestrated disease initiation, progression, and resolution. One murine Fpr member, Fpr2, and it

In [13]:
# Chunk the documents of every validation file and gather all chunks in one flat list.
val_doc = []

for docs in val_list:
    val_doc.extend(get_text_chunks(docs))

In [14]:
# Number of chunks collected in val_doc.
len(val_doc)

96205

In [16]:
# Inspect the metadata attached to the first chunk.
val_doc[0].metadata

{'source': '/content/drive/MyDrive/Colab_Notebooks/NLP/bio_data/Validation/VL_Breast cancer(유방암)_1/PMC6993212.json',
 'seq_num': 1,
 'subject': 'murine Fprs',
 'object': 'myeloid cells',
 'publisher': 'Frontiers Media S.A.',
 'journal': 'Frontiers in Endocrinology'}

In [17]:
import chromadb
from chromadb.utils import embedding_functions

In [18]:
# Chroma's default embedding function, passed to the collection below.
default_ef = embedding_functions.DefaultEmbeddingFunction()
# Persistent Chroma client storing its data under the given path.
client = chromadb.PersistentClient(
    path="/content/drive/MyDrive/Colab_Notebooks/NLP/bioDB"
)

In [19]:
# Open the "bio_test" collection, creating it on the first run.
# create_collection raises once the collection already exists in the persistent
# store, which made this cell fail on every re-run; get_or_create_collection
# covers both cases and makes the separate get_collection call unnecessary.
collection = client.get_or_create_collection(name="bio_test", embedding_function=default_ef)

In [23]:
# The data set is too large to index in full: 2000 chunks were inserted earlier,
# and this cell adds only 20 more, starting at index 3000.
start, count = 3000, 20
batch = val_doc[start:start + count]

# The previous loop checked `idx == 20` after adding, so it inserted 21 chunks.
# Slicing gives exactly `count` chunks, and one batched add replaces 21 single calls.
if batch:
    collection.add(
        documents=[chunk.page_content for chunk in batch],
        metadatas=[chunk.metadata for chunk in batch],
        # Ids keep the original scheme: "id" followed by the chunk's index in val_doc.
        ids=["id" + str(start + offset) for offset in range(len(batch))],
    )

0it [00:00, ?it/s]

In [24]:
# Ask the collection for the 10 best matches to the question.
doc = collection.query(n_results=10, query_texts="murine Fprs이 뭐야?")