In [95]:
import requests
import pandas
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pinecone import Pinecone
from supabase.client import Client, create_client
import datetime
from tqdm import tqdm
import pinecone
import os
from dotenv import load_dotenv
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

# Load key/value pairs from a .env file into the process environment so the
# os.getenv(...) lookups in later cells (PINECONE_API_KEY, PINECONE_ENV) can
# resolve. The call's return value is echoed as this cell's output.
load_dotenv()

True

In [3]:
# Names of the Pinecone index and of the namespace inside it that this
# notebook works against.
index_name, namespace = "cadmir", "cad1"

# Initialise the Pinecone client. Both settings are read from the environment
# so that no credential is written into the notebook itself.
pinecone.init(
    environment=os.environ.get("PINECONE_ENV"),
    api_key=os.environ.get("PINECONE_API_KEY"),
)

In [50]:
# Embedding model wrapper (library defaults) shared by the vector-store cells.
embedding = OpenAIEmbeddings()
# Disabled alternative backend: a Supabase-hosted vector store on the
# "knowledge" table, kept here for reference only.
# NOTE(review): SupabaseVectorStore is not imported in the import cell, so that
# import must be added before these lines can be re-enabled.
# supabase: Client = create_client(supabase_url=os.environ.get("SUPABASE_URL"), supabase_key=os.environ.get("SUPABASE_SERVICE_KEY"))
# store = SupabaseVectorStore(embedding=embedding, client=supabase, table_name="knowledge")
# store.similarity_search("miRNA",400)

In [43]:
# Handle to the Pinecone index. Uses the shared `index_name` constant instead
# of repeating the "cadmir" literal, so the index is configured in one place.
index = pinecone.Index(index_name)

# LangChain vector-store wrapper around that handle. Positional arguments are
# the index, the callable used to embed query strings, and the text key "text".
# NOTE(review): no namespace is passed here, whereas `cadmirIndex` is created
# with namespace="cad1" -- confirm that searching the default namespace is
# intended for this object.
pinecones = Pinecone(index, embedding.embed_query, "text")

In [29]:
# Display the retrieved Document objects (page_content plus metadata).
# NOTE(review): `docs` is only assigned further down, in the
# cadmirIndex.similarity_search(...) cell, so on a fresh kernel with
# "Restart & Run All" this cell raises NameError. Move it below that cell.
docs

[Document(page_content='2.1.3. MiR-1', metadata={'add_date': datetime.date(2023, 8, 28), 'doi': '10.3390/cells10113191'}),
 Document(page_content='This study also revealed that mir-21 expressions were evidently increased in geriatric subjects compared to youn- ger people, suggesting a signiﬁcant link with the aging pro- cess. Wang et al. and also our prior study have identiﬁed in elderly subjects remarkably elevated mir-21 and associa- tion with the cardiorenal syndrome and aging [14, 23].\n\nData Availability\n\nData are available on request.\n\nConflicts of Interest', metadata={'add_date': datetime.date(2023, 8, 28), 'doi': '10.1155/2022/9661940'}),
 Document(page_content='miR-21\n\n(e)\n\n(f)\n\n(g)\n\n(h)\n\n15\n\n0.8\n\n200\n\n400\n\n150\n\n300\n\n0.6\n\n10\n\n2 - l c B\n\nP B D\n\nR H\n\nP B S\n\n0.4\n\n100\n\n200\n\n5\n\n50\n\n100\n\n0.2\n\nr=–0.844 P=0.275\n\nr=0.703 P=0.275\n\nr=0.786 P=0.275\n\nr=0.833 P=0.275\n\n0\n\n0\n\n0\n\n0.0\n\n0\n\n1\n\n2\n\n3\n\n0\n\n1\n\n2\n\n3\n\n0

In [36]:
# Keep only the text body of every retrieved document, in retrieval order.
context_texts = [doc.page_content for doc in docs]

In [68]:
# Completion-style OpenAI LLM created with the library defaults.
llm = OpenAI()

# LangChain vector store bound to the already-populated Pinecone index.
# The index and namespace come from the shared `index_name` / `namespace`
# constants rather than repeated string literals, so changing the target
# only requires editing the configuration cell.
cadmirIndex = Pinecone.from_existing_index(
    index_name,
    embedding,
    "text",
    namespace=namespace,
)

In [89]:
# Question-answering chain over the Pinecone-backed store.
#   * a fresh OpenAI() LLM produces the answer,
#   * the retriever is asked for k=10 documents per query,
#   * the documents used are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    retriever=cadmirIndex.as_retriever(search_kwargs=dict(k=10)),
    return_source_documents=True,
)

In [106]:
# Run the QA chain on a throwaway query. The returned dict carries
# 'source_documents' next to 'query' and 'result' because the chain was built
# with return_source_documents=True.
qa({"query":"list some text, no references"})

{'query': 'list some text, no references',
 'result': " In this study, RNA-sequencing (RNA-Seq) was used to explore the transcriptome characteristic shift in pericardial adipose tissue (PAT) from humans with or without coronary artery disease (CAD). 30 patients with UA (18 males) and 15 healthy control (4 males) patients were recruited in Guang'annmen Hospital, Beijing, China. Total RNA was isolated from adipose tissue biopsies and used for real-time PCR. Image-Pro Plus software was employed to analyze the images. Cuffdiff was used to estimate the abundance of all transcripts based on the final transcriptome assembled from the RNA-Seq data. Finally, bioinformatics analyses were used to identify the transcriptome characteristic shift in PAT from humans with or without CAD, as well as the upregulation in inﬂammatory processes of PAT in CAD patients.",
 'source_documents': [Document(page_content='and wrote the draft together; M.S. prepared the ﬁgures. All authors have read and agreed to t

In [48]:
# Inspect the retriever that the `pinecones` store builds when no options are
# given; the echoed repr shows its search_type and (empty) search_kwargs.
pinecones.as_retriever()

VectorStoreRetriever(tags=['Pinecone'], metadata=None, vectorstore=<langchain.vectorstores.pinecone.Pinecone object at 0x1527b11e0>, search_type='similarity', search_kwargs={})

In [102]:
# Fetch the 10 stored chunks most similar to the query "mir-1"; these feed the
# page-generation chain. The namespace comes from the shared `namespace`
# constant instead of a repeated "cad1" literal, so it cannot drift from the
# configuration cell.
docs = cadmirIndex.similarity_search(query="mir-1", namespace=namespace, k=10)

In [104]:
# Prompt: the retrieved passages are substituted for {text}.
# The closing cue names the artefact actually requested. It previously read
# "CONCISE SUMMARY:", a leftover from a summarisation template that
# contradicted the "at least 900 words" / sectioned-page instructions above.
prompt_template = """Write a wikipedia style page for miRNA-1 using the following information. Use at least 900 words.
Include sections for
- Overview
- Function
- Clinical Significance
- Current Research

Here is the information:"{text}"
WIKIPEDIA-STYLE PAGE:"""
prompt = PromptTemplate.from_template(prompt_template)

# Chat model with a 16k-token context window so all retrieved passages fit
# into a single prompt; temperature=0 keeps the output as repeatable as the
# API allows.
# NOTE: this rebinds `llm`, which an earlier cell set to OpenAI().
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")
llm_chain = LLMChain(llm=llm, prompt=prompt)

# "Stuff" strategy: every document is inserted into the prompt's {text} slot
# and the LLM is called once.
stuff_chain = StuffDocumentsChain(
    llm_chain=llm_chain, document_variable_name="text"
)

# print() so the generated page is shown with real line breaks instead of a
# quoted string repr.
print(stuff_chain.run(docs))

miRNA-1 (miR-1) is a muscle-specific microRNA that plays a crucial role in various physiological and pathological processes. It is primarily expressed in cardiac and skeletal muscles and is involved in the development of embryonic stem cells and cardiomyocyte progenitor cells. MiR-1 is encoded by two distinct genes, miR-1-1 and miR-1-2, and exists in a cluster with miR-133. It regulates the expression of many cardiac transcription factors and signaling pathways, including myocardin, Nkx2.5, serum response factor (SRF), WNT, and FGF signaling pathways.

Functionally, miR-1 is involved in the differentiation of cardiac precursor cells and the regulation of calcium cycling in muscle tissue. It has been shown to be released into the circulation via exosomes and activate myeloid progenitor cells, leading to a systemic response to myocardial injury. MiR-1 has also been extensively studied as a potential biomarker for acute coronary syndrome (ACS). Meta-analyses have reported its diagnostic a