In [1]:
## Data Ingestion
from langchain_community.document_loaders import TextLoader
loader = TextLoader("speech.txt")
text_documents = loader.load()
text_documents

[Document(page_content='Virat Kohli (Hindi pronunciation: [ʋɪˈɾɑːʈ ˈkoːɦli] ⓘ; born 5 November 1988) is an Indian international cricketer and the former captain of the Indian national cricket team. \nHe is a right-handed batsman and an occasional medium-fast bowler. He currently represents Royal Challengers Bengaluru in the IPL and Delhi in domestic cricket. \nKohli is widely regarded as one of the greatest batsmen of all time.[3] He holds the record as the highest run-scorer in T20I and IPL, \nranks third in ODI, and stands as the fourth-highest in international cricket.[4] He also holds the record for scoring the most centuries in ODI cricket and stands second in the list of most international centuries scored. \nKohli was a member of the Indian team that won the 2011 Cricket World Cup, 2013 ICC Champions Trophy, and captained India to win the ICC Test mace three consecutive times in 2017, 2018, and 2019.[5]', metadata={'source': 'speech.txt'})]

In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")

In [3]:
from langchain_community.document_loaders import WebBaseLoader
import bs4

## Load, chunk and index the content of the html page

loader = WebBaseLoader(web_path= ("https://lilianweng.github.io/posts/2023-06-23-agent/ ",),
bs_kwargs=dict(parse_only=bs4.SoupStrainer(
    class_=("post-title","post-content","post-header")
)),)

text_documents = loader.load()

In [4]:
text_documents

[Document(page_content='', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/ '})]

In [5]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("LoRA- Low Rank Adaption LLMs.pdf")
LoRA = loader.load()

In [6]:
LoRA

[Document(page_content='LORA: L OW-RANK ADAPTATION OF LARGE LAN-\nGUAGE MODELS\nEdward Hu∗Yelong Shen∗Phillip Wallis Zeyuan Allen-Zhu\nYuanzhi Li Shean Wang Lu Wang Weizhu Chen\nMicrosoft Corporation\n{edwardhu, yeshe, phwallis, zeyuana,\nyuanzhil, swang, luw, wzchen }@microsoft.com\nyuanzhil@andrew.cmu.edu\n(Version 2)\nABSTRACT\nAn important paradigm of natural language processing consists of large-scale pre-\ntraining on general domain data and adaptation to particular tasks or domains. As\nwe pre-train larger models, full ﬁne-tuning, which retrains all model parameters,\nbecomes less feasible. Using GPT-3 175B as an example – deploying indepen-\ndent instances of ﬁne-tuned models, each with 175B parameters, is prohibitively\nexpensive. We propose Low-RankAdaptation, or LoRA, which freezes the pre-\ntrained model weights and injects trainable rank decomposition matrices into each\nlayer of the Transformer architecture, greatly reducing the number of trainable pa-\nrameters for downs

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
documents = text_splitter.split_documents(LoRA)
documents

[Document(page_content='LORA: L OW-RANK ADAPTATION OF LARGE LAN-\nGUAGE MODELS\nEdward Hu∗Yelong Shen∗Phillip Wallis Zeyuan Allen-Zhu\nYuanzhi Li Shean Wang Lu Wang Weizhu Chen\nMicrosoft Corporation\n{edwardhu, yeshe, phwallis, zeyuana,\nyuanzhil, swang, luw, wzchen }@microsoft.com\nyuanzhil@andrew.cmu.edu\n(Version 2)\nABSTRACT\nAn important paradigm of natural language processing consists of large-scale pre-\ntraining on general domain data and adaptation to particular tasks or domains. As\nwe pre-train larger models, full ﬁne-tuning, which retrains all model parameters,\nbecomes less feasible. Using GPT-3 175B as an example – deploying indepen-\ndent instances of ﬁne-tuned models, each with 175B parameters, is prohibitively\nexpensive. We propose Low-RankAdaptation, or LoRA, which freezes the pre-\ntrained model weights and injects trainable rank decomposition matrices into each\nlayer of the Transformer architecture, greatly reducing the number of trainable pa-', metadata={'source

In [9]:
## VEctor Embedding and Vector stores
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma, LanceDB, FAISS
db = Chroma.from_documents(documents[:2],OllamaEmbeddings())

In [None]:
## Chroma Vector database
query = "Who are the authors of LoRA research Paper"
result = db.similarity_search(query)
result[0].page_content

In [None]:
## Faiss vector database
db1 = FAISS.from_documents(documents[:2],OllamaEmbeddings())
query = "Who are the authors of LoRA research Paper"
result = db1.similarity_search(query)
result[0].page_content

In [None]:
## Lance vector database
db2 = Lancedb.from_documents(documents[:2],OllamaEmbeddings())
query = "Who are the authors of LoRA research Paper"
result = db2.similarity_search(query)
result[0].page_content