## RAG Pipeline with a Vector Database

In [18]:
## Data Ingestion
# Load a local plain-text file into a list of LangChain Document objects.

from langchain_community.document_loaders import TextLoader

# The file is read as UTF-8; the path is relative to the notebook's working directory.
loader = TextLoader(file_path="../cc.txt", encoding="utf-8")
text_doc = loader.load()
text_doc

[Document(page_content="6.1+ years of industry experience in Python Scripting, Django, Flask,\nFast-API, Postgres SQL, MongoDB, Elastic Search, MySQL, GCP,\nRabbitMQ, Big Query, Pub-Sub, AlloyDB, Jira, Agile, Micro-Services &\nMainframe Operations, Tools and Techniques- WebEnabler,\nOperation Sentinel Console, MISER\nPython Developer\n66 Degrees Internation PVT. LTD.(Formally QWINIX\nTECHNOLOGIES PVT. LTD), Mysuru\nPLATFORM: Python, Fast-API, Flask, GCP, GIT, Docker,\nMicro-Services.\nThese technologies have been used for Cloud\nNative and Product Modernization, which is\nbeing designed to migrate the Legacy System\nDatabase records to Google Cloud\nPlatform(GCP) to solve the real time cloud\nproblems for Large Industries.\nPROJECT DESCRIPTIONS:\nData Validation Tools(DVT) is an open-source\nrepository provided by Google, which allows the\nsmooth Migration of Legacy Databases across the\nGoogle Cloud Platform Databases.\nThis is micro-service has been developed to\nreceived the various

In [19]:
import os

from dotenv import load_dotenv

# Pull variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# os.getenv returns None when the variable is absent, and assigning None into
# os.environ fails with an opaque "TypeError: str expected, not NoneType".
# Check explicitly so a missing key fails with an actionable message instead.
groq_api_key = os.getenv("GROQ_API_KEY")
if groq_api_key is None:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )
os.environ["GROQ_API_KEY"] = groq_api_key

In [21]:
## Web Based Loader
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Only elements carrying one of these CSS classes are handed to the HTML parser.
content_filter = bs4.SoupStrainer(
    class_=("wrap", "content", "sidebar", "table-of-contents")
)

loader = WebBaseLoader(
    web_path="https://kb.objectrocket.com/elasticsearch/how-to-use-python-helpers-to-bulk-load-data-into-an-elasticsearch-index",
    bs_kwargs={"parse_only": content_filter},
)
web_doc = loader.load()
web_doc

[Document(page_content='\n\n\n\nContents\n\nIntroduction\nPrerequisites\nGet the JSON data “actions” object ready\nTwo key benefits of helpers of bulk API’s\nJSON file bulk document indexing – use a custom generator\nConclusion\nPython helpers bulk load elasticsearch: The entire code\n\n\n\n\n\n\n\nResourcesElasticsearchHow to use Python helpers to bulk load data into an Elasticsearch index\n\n\nHow to use Python helpers to bulk load data into an Elasticsearch index\n\n\n\n\nWritten by Data Pilot\nApril 09, 2019\n\n\n\n\nElasticsearch\nPython\n\n\n\n\n\n Subscribe\n\t\t\n\n\n\n Like\n\t\t\n\n \n\n\n\nHave a Database Problem?  Speak with an Expert for Free\n\t\t\n Get Started >>\n\n\n\n\nIntroduction\nPython helpers do exactly what they say: They help you get things done. One of the most efficient ways to streamline indexing is through the helpers.bulk method. Indexing large datasets without putting them into memory is the key to expediting search results and saving system resources. Le

In [22]:
## PDF Reader
# Load a local PDF into a list of LangChain Document objects.
from langchain_community.document_loaders import PyPDFLoader

# The path is relative to the notebook's working directory.
loader = PyPDFLoader(file_path="../bb.pdf")
pdf_doc = loader.load()
pdf_doc

incorrect startxref pointer(3)
parsing for Object Streams


[Document(page_content='Python Developer\nWork History\nContact\nWWW\nhttps://bold.pro\n/my/kaoushik-kumar-\n230928234654/465\nLinkedIn\nTechnical Profile\nKaoushikKumar\n6.1+ years of industry experience in Python Scripting, Django, Flask,\nFast-API, Postgres SQL, MongoDB, Elastic Search, MySQL, GCP,\nRabbitMQ, Big Query, Pub-Sub, AlloyDB, Jira, Agile, Micro-Services &\nMainframe Operations, Tools and Techniques- WebEnabler,\nOperation Sentinel Console, MISER\nPython Developer\n66 Degrees Internation PVT. LTD.(Formally QWINIX\nTECHNOLOGIES PVT. LTD), Mysuru\nPLATFORM: Python,Fast-API,Flask, GCP, GIT, Docker,\nMicro-Services.\nThese technologies have been used for Cloud\nNative and Product Modernization, which is\nbeing designed to migrate the Legacy System\nDatabase records to Google Cloud\nPlatform(GCP) to solve the real time cloud\nproblems for Large Industries.\nPROJECT DESCRIPTIONS:\nData Validation Tools(DVT) is an open-source\nrepository provided by Google, which allows the\nsmo

In [23]:
# Split the loaded PDF documents into overlapping chunks for embedding.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
text_split = text_splitter.split_documents(documents=pdf_doc)
text_split

[Document(page_content='Python Developer\nWork History\nContact\nWWW\nhttps://bold.pro\n/my/kaoushik-kumar-\n230928234654/465\nLinkedIn\nTechnical Profile\nKaoushikKumar\n6.1+ years of industry experience in Python Scripting, Django, Flask,\nFast-API, Postgres SQL, MongoDB, Elastic Search, MySQL, GCP,\nRabbitMQ, Big Query, Pub-Sub, AlloyDB, Jira, Agile, Micro-Services &\nMainframe Operations, Tools and Techniques- WebEnabler,\nOperation Sentinel Console, MISER\nPython Developer\n66 Degrees Internation PVT. LTD.(Formally QWINIX\nTECHNOLOGIES PVT. LTD), Mysuru\nPLATFORM: Python,Fast-API,Flask, GCP, GIT, Docker,\nMicro-Services.\nThese technologies have been used for Cloud\nNative and Product Modernization, which is\nbeing designed to migrate the Legacy System\nDatabase records to Google Cloud\nPlatform(GCP) to solve the real time cloud\nproblems for Large Industries.\nPROJECT DESCRIPTIONS:\nData Validation Tools(DVT) is an open-source\nrepository provided by Google, which allows the\nsmo

In [24]:
## Vector Embedding and Vector Store
# Only HuggingFaceEmbeddings is used in this cell; the other two embedding
# classes are imported as alternatives.
from langchain_community.embeddings import (
    OpenAIEmbeddings,  # Ensure you have set the OPENAI_API_KEY in your environment
    OllamaEmbeddings,  # Ensure you have Ollama installed and running
    HuggingFaceEmbeddings,  # Ensure you have the HuggingFace model downloaded
)
from langchain_community.vectorstores import FAISS

# Embed every chunk with the sentence-transformers model and index the vectors in FAISS.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_documents(documents=text_split, embedding=embedding_model)
db

<langchain_community.vectorstores.faiss.FAISS at 0x27b997707d0>

In [25]:
# Search the vector store for the chunks most similar to the query text.
query = "problems for Large Industries"  # Replace with your query sentence or search term.

# NOTE: the variable name `retreival` (sic) is kept as-is in case other cells reference it.
retreival = db.similarity_search(query)

# Show the text of the first returned match.
top_match = retreival[0]
top_match.page_content

'problems for Large Industries.\nPROJECT DESCRIPTIONS:\nData Validation Tools(DVT) is an open-source\nrepository provided by Google, which allows the\nsmooth Migration of Legacy Databases across the\nGoogle Cloud Platform Databases.\nThis is micro-service has been developed to\nreceived the various payload as Databases\nconnection request which will be responsible for\nmigrating legacy system DBs to Cloud DB.\nPython is being used in backend to solve the\nchallenging problems of the Large Enterprises\ncompanies to structuring there DB.\nWorked on Fast-API, Flask, GCP, GitHub, Jira,\nDocker, AlloyDB.\n2023-03 -\n2023-09\nSenior Software Engineer\nImpact Big Data Analytics Pvt. Ltd, Bangalore\nPLATFORM: Python, Fast-API, PostgreSQL, GIT, Micro-\n2022-04 -\n2023-03\nAddress\nBangalore, Karnataka\n560036\nPhone\n+91 8608121704\nE-mail\nKaoushikkumarr@gmail.co\nm\nhttps://www.linkedin.com\n/in/kaoushik-kumar-\n99426060/\nPython\nFast-API\nFlask\nDjango'