In [1]:
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())
openai_api_key = os.environ["OPENAI_API_KEY"]

In [2]:
from langchain_openai import ChatOpenAI

chatmodel=ChatOpenAI(model = "gpt-3.5-turbo", temperature = 0.7, api_key=openai_api_key)

In [3]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader(r"C:\Users\HP\LangChain-Mastery\data\GD.txt")

loaded_data=loader.load()

In [4]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)


In [5]:
texts=text_splitter.create_documents(loaded_data[0].page_content)


In [6]:
len(texts)

3620

In [7]:
texts[0]

Document(metadata={}, page_content='1')

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=26,
    chunk_overlap=4
)

In [9]:
text = recursive_splitter.split_text(loaded_data[0].page_content)

In [10]:
text

['1. Basic Concepts',
 '- What is Gradient',
 'Descent?',
 'Gradient Descent is an',
 'an optimization algorithm',
 'used to minimize the loss',
 'function by iteratively',
 'updating model parameters',
 'in the direction of the',
 'the negative gradient of',
 'of the loss function with',
 'respect to the',
 'the parameters.',
 '- How does the learning',
 'rate affect the',
 'the convergence of',
 'of Gradient Descent?',
 'The learning rate',
 'determines the size of',
 'of the steps taken toward',
 'the minimum. A high',
 'learning rate may cause',
 'overshooting and',
 'and divergence, while a',
 'a low learning rate may',
 'may result in slow',
 'convergence or getting',
 'stuck in local minima.',
 '- What are the challenges',
 'of using a fixed learning',
 'rate?',
 'A fixed learning rate',
 'may not be suitable',
 'throughout the training',
 'process. It may start too',
 'too high, causing',
 'divergence, or too low,',
 'leading to slow',
 'convergence. It does not',
 'not adapt t

In [11]:
from langchain_openai import OpenAIEmbeddings
embeddings_model = OpenAIEmbeddings()

In [12]:
chunks_of_text =     [
        "Hi there!",
        "Hello!",
        "What's your name?",
        "Bond, James Bond",
        "Hello Bond!"
    ]

In [13]:
embeddings=embeddings_model.embed_documents(chunks_of_text)

In [14]:
embeddings

[[-0.02036280930042267,
  -0.0071092103607952595,
  -0.022863728925585747,
  -0.026253297924995422,
  -0.03750108554959297,
  0.021619616076350212,
  -0.006150736473500729,
  -0.009000767953693867,
  0.008492967113852501,
  -0.016630474478006363,
  0.026837268844246864,
  -0.007337720599025488,
  -0.013583669438958168,
  -0.024184010922908783,
  0.0064871544018387794,
  -0.020235858857631683,
  0.02428556978702545,
  -0.014751611277461052,
  0.01640196330845356,
  -0.01649082824587822,
  -0.00725520309060812,
  -0.008105769753456116,
  0.004706677980720997,
  -0.002047071699053049,
  -0.014840476214885712,
  -0.006007917691022158,
  -0.002086743712425232,
  -0.023028763011097908,
  0.01982961967587471,
  -0.03153442591428757,
  0.01284101139754057,
  0.011660374701023102,
  -0.008556442335247993,
  -0.009470484219491482,
  -0.001774128875695169,
  -0.027421239763498306,
  -0.008264456875622272,
  0.002086743712425232,
  0.024018974974751472,
  -0.008734173141419888,
  0.023485783487558

In [15]:
loaded_document = TextLoader(r'C:\Users\HP\LangChain-Mastery\data\ai_overview.txt').load()

In [16]:

from langchain.vectorstores import Chroma


In [17]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

In [18]:
chunks_of_text = text_splitter.split_documents(loaded_document)

In [19]:
vector_db = Chroma.from_documents(chunks_of_text, OpenAIEmbeddings())


In [20]:
vector_db

<langchain_community.vectorstores.chroma.Chroma at 0x1c1e37b1310>

# RAG 

In [21]:
import os
from langchain.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings

from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

from dotenv import load_dotenv

In [22]:
def load_documents(folder_path):
    docs = []
    for file in os.listdir(folder_path):
        path = os.path.join(folder_path, file)
        if file.endswith(".pdf"):
            loader = PyPDFLoader(path)
        elif file.endswith(".txt"):
            loader = TextLoader(path)
        elif file.endswith(".docx"):
            loader = Docx2txtLoader(path)
        else:
            continue
        docs.extend(loader.load())
    return docs


In [23]:
def split_documents(docs , chuck_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chuck_size,
        chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(docs)

In [24]:
def create_vectorstore(split_docs,persist_directory="chroma_store1"):
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma.from_documents(split_docs, embeddings , persist_directory=persist_directory)
    vectorstore.persist()
    return vectorstore

In [25]:
def setup_rag_chain(vectorstore):

    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    llm = ChatOpenAI(temperature=0.2)
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

In [26]:
def ask_question(rag_chain, query):
    response = rag_chain(query)
    print("\n🧠 Answer:\n", response["result"])
    print("\n📚 Sources:")
    for doc in response["source_documents"]:
        print("-", doc.metadata.get("source", "Unknown Source"))



In [27]:
folder_path = "./data"
documents = load_documents(folder_path)
print(f"✅ Loaded {len(documents)} raw documents")

✅ Loaded 4 raw documents


In [28]:
split_docs = split_documents(documents)
print(f"✂️ Split into {len(split_docs)} chunks")

✂️ Split into 12 chunks


In [29]:
vectorstore = create_vectorstore(split_docs)
print("📦 Embeddings stored in vector DB (Chroma)")

  embeddings = OpenAIEmbeddings()


📦 Embeddings stored in vector DB (Chroma)


  vectorstore.persist()


In [30]:
rag_chain = setup_rag_chain(vectorstore)

  llm = ChatOpenAI(temperature=0.2)


In [31]:
user_query = "What is usiness model, at least at first."
ask_question(rag_chain, user_query)

  response = rag_chain(query)



🧠 Answer:
 I'm sorry, but based on the provided context, there is no specific information related to a "business model" or its definition. If you can provide more details or context, I'd be happy to help answer your question.

📚 Sources:
- ./data\GD.txt
- ./data\GD.txt
- ./data\AI_ML.pdf


# extra features 

In [32]:
from langchain.chains import RetrievalQA
from langchain.retrievers import EnsembleRetriever
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI

from langchain.retrievers import BM25Retriever
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers.document_compressors import CohereRerank
from langchain.schema import Document

In [33]:
documents = [
    Document(page_content="RAG stands for Retrieval-Augmented Generation. It combines retrieval with LLMs."),
    Document(page_content="RAG improves factual accuracy and reduces hallucinations in language models."),
    Document(page_content="RAG is useful in AI assistants, support bots, and knowledge base querying."),
]

In [34]:
embedding_model = OpenAIEmbeddings()
llm = OpenAI(temperature=0)

  llm = OpenAI(temperature=0)


In [35]:
vector_store = FAISS.from_documents(documents, embedding_model)


In [41]:
bm25_retriever = BM25Retriever.from_documents(documents)

In [42]:
ensemble_retriever = EnsembleRetriever(
    retrievers=[vector_store.as_retriever(), bm25_retriever],
    weights=[0.5, 0.5]
)

In [43]:
multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=ensemble_retriever,
    llm=llm
)

In [39]:
print("The End")

The End
