In [1]:
from dotenv import load_dotenv
import os
from PyPDF2 import PdfReader
import docx
from pptx import Presentation

In [153]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain_community.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document
from langchain.chains import AnalyzeDocumentChain

In [154]:
load_dotenv()

True

In [155]:
#Text Loaders 
# Functions to read different file types
def read_pdf(file_path):
    with open(file_path, "rb") as file:
        pdf_reader = PdfReader(file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            text += pdf_reader.pages[page_num].extract_text()
    return text

def read_word(file_path):
    doc = docx.Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

def read_txt(file_path):
    with open(file_path, "r") as file:
        text = file.read()
    return text

def read_pptx(file_path):
    prs = Presentation(file_path)
    text = ""
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text += shape.text + "\n"
    return text

def read_documents_from_directory(directory):
    combined_text = ""
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if filename.endswith(".pdf"):
            combined_text += read_pdf(file_path)
        elif filename.endswith(".docx"):
            combined_text += read_word(file_path)
        elif filename.endswith(".txt"):
            combined_text += read_txt(file_path)
        elif filename.endswith(".pptx"):
            combined_text += read_pptx(file_path)
    return combined_text

In [156]:
train_directory = 'train_files/'
text = read_documents_from_directory(train_directory)

In [157]:
# split into chunks
char_text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)

In [158]:
text_chunks = char_text_splitter.split_text(text)

In [159]:
# create embeddings
# Using HuggingFace embeddings instead of OpenAI
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
docsearch = FAISS.from_texts(text_chunks, embeddings)

In [173]:
# Create a prompt template for the LLM
prompt_template = """
Given the following documents, answer the question as best as you can.

Documents:
{documents}

Question: {question}
Answer:
"""

In [174]:

# Using HuggingFaceHub for the LLM
llm = HuggingFaceHub(
    repo_id="google/flan-t5-large",  # Choose an appropriate model
    model_kwargs={"temperature": 0.5, "max_length": 512},
    huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN")
)


In [175]:
# Create a PromptTemplate using the defined prompt
prompt = PromptTemplate(input_variables=["documents", "question"], template=prompt_template)

In [163]:
# Create the LLMChain
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [164]:
# Create the StuffDocumentsChain with the LLMChain
chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="documents"  # Tell it which input is for the documents
)

In [165]:
# Example of how to run the chain
query = "What are the Main DevOps Practices"

In [166]:
# Assuming you have already created a docsearch (FAISS vector store) for similarity search
docs = docsearch.similarity_search(query)

In [167]:
# Debug print to ensure docs is a list of strings
print(docs)

[Document(id='580f3643-918d-4000-92f4-4e6db74b1f81', metadata={}, page_content='Silos\nEveryones shares the ownership of \nproduction and information is \nshared among everyone\nMeasure Everything\nApplication, systems monitoring \nand metrics etc... Implement Gradual \nChanges\nFrequent deployments, frequent \ndeterministic releases in small \nchunks which can be rolled backAccept Failure as Normal\nBlameless PMs/ RCA. Risk taking \nmindset.\nLeverage Tooling and \nAutomation\nAutomate and reduce manual \nwork as much as possibleKey Areas in DevOpsDevOps \nPractices\n•Continuous Integration (CI) -Software development practice where developers \nregularly merge their code changes into a central repository, after which automated \nbuilds and tests are run. \n•Continuous Delivery (CD) -Software development practice where code changes are \nautomatically built, tested, and prepared for a release to production (automated \ncode change deployment to staging/ pre -production system). \n•Cont

In [168]:
# Wrap each document content into a Document object
docs = [Document(page_content=doc.page_content) for doc in docs]

In [169]:
# Now invoke the chain with the wrapped Document objects
response = chain.invoke({
    "input_documents": docs,  # Pass the Document objects directly
    "question": query
})



ValueError: Model 'google/flan-t5-large' doesn't support task 'text2text-generation'.