In [1]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from pathlib import Path
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
import os
from langchain.document_loaders import  PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

# Load environment variables from a .env file
load_dotenv()
api_key = os.getenv("NOMIC_API_KEY")
url = os.getenv("base_url")
doc_path = os.getenv("pdf_coop")

In [2]:
# path = "E:/RAG_Project/data/Understanding_Climate_Change.pdf"
loader = PyPDFLoader(doc_path)
documents = loader.load()
# documents = documents[:10]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(documents)

In [3]:
splits[7].page_content

'options: individual, joint , trust , \nlegaliz ed c ooper atives, and or gani-\nzations. \nUnrestricted deposits and with-\ndrawals, subjec t only t o regulations \nimposed b y relevant author ities.\nConvenienc e of se amless tr ansac -\ntions thr ough v arious c hannels, \nincluding A TM/POS car ds, int ernet \nbanking, and mobile ser vices.\nZero transac tion f ees, pr oviding our \ncust omer s with a c ost-effective \nand hassle -free b anking e xper i-\nence.\nPRODUC TS\nIs an int erest-bearing option designed t o cat er \nto a div erse range of cust omer s with a modest \ninitial deposit of just Bir r 50. ORDINAR Y \nPRODUC T OVERVIEW:SAVING A CCOUNT'

In [4]:
import re
document_texts = [doc.page_content for doc in documents]
def clean_text(page_text):
    # Example regex to remove header/footer by matching patterns (you can adjust based on the structure)
    page_text = re.sub(r"Header Pattern.*\n", "", page_text)
    page_text = re.sub(r"Footer Pattern.*\n", "", page_text)
    
    # Remove multiple spaces
    page_text = re.sub(r"\s+", " ", page_text)
    
    return page_text

def fix_broken_words(text):
    # Fix spaces between letters that should not have them (e.g., 'Preparedb y' -> 'Prepared by')
    # Handle capital letters separating words (e.g., 'DevelopmentT eam' -> 'Development Team')
    
    # First, fix cases where letters are unnecessarily split by spaces
    text = re.sub(r"(\w)\s+(\w)", r"\1\2", text)
    
    # Then handle cases where there are multiple capital letters or titles (e.g., 'DevelopmentT eam')
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
    
    # Fix where spaces are missing after punctuation (e.g., 'T eamMAY,2024' -> 'Team MAY, 2024')
    text = re.sub(r"([.,!?])(\w)", r"\1 \2", text)
    
    # Additional custom fixes for common PDF issues
    text = re.sub(r"(\d)([A-Z])", r"\1 \2", text)  # Handle numbers followed by words (e.g., '2024PRODUCT' -> '2024 PRODUCT')
    
    return text
cleaned_pages = [clean_text(page) for page in document_texts]
fixed_pages = [fix_broken_words(page) for page in cleaned_pages]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# splits = text_splitter.split_documents(documents)
splits = text_splitter.split_text("\n".join(fixed_pages))
splits[0]


'Preparedb y: Research & Development T eam MAY, 2024 PRODUCT CATALOG'

In [8]:
import nltk
import os

# Manually set the NLTK data path
nltk.data.path.append("/home/name-1/nltk_data")

# Tokenize and fix text using the word tokenizer
def nltk_fix_broken_words(text):
    tokens = nltk.word_tokenize(text)
    corrected_text = " ".join(tokens)
    return corrected_text

# Apply the function to clean the text
fixed_pages = [nltk_fix_broken_words(page) for page in cleaned_pages]

# Now split the fixed text using LangChain's text splitter
splits = text_splitter.split_text("\n".join(fixed_pages))

# Check the result
print(splits[0])

In [7]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_nomic import NomicEmbeddings

embeddings =  NomicEmbeddings(model="nomic-embed-text-v1.5",)
index = faiss.IndexFlatL2(len(embeddings.embed_query(" ")))
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

In [8]:
for split in splits:
    vector_store.add_documents(documents=split)

AttributeError: 'tuple' object has no attribute 'id'

In [14]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

retriever = vector_store.as_retriever()
llm = ChatOpenAI(
                model="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF",
                base_url=url,
                api_key="lm-studio"
            )
# Set up system prompt
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
    
])

# Create the question-answer chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [15]:
answer= rag_chain.invoke({"input": "which company does sheryl Baxter work for?"})
answer['answer']

'Sheryl Baxter works for Rasmussen Group.'

In [16]:
# Sheryl
answer= rag_chain.invoke({"input": "what is  subscription date sheryl Baxter?"})
answer['answer']

"Sheryl Baxter's subscription date is 2020-08-24. She has another entry with a different last name, Meyers, but the subscription date for Sheryl Baxter is this one."