In [1]:
!pip install langchain langchain-community langchainhub chromadb pypdf sentence-transformers langchain-groq




In [2]:
import os
from google.colab import files
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq

In [3]:
os.environ["GROQ_API_KEY"] = "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"

In [7]:
def load_and_process_document():
    """
    Uploads and processes a valid PDF or TXT file. Retries until successful.
    """
    global qa_chain

    while True:
        try:
            print("📤 Please upload a .pdf or .txt file:")
            uploaded = files.upload()
            file_path = list(uploaded.keys())[0]
            print(f"📁 Uploaded: {file_path}")

            # Validate file extension
            if file_path.endswith(".pdf"):
                loader = PyPDFLoader(file_path)
            elif file_path.endswith(".txt"):
                loader = TextLoader(file_path)
            else:
                raise ValueError("❌ Invalid file type. Only .pdf and .txt are supported.")

            # Load and chunk
            documents = loader.load()
            splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
            chunks = splitter.split_documents(documents)
            print(f"📄 Loaded and split into {len(chunks)} chunks.")

            # Embed
            embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
            vectordb = Chroma.from_documents(chunks, embedding=embedding_model, persist_directory="./db")
            retriever = vectordb.as_retriever()

            # LLM
            llm = ChatGroq(model="llama3-70b-8192")

            # Chain
            qa_chain = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type="stuff",
                retriever=retriever,
                return_source_documents=True
            )

            print("✅ Document processed and ready for Q&A.")
            break  # exit loop on success

        except ValueError as ve:
            print(str(ve))
            print("🔁 Let's try uploading again...\n")
        except Exception as e:
            print("⚠️ Unexpected error:", str(e))
            print("🔁 Try uploading a different file.\n")



In [8]:
load_and_process_document()

📤 Please upload a .pdf or .txt file:


Saving 1O6A5586 copy 2.jpg to 1O6A5586 copy 2.jpg
📁 Uploaded: 1O6A5586 copy 2.jpg
❌ Invalid file type. Only .pdf and .txt are supported.
🔁 Let's try uploading again...

📤 Please upload a .pdf or .txt file:


Saving 1O6A5586 copy 2.jpg to 1O6A5586 copy 2 (1).jpg
📁 Uploaded: 1O6A5586 copy 2 (1).jpg
❌ Invalid file type. Only .pdf and .txt are supported.
🔁 Let's try uploading again...

📤 Please upload a .pdf or .txt file:


Saving AccountStatement_NOV_2023.pdf to AccountStatement_NOV_2023.pdf
📁 Uploaded: AccountStatement_NOV_2023.pdf
📄 Loaded and split into 11 chunks.


  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Document processed and ready for Q&A.


In [9]:
while True:
    q = input("Ask a question ('new' to upload new file, 'exit' to quit): ")
    if q.lower() == "exit":
        print("👋 Exiting. Goodbye!")
        break
    elif q.lower() == "new":
        load_and_process_document()
    else:
        result = qa_chain({"query": q})
        print("📌 Answer:", result["result"])


Ask a question ('new' to upload new file, 'exit' to quit): hello


  result = qa_chain({"query": q})


📌 Answer: Hello! It looks like you have a bank statement from QNB ALAHLI. Is there something specific you'd like to know or discuss about this statement?
Ask a question ('new' to upload new file, 'exit' to quit): what is the name of the account owner 
📌 Answer: The name of the account owner is MARWAN MANSOUR ABDELMOTLEB MOHAMED.
Ask a question ('new' to upload new file, 'exit' to quit): how much money he have now 
📌 Answer: According to the statement, the current balance is 4,591.51 EGP (Egyptian Pounds) as of 30-11-2023.
Ask a question ('new' to upload new file, 'exit' to quit): what was his expenses\
📌 Answer: Based on the provided statement, I can identify the following expenses:

1. CARD PURCHASE on 19-11-2023: 30.00 EGP
2. Family_expenses (ETRANSFER_ALY ABDELAZEIM GAD A) on 16-11-2023: 5,000.00 EGP
3. Family_expenses (ETRANSFER_ALY ABDELAZEIM GAD A) on 28-11-2023: 5,000.00 EGP
4. RECOVER OF UNPAID FEES on 09-11-2023: 7.00 EGP
5. OVERDRAFT COMMISSION on 30-11-2023: 0.03 EGP
6. DEBI

Saving Additional-Resources.pdf to Additional-Resources.pdf
📁 Uploaded: Additional-Resources.pdf
📄 Loaded and split into 9 chunks.
✅ Document processed and ready for Q&A.
Ask a question ('new' to upload new file, 'exit' to quit): hello 
📌 Answer: Hello! It seems you're interested in Git. What would you like to know about Git?
Ask a question ('new' to upload new file, 'exit' to quit): what is this 
📌 Answer: I'm not sure what this is. It appears to be a list of three identical words "Git" and one word "Pull", but without more context, it's difficult to determine what it represents or what it's used for. It's possible that it's related to Git, which is a version control system, but I'd need more information to provide a more specific answer.
Ask a question ('new' to upload new file, 'exit' to quit): eexit
📌 Answer: I don't know what you mean by "eexit". Could you please clarify or rephrase your question?
Ask a question ('new' to upload new file, 'exit' to quit): exit
👋 Exiting. Goodbye!
