In [None]:
%%capture
# Install the notebook's runtime dependencies; %%capture hides pip's output for this cell.
# NOTE(review): no versions are pinned and langchain is installed twice (here and via -U below),
# so a fresh runtime may resolve different releases than the ones this notebook was written for.
!pip install --upgrade gspread
!pip install langchain
!pip install langchain_community
!pip install accelerate
!pip install transformers
!pip install bitsandbytes
!pip install sentence_transformers
!pip install flash_attn
!pip install xformers
!pip install chromadb
!pip install -U langchain langchain-chroma

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required for the meta-llama checkpoint loaded in a later cell — confirm.
notebook_login()

In [None]:
from google.colab import auth
# Authenticate the Colab user; later cells obtain Google credentials via google.auth.default().
auth.authenticate_user()

In [None]:
import gspread
from oauth2client.client import GoogleCredentials

In [None]:
# Import necessary libraries
from google.colab import auth
import gspread
from google.auth import default
# Earlier oauth2client-based authorisation, kept for reference only:
# gc = gspread.authorize(GoogleCredentials.get_application_default())
# `creds` is reused by later cells to build the Drive, Sheets and Slides API clients,
# so this cell has to run before them.
creds, _ = default()
gc = gspread.authorize(creds)

In [None]:
from google.colab import drive

# Mount Google Drive under /content/drive (DATA_PATH in a later cell points inside this mount).
drive.mount('/content/drive')

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings  # Import HuggingFaceEmbeddings

def get_embedding_function():
    """Build the embedding model shared by the indexing and query code.

    Returns:
        HuggingFaceEmbeddings: an embeddings wrapper configured with the
        ``sentence-transformers/LaBSE`` model name.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/LaBSE")

In [None]:
# Constants for paths
# CHROMA_PATH is passed to Chroma as persist_directory and removed by clear_database().
CHROMA_PATH = "chroma"
# NOTE(review): DATA_PATH is not referenced by any cell visible in this notebook — confirm it is still needed.
DATA_PATH = "/content/drive/MyDrive/"  # Update the path to your desired folder

In [None]:
import argparse
import os
import shutil
import sys
import time
from google.colab import auth
import gspread
from google.auth import default
from google.colab import drive
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
import numpy as np

# Initialize Google Drive and Sheets services using authenticated credentials.
# `creds` is created in the earlier authentication cell, so that cell must have run first.
# Both clients are module-level globals used by the functions defined below.
drive_service = build('drive', 'v3', credentials=creds)
sheets_service = build('sheets', 'v4', credentials=creds)

def main():
    """Build or refresh the Chroma store from the user's Google Drive files.

    Recognises a single optional ``--reset`` flag in ``sys.argv``; when it is
    present the on-disk database is cleared first. Afterwards the Drive
    documents are loaded, split into chunks and written to Chroma.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--reset", action="store_true", help="Reset the database.")

    # Jupyter/IPython put their own entries into sys.argv; only tokens that
    # start with "--reset" are handed to argparse.
    reset_tokens = list(filter(lambda token: token.startswith("--reset"), sys.argv))
    options = cli.parse_args(reset_tokens)

    if options.reset:
        print("✨ Clearing Database")
        clear_database()

    # Create (or update) the data store: load -> split -> store.
    add_to_chroma(split_documents(load_google_documents()))

def calculate_chunk_ids(chunks):
    """Assign a unique ``"<source>:<page>:<index>"`` ID to every chunk.

    The index counts chunks per ``source``/``page`` pair, starting at 0. A
    counter is kept for every pair, so the IDs stay unique even when chunks of
    the same page are not adjacent in ``chunks``.

    Args:
        chunks: iterable of objects exposing a mutable ``metadata`` dict.
            ``metadata["source"]`` is read (``None`` when absent) and
            ``metadata["page"]`` defaults to 1.

    Returns:
        The same ``chunks`` object; each chunk's ``metadata["id"]`` is
        overwritten in place.
    """
    next_index_by_page = {}
    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page", 1)
        page_id = f"{source}:{page}"

        chunk_index = next_index_by_page.get(page_id, 0)
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"
        next_index_by_page[page_id] = chunk_index + 1
    return chunks


def load_google_documents():
    """Load every Google Doc, Sheet and Slides file visible to ``drive_service``.

    The Drive listing is paginated: pages of 30 files are requested until the
    response no longer carries a ``nextPageToken``, so files beyond the first
    page are no longer silently ignored.

    Returns:
        list[Document]: one document per file whose content could be fetched.
        ``metadata["id"]`` is the Drive file ID and ``metadata["source"]`` is
        ``"<file name>:<file id>"``. Files for which
        ``fetch_content_from_google`` returns a falsy value are skipped.

    Exceptions raised by the Drive listing or by ``fetch_content_from_google``
    are not caught here.
    """
    query = (
        "mimeType='application/vnd.google-apps.spreadsheet' or "
        "mimeType='application/vnd.google-apps.document' or "
        "mimeType='application/vnd.google-apps.presentation'"
    )

    files = []
    page_token = None
    while True:
        results = drive_service.files().list(
            q=query,
            pageSize=30,
            pageToken=page_token,  # None on the first request
            fields="nextPageToken, files(id, name, mimeType)",
        ).execute()
        files.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break

    documents = []
    if not files:
        print('No Google Sheets, Google Docs, or Google Slides found.')
        return documents

    print('Google Sheets, Google Docs, and Google Slides:')
    for file in files:
        file_id = file['id']
        file_name = file['name']
        mime_type = file['mimeType']
        print(f"Name: {file_name}, ID: {file_id}, Type: {mime_type}")

        # Fetch the content based on the file type.
        content = fetch_content_from_google(file_id, mime_type)
        if content:
            # Source is "filename:fileID"; the Drive file ID is also kept under "id".
            source = f"{file_name}:{file_id}"
            documents.append(Document(page_content=content, metadata={"id": file_id, "source": source}))

    return documents

def _quote_sheet_title(title):
    """Return ``title`` single-quoted (inner quotes doubled) for use in an A1 range."""
    return "'" + title.replace("'", "''") + "'"


def fetch_content_from_google(file_id, mime_type):
    """Fetch the text content of a Google Doc, Sheet or Slides file.

    Args:
        file_id: Drive file ID.
        mime_type: Drive MIME type of the file; selects the extraction path.

    Returns:
        str | None:
        * Docs: the ``text/plain`` export decoded as UTF-8.
        * Sheets: cells ``A1:Z1000`` of the first sheet, cells joined with
          ``", "`` and rows joined with newlines. ``None`` when the metadata
          or the values cannot be read.
        * Slides: the concatenated text runs of all shapes.
        * Any other MIME type: ``None``.

    ``HttpError`` is handled only on the Sheets path (502/503/504 are retried
    up to three times with exponential back-off). Errors from the Docs export
    and the Slides API are not caught here.
    """
    if mime_type == 'application/vnd.google-apps.document':
        request = drive_service.files().export(fileId=file_id, mimeType='text/plain')
        return request.execute().decode('utf-8')

    if mime_type == 'application/vnd.google-apps.spreadsheet':
        try:
            sheet_metadata = sheets_service.spreadsheets().get(spreadsheetId=file_id).execute()
        except HttpError as e:
            print(f"Failed to retrieve metadata for Google Sheet: {e}")
            return None

        sheets = sheet_metadata.get('sheets', [])
        sheet_name = sheets[0]['properties']['title'] if sheets else 'Sheet1'
        print(f"Using sheet name: {sheet_name}")

        # Titles containing spaces or punctuation must be single-quoted in A1
        # notation; quoting is also valid for plain titles.
        cell_range = f"{_quote_sheet_title(sheet_name)}!A1:Z1000"

        max_retries = 3
        for attempt in range(max_retries):
            try:
                result = sheets_service.spreadsheets().values().get(
                    spreadsheetId=file_id, range=cell_range
                ).execute()
            except HttpError as e:
                if e.resp.status not in (502, 503, 504):
                    print(f"Failed to read Google Sheet: {e}")
                    break
                print(f"Temporary error ({e.resp.status}). Retrying... ({attempt+1}/{max_retries})")
                # No point in waiting after the last attempt has already failed.
                if attempt + 1 < max_retries:
                    time.sleep(2 ** attempt)
            else:
                values = result.get('values', [])
                return "\n".join(", ".join(str(cell) for cell in row) for row in values)
        return None

    if mime_type == 'application/vnd.google-apps.presentation':
        slides_service = build('slides', 'v1', credentials=creds)
        presentation = slides_service.presentations().get(presentationId=file_id).execute()
        text_runs = []
        for slide in presentation.get('slides', []):
            for element in slide.get('pageElements', []):
                if 'shape' in element and 'text' in element['shape']:
                    # .get() keeps a shape without textElements from raising KeyError.
                    for text_element in element['shape']['text'].get('textElements', []):
                        if 'textRun' in text_element:
                            text_runs.append(text_element['textRun']['content'])
        return "".join(text_runs)

    print(f"Unsupported mime type: {mime_type}")
    return None

def split_documents(documents):
    """Break documents into overlapping chunks.

    Uses ``RecursiveCharacterTextSplitter`` with ``chunk_size=800`` and
    ``chunk_overlap=80``.

    Args:
        documents: the documents to split.

    Returns:
        Whatever the splitter's ``split_documents`` returns for ``documents``.
    """
    splitter_options = {"chunk_size": 800, "chunk_overlap": 80}
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.split_documents(documents)
    return pieces

def _is_orphaned_chunk_id(chunk_id, drive_file_ids):
    """Return True when no ':'-separated segment of ``chunk_id`` is a known Drive file ID."""
    return drive_file_ids.isdisjoint(chunk_id.split(":"))


def add_to_chroma(chunks):
    """Add new chunks to the Chroma DB and remove chunks of files no longer in Drive.

    Steps:
      1. Open the store at ``CHROMA_PATH`` and read the IDs already present.
      2. Assign chunk IDs via ``calculate_chunk_ids`` and add only chunks whose
         ID is not stored yet.
      3. Ask Drive for the current file IDs and delete every previously stored
         chunk whose ID contains none of them as a ``:``-separated segment.

    Matching whole segments (instead of taking the second ``:`` field and then
    substring-matching) keeps the check correct for file names that contain
    ``:`` and for IDs whose file ID is the first field. It also prevents a
    short mis-parsed token from matching unrelated chunk IDs.

    Args:
        chunks: split documents; each must expose a ``metadata`` dict.
    """
    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
    )

    # IDs present before this run; only these are candidates for orphan removal.
    existing_items = db.get(include=[])
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Work out which chunks are not stored yet.
    chunks_with_ids = calculate_chunk_ids(chunks)
    new_chunks = [chunk for chunk in chunks_with_ids if chunk.metadata["id"] not in existing_ids]

    if new_chunks:
        print(f"👉 Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        db.add_documents(new_chunks, ids=new_chunk_ids)
        db.persist()
        print("✅ Documents added to Chroma DB.")
    else:
        print("✅ No new documents to add")

    existing_after_addition = db.get(include=[])
    print(f"Number of documents in DB after addition: {len(existing_after_addition['ids'])}")

    # Compare stored chunk IDs against the files currently in Drive.
    drive_file_ids = set(get_google_drive_file_ids())
    print(f"Google Drive IDs: {drive_file_ids}")

    orphaned_doc_ids_in_db = sorted(
        chunk_id for chunk_id in existing_ids
        if _is_orphaned_chunk_id(chunk_id, drive_file_ids)
    )
    print(f"Identified orphaned IDs: {orphaned_doc_ids_in_db}")

    if orphaned_doc_ids_in_db:
        print(f"🗑 Found orphaned documents: {len(orphaned_doc_ids_in_db)}")
        delete_orphaned_documents(db, orphaned_doc_ids_in_db)
    else:
        print("✅ No orphaned documents to remove")


def get_all_ids(db):
    """
    Retrieves all document IDs from the Chroma database.

    The IDs are read from the ``"ids"`` field of ``db.get(include=[])``, i.e.
    the identifiers the store itself uses (and that ``db.delete(ids=...)``
    expects). This avoids loading every document and metadata record, and it
    no longer fails when an entry's metadata lacks an ``"id"`` key.

    Parameters:
        db (Chroma): The Chroma database instance.

    Returns:
        list: A list of document IDs in the Chroma DB. An empty list is
        returned (after printing the error) when the lookup fails.
    """
    try:
        stored = db.get(include=[])
        return list(stored["ids"])

    except Exception as e:
        # Best-effort helper: report the problem and let callers continue.
        print(f"Error occurred while fetching IDs: {e}")
        return []

def find_document_by_id(db, doc_id):
    """
    Finds all entries in the Chroma DB whose metadata ``"id"`` equals ``doc_id``.

    Entries whose metadata is missing or has no ``"id"`` key are skipped, so a
    single incomplete record no longer makes the whole lookup return nothing.

    Parameters:
        db (Chroma): The Chroma database instance.
        doc_id (str): The document ID to search for.

    Returns:
        list: The document texts of the matching entries, or an empty list if
        none match or the lookup fails (the error is printed).
    """
    try:
        # The store is read in full and filtered client-side.
        result = db.get(include=["documents", "metadatas"])

        return [
            doc for doc, meta in zip(result['documents'], result['metadatas'])
            if meta and 'id' in meta and meta['id'] == doc_id
        ]

    except Exception as e:
        print(f"Error occurred while fetching document with ID {doc_id}: {e}")
        return []

def delete_orphaned_documents(db, orphaned_ids):
    """Remove the given entry IDs from the Chroma DB and persist the change.

    Args:
        db: store exposing ``delete(ids=...)`` and ``persist()``.
        orphaned_ids: collection of IDs to delete; when it is empty nothing is
            deleted and only a message is printed.
    """
    if not orphaned_ids:
        print("✅ No orphaned documents to delete.")
        return

    total = len(orphaned_ids)
    print(f"🗑 Deleting {total} orphaned documents.")
    ids_to_remove = list(orphaned_ids)
    db.delete(ids=ids_to_remove)
    db.persist()
    print("✅ Orphaned documents deleted from Chroma DB.")


def get_google_drive_file_ids():
    """Retrieve the IDs of all Google Docs, Sheets and Slides files in Drive.

    The listing is paginated: requests are repeated with the returned
    ``nextPageToken`` until no further page is reported, so the result is not
    limited to the first page of files.

    Returns:
        set[str]: Drive file IDs (a set for quick membership tests).
    """
    query = (
        "mimeType='application/vnd.google-apps.spreadsheet' or "
        "mimeType='application/vnd.google-apps.document' or "
        "mimeType='application/vnd.google-apps.presentation'"
    )

    file_ids = set()
    page_token = None
    while True:
        results = drive_service.files().list(
            q=query,
            pageToken=page_token,  # None on the first request
            fields="nextPageToken, files(id)",
        ).execute()
        file_ids.update(file['id'] for file in results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            return file_ids

def clear_database():
    """Delete the on-disk Chroma directory at ``CHROMA_PATH`` if it exists.

    Prints whether the directory was found, followed by a final confirmation
    message that is emitted in both cases.
    """
    path_missing = not os.path.exists(CHROMA_PATH)
    if path_missing:
        print("Chroma path not found")
    else:
        shutil.rmtree(CHROMA_PATH)
        print("Chroma path found")
    print("✨ Chroma database cleared.")

# Script-style entry point: main() runs whenever this code executes with __name__ == "__main__".
# NOTE(review): in a notebook kernel that is normally the case, so executing this cell
# immediately loads Drive files and updates the Chroma store — confirm that is intended.
if __name__ == "__main__":
    main()


In [None]:
from datetime import datetime, timedelta

def get_one_day_back_timestamp():
    """Return the UTC instant 24 hours ago as an RFC 3339 string.

    The value is computed from the current UTC time (not the machine's local
    time) and formatted with millisecond precision and a trailing ``Z``, e.g.
    ``2024-01-01T12:34:56.789Z``. The ``Z`` suffix is therefore truthful and
    the string has a fixed width.

    Returns:
        str: timestamp of the form ``YYYY-MM-DDTHH:MM:SS.mmmZ``.
    """
    # Local import: the surrounding cell only imports datetime and timedelta.
    from datetime import timezone

    one_day_back = datetime.now(timezone.utc) - timedelta(days=1)
    # isoformat() renders the UTC offset as "+00:00"; swap it for "Z".
    return one_day_back.isoformat(timespec='milliseconds').replace('+00:00', 'Z')

def _parse_rfc3339_utc(timestamp):
    """Parse ``YYYY-MM-DDTHH:MM:SS[.fraction]`` with an optional trailing ``Z`` into an aware UTC datetime."""
    # Local import: the surrounding cell only imports datetime and timedelta.
    from datetime import timezone

    text = timestamp[:-1] if timestamp.endswith('Z') else timestamp
    parsed = datetime.fromisoformat(text)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def count_updated_files_in_past_day():
    """Return the Drive files modified after ``get_one_day_back_timestamp()``.

    All Google Docs, Sheets and Slides files are listed (following
    ``nextPageToken`` until the listing is exhausted). Each file's
    ``modifiedTime`` is parsed and compared as a datetime, so the result does
    not depend on both timestamps having the same textual precision.

    Returns:
        list[tuple[str, str]]: ``(file_id, mime_type)`` for every file whose
        ``modifiedTime`` is later than the cutoff.
    """
    cutoff = _parse_rfc3339_utc(get_one_day_back_timestamp())

    query = (
        "mimeType='application/vnd.google-apps.spreadsheet' or "
        "mimeType='application/vnd.google-apps.document' or "
        "mimeType='application/vnd.google-apps.presentation'"
    )

    # Fetch all Google Drive files (Google Docs, Sheets, and Slides), page by page.
    files = []
    page_token = None
    while True:
        results = drive_service.files().list(
            q=query,
            pageToken=page_token,  # None on the first request
            fields="nextPageToken, files(id, name, mimeType, modifiedTime)",
        ).execute()
        files.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break

    if not files:
        print('No files found in Google Drive.')

    updated_files = [
        (file['id'], file['mimeType'])
        for file in files
        if _parse_rfc3339_utc(file['modifiedTime']) > cutoff
    ]

    print(f"Total files updated in the past 24 hours: {len(updated_files)}")
    return updated_files

def update_db_with_updated_files(db):
    """Refresh the Chroma DB entries of Drive files modified in the last 24 hours.

    For every file reported by ``count_updated_files_in_past_day``:
      1. the new content is fetched first; if nothing can be fetched the file
         is skipped and its existing entries are left untouched;
      2. the file name is looked up so the chunk source uses the same
         ``"<file name>:<file id>"`` format as ``load_google_documents``;
      3. stored entries whose ID has the file ID as a ``:``-separated segment
         are deleted;
      4. the new content is split into chunks and added.

    Any exception while processing one file is printed and the loop continues
    with the next file.

    Args:
        db: the Chroma store to update.
    """
    # Step 1: Get the list of files updated in the past 24 hours
    updated_files = count_updated_files_in_past_day()

    if not updated_files:
        print("✅ No updated files to process.")
        return

    print(f"🔄 Updating Chroma DB with {len(updated_files)} updated files.")

    # Step 2: For each updated file, replace its entries in the Chroma DB
    for file_id, mime_type in updated_files:
        try:
            # Fetch before deleting, so a failed download never leaves the
            # file without any entries in the store.
            print(f"📄 Fetching updated content for file ID: {file_id}")
            file_content = fetch_content_from_google(file_id, mime_type)

            if file_content is None:
                print(f"⚠ No content fetched for file ID: {file_id}. Skipping...")
                continue

            # Same source format as the initial load, so both code paths
            # generate identical chunk IDs for the same file.
            file_name = drive_service.files().get(fileId=file_id, fields="name").execute()["name"]
            source = f"{file_name}:{file_id}"

            # Remove existing entries for the file
            print(f"🗑 Removing old entries for file ID: {file_id}")
            all_ids = get_all_ids(db)
            stale_ids = [entry_id for entry_id in all_ids if file_id in entry_id.split(":")]
            delete_orphaned_documents(db, stale_ids)

            # Convert the content into a Document object with metadata
            document = Document(page_content=file_content, metadata={"id": file_id, "source": source})

            # Split the document content into chunks
            print(f"✂ Splitting content into chunks for file ID: {file_id}")
            chunks = split_documents([document])

            # Add the updated chunks to Chroma
            print(f"➕ Adding updated content for file ID: {file_id} into Chroma DB")
            add_updated_file_to_chroma(db, chunks)

            print(f"✅ Successfully updated file ID: {file_id} in Chroma DB.")

        except Exception as e:
            # Best effort: one failing file must not stop the remaining updates.
            print(f"❌ Error processing file ID {file_id}: {e}")

    print("🔄 Database update process completed.")

def add_updated_file_to_chroma(db,chunks):
    """Add the chunks that are not yet stored to an already opened Chroma DB.

    Chunk IDs are assigned with ``calculate_chunk_ids``; chunks whose ID is
    already present in ``db`` are skipped. The store is persisted after a
    successful addition and the entry counts before and after are printed.

    Args:
        db: store exposing ``get``, ``add_documents`` and ``persist``.
        chunks: split documents, each exposing a ``metadata`` dict.
    """
    # IDs currently in the store.
    ids_in_db = set(db.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(ids_in_db)}")

    # Keep only chunks whose freshly computed ID is unknown to the store.
    pending_chunks = []
    pending_ids = []
    for candidate in calculate_chunk_ids(chunks):
        candidate_id = candidate.metadata["id"]
        if candidate_id in ids_in_db:
            continue
        pending_chunks.append(candidate)
        pending_ids.append(candidate_id)

    if not pending_chunks:
        print("✅ No new documents to add")
    else:
        print(f"👉 Adding new documents: {len(pending_chunks)}")
        db.add_documents(pending_chunks, ids=pending_ids)
        db.persist()
        print("✅ Documents added to Chroma DB.")

    # Report the store size after the (possible) addition.
    refreshed = db.get(include=[])
    print(f"Number of documents in DB after addition: {len(refreshed['ids'])}")

In [None]:
# Open the persisted store at CHROMA_PATH and refresh the entries of files
# modified in the last 24 hours. Relies on `Chroma`, `CHROMA_PATH` and the
# helper functions defined in earlier cells.
db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
    )
update_db_with_updated_files(db)


Total files updated in the past 24 hours: 1
🔄 Updating Chroma DB with 1 updated files.
🗑 Removing old entries for file ID: 1A_6ZQ0Y_tpvhOMgWSkAu4rwZNSKdel5WbSrGyFRXxdg
🗑 Deleting 6 orphaned documents.
✅ Orphaned documents deleted from Chroma DB.
📄 Fetching updated content for file ID: 1A_6ZQ0Y_tpvhOMgWSkAu4rwZNSKdel5WbSrGyFRXxdg
✂ Splitting content into chunks for file ID: 1A_6ZQ0Y_tpvhOMgWSkAu4rwZNSKdel5WbSrGyFRXxdg
➕ Adding updated content for file ID: 1A_6ZQ0Y_tpvhOMgWSkAu4rwZNSKdel5WbSrGyFRXxdg into Chroma DB
Number of existing documents in DB: 182
👉 Adding new documents: 6
✅ Documents added to Chroma DB.
Number of documents in DB after addition: 188
✅ Successfully updated file ID: 1A_6ZQ0Y_tpvhOMgWSkAu4rwZNSKdel5WbSrGyFRXxdg in Chroma DB.
🔄 Database update process completed.


In [None]:
from langchain.vectorstores import Chroma

# Load the existing Chroma database (no embedding function is passed; this cell only reads stored entries)
db = Chroma(persist_directory=CHROMA_PATH)

# Fetch all documents from the Chroma database
# NOTE(review): the accepted `include` values may differ between Chroma versions — confirm for the installed one.
all_documents = db.get(include=["documents", "metadatas"])
# Print the contents of each document and their metadata.
# This prints every stored entry, so the output grows with the size of the store.
for doc, metadata in zip(all_documents["documents"], all_documents["metadatas"]):
    print(f"Document: {doc}")  # This will show the actual document content
    print(f"Metadata: {metadata}")  # This will show the associated metadata
    print("-" * 40)


In [None]:
from langchain.vectorstores.chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from transformers import LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig
import torch

# Set up BitsAndBytesConfig for 4-bit quantization
# (NF4 quantization type, double quantization enabled, float16 compute dtype).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4'
)

# Load the LLaMA 2 tokenizer and 4-bit quantized model.
# `tokenizer` and `model` are module-level globals used by query_rag() below.
model_name = 'meta-llama/Llama-2-7b-hf'
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Define constants
# PROMPT_TEMPLATE has two placeholders, {context} and {question}, which query_rag() fills in.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

def query_rag(query_text: str):
    """Answer ``query_text`` with retrieval-augmented generation.

    The five most similar chunks are retrieved from the Chroma store at
    ``CHROMA_PATH``, inserted into ``PROMPT_TEMPLATE`` together with the
    question, and passed to the module-level ``model`` for sampling-based
    generation (up to 200 new tokens).

    Only the newly generated tokens are decoded. ``generate`` returns the
    prompt tokens followed by the continuation, so decoding the full sequence
    would repeat the whole prompt (context included) in the answer.

    Args:
        query_text: the user's question.

    Returns:
        tuple[str, dict]: the generated answer text and
        ``{"sources": [...]}`` holding the ``"id"`` metadata of each retrieved
        chunk (``None`` where a chunk has no ``"id"``).
    """
    # Prepare the DB
    embedding_function = get_embedding_function()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB for relevant context
    results = db.similarity_search_with_score(query_text, k=5)

    # Format the context from the retrieved documents
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    # Tokenize and process the prompt
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(model.device)
    prompt_length = input_ids.shape[1]

    # Generate response using the model with the sampling settings below
    response_ids = model.generate(
        input_ids,
        max_new_tokens=200,  # Upper bound on the length of the generated answer
        temperature=0.7,     # Control randomness in generation
        top_k=50,            # Consider only top 50 tokens during sampling
        top_p=0.9,           # Use nucleus sampling
        do_sample=True       # Enable sampling
    )

    # Skip the echoed prompt: keep only the tokens produced after it.
    generated_ids = response_ids[0][prompt_length:]
    response_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Gather metadata and return results
    sources = [doc.metadata.get("id", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)
    return response_text, {"sources": sources}


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [None]:
# Example usage in a Jupyter notebook cell.
# query_rag() prints the formatted answer and returns (response_text, {"sources": [...]}).
input_query = "Your query for the RAG goes here"
response, metadata = query_rag(input_query)