In [1]:
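# %pip install chromadb langchain-chroma langchain langchain-community langchain-text-splitters langchain-groq langchain-huggingface langchain-experimental transformers sentence-transformers open-clip-torch unstructured "unstructured[pdf]" pymupdf pillow scikit-learn matplotlib tqdm streamlit pandas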

## Import Required Libraries

In [1]:
import pandas as pd
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain.docstore.document import Document
import torch
from PIL import Image
import chromadb
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
from chromadb.utils.data_loaders import ImageLoader
import numpy as np
import time
from tqdm import tqdm
import os
from IPython.display import display
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from langchain_chroma import Chroma
import chromadb.utils.embedding_functions as embedding_functions
import uuid

## Initialize or Create Chromadb

In [2]:
# The Chroma database lives in a "data" folder under the current working directory.
current_dir = os.getcwd()
data_folder_path = os.path.join(current_dir, "data")

# Open a Chroma PersistentClient rooted at that folder.
client = chromadb.PersistentClient(path=data_folder_path)

In [3]:
# List every collection known to the client and print them.
collections = client.list_collections()
print("Collections:", collections)

Collections: [Collection(name=pdf_collection), Collection(name=text_collection), Collection(name=csv_collection)]


In [4]:
# # Print All documents from specific Collection
# # This notebook creates three collection objects (text_collection, csv_collection, image_collection); substitute whichever one you want to inspect.
# documents = text_collection.get()
# print("Documents in collection:", documents)

## Initialize all Embeddings and API Keys

In [5]:
# Text embedding model (sentence-transformers all-MiniLM-L6-v2); used for text-based data only.
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")




In [6]:
class HuggingFaceEmbeddingFunction:
    """Adapter that lets a LangChain embedding model be used as a Chroma embedding function.

    Chroma calls its embedding function with a *list* of texts and expects one
    vector per text in return, so list input is embedded as a batch. A bare
    string is still accepted and yields a single vector, as before.
    """

    def __init__(self, embedding_function):
        """Store the wrapped model; it must expose ``embed_documents(list_of_str)``."""
        self.embedding_function = embedding_function

    def __call__(self, input):
        """Embed ``input`` with the wrapped model.

        Args:
            input: Either a single string or a sequence of strings.

        Returns:
            For a single string, a 1-D numpy array (the legacy behaviour).
            For a sequence, a list with one vector (a list of floats) per
            text, in the same order; an empty sequence gives ``[]``.
        """
        if isinstance(input, str):
            vectors = self.embedding_function.embed_documents([input])
            return np.array(vectors[0])  # Ensure it's a numpy array

        texts = list(input)
        if not texts:
            # Nothing to embed; avoid calling the model with an empty batch.
            return []

        vectors = self.embedding_function.embed_documents(texts)
        # Plain Python floats keep the result independent of the model's array type.
        return [[float(value) for value in vector] for vector in vectors]

In [7]:
# Wrap the text embedding model so it can be passed to Chroma collections.
hugging_embedding = HuggingFaceEmbeddingFunction(embedding_function)

In [8]:
# Display the embedding model's configuration.
embedding_function


HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [9]:
# Embedding function for image data (OpenCLIP, default settings).
image_embedding_function = OpenCLIPEmbeddingFunction()

In [10]:
# Never hard-code the key in the notebook. Reuse GROQ_API_KEY when the
# environment already provides it; otherwise prompt for it without echoing.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass

    os.environ["GROQ_API_KEY"] = getpass("Groq API key: ")

 # Text Handling

In [11]:
# Name of the Chroma collection that receives the documents loaded in this section.
text_collection_name="pdf_collection"

In [12]:
# Open the collection if it already exists, otherwise create it, attaching the wrapped text embedder.
text_collection = client.get_or_create_collection(
    name=text_collection_name,
    embedding_function=hugging_embedding,
)

In [13]:
def custom_loader(file_path):
    """Pick a LangChain document loader for ``file_path`` based on its extension.

    Extension matching is case-insensitive. Text, CSV and PDF files get their
    dedicated loaders; every other file, images included, is handed to
    ``UnstructuredFileLoader``.

    Args:
        file_path: Path of the file to load.

    Returns:
        A loader instance for that file.
    """
    lowered = file_path.lower()
    if lowered.endswith(".txt"):
        return TextLoader(file_path, encoding="utf-8")
    if lowered.endswith(".csv"):
        return CSVLoader(file_path, encoding="utf-8")
    if lowered.endswith(".pdf"):
        return PyMuPDFLoader(file_path)
    # Images deliberately fall through to the generic loader: chromadb's
    # ImageLoader is a Chroma data loader, not a LangChain document loader, and
    # its constructor argument is a worker count rather than a file path.
    return UnstructuredFileLoader(file_path)

In [15]:
# Load the files in the "pdf" folder, letting custom_loader choose a loader per file.
loader = DirectoryLoader(
    "pdf",
    glob="./*.*",  # Match any file name that contains a dot
    loader_cls=custom_loader,
)

In [16]:
# Run the loader and keep the loaded documents.
documents=loader.load()

In [17]:
# print(documents)

In [18]:
# Splitter that cuts documents into chunks of at most 2000 units with a 100-unit overlap
# (units are characters with the splitter's default length function — TODO confirm).
text_splitter=CharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
    )

In [19]:
# Split the loaded documents into chunks; `text` holds the resulting chunk documents.
text=text_splitter.split_documents(documents)

In [20]:
# Number of chunks produced by the splitter.
len(text)

230

In [20]:
# documents[0:1]

In [21]:
# Parallel lists that will hold the Chroma payload: one ID, one text and one
# metadata dict per stored entry.
ids, contents, metadatas = [], [], []

In [22]:
# Build the Chroma payload from the split chunks in `text` rather than the
# unsplit `documents`, so the chunking configured above is what gets embedded.
# The lists are rebuilt from scratch here so that re-running this cell cannot
# append the same entries twice.
ids = []
contents = []
metadatas = []

for doc in text:
    content = doc.page_content  # Text of this chunk
    # basename copes with the platform's path separator, unlike split('/').
    file_name = os.path.basename(doc.metadata['source']).replace(" ", "_")
    # Several chunks share one source file, so a UUID suffix keeps every ID unique.
    doc_id = f"{file_name}_{uuid.uuid4()}"
    metadata = doc.metadata  # Reuse the loader/splitter metadata as-is

    ids.append(doc_id)
    contents.append(content)
    metadatas.append(metadata)


In [23]:
# contents

In [24]:
# Embed every entry of `contents` with the text embedding model.
embeddings = embedding_function.embed_documents(contents)

In [26]:
# embeddings

In [None]:
# Store the texts together with their IDs, metadata and the vectors computed above.
text_collection.add(
    ids=ids,
    documents=contents,
    metadatas=metadatas,
    embeddings=embeddings
)
print("Documents added successfully.")

In [None]:
# `query_texts` is a list of query strings, as the checks below expect;
# each entry produces its own result set.
query_texts = ["Why crispino cafe is not good?"]
print(type(query_texts))  # Should be a list
print(query_texts)  # Should be a flat list of strings



In [None]:
# Ask the text collection for the single closest entry to the query.
results = text_collection.query(
    query_texts=query_texts,
    n_results=1
)

In [None]:
 # Display the raw query result.
 results

#### Testing Embedding

In [None]:
# Project the document embeddings onto two principal components.
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

# Scatter plot of the 2-D projection, one point per embedded entry.
fig, ax = plt.subplots()
ax.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1])
ax.set_title("PCA of Document Embeddings")
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
plt.show()

# CSV Handling

In [None]:
# Name of the Chroma collection that receives the CSV rows.
csv_collection_name="csv_collection"

In [None]:
# Open the CSV collection if it exists, otherwise create it, with the wrapped text embedder.
csv_collection = client.get_or_create_collection(
    name=csv_collection_name,
    embedding_function=hugging_embedding,
)

In [None]:
# Number of records per batch when the CSV rows are split for insertion.
# NOTE(review): 5461 presumably mirrors Chroma's maximum batch size — confirm.
batch_size = 5461

In [None]:
# CSV file to ingest (path relative to the working directory).
csv_file_path = 'SamaaNews.csv'  

In [None]:
# df

In [None]:
# Running counter shared by every process_csv_file call in this kernel; it is
# only reset when this cell is executed again.
global_index = 0

def process_csv_file(csv_file_path):
    """Turn every row of a CSV file into a Chroma-ready (id, content, metadata) entry.

    Each row is rendered as the string form of its ``{column: value}`` dict.
    IDs look like ``<file stem>_doc_<n>``, where ``n`` comes from the
    module-level ``global_index`` counter. The counter keeps increasing across
    calls, so IDs never collide within a kernel session, but calling the
    function again yields different IDs for the same rows.

    Args:
        csv_file_path: Path of the CSV file to read.

    Returns:
        A tuple ``(ids, contents, metadatas)`` of three equally long lists.
    """
    global global_index

    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_file_path)

    # Strip only a trailing ".csv" extension (computed once, not per row).
    # A plain str.replace would also remove ".csv" from the middle of a name.
    file_name = os.path.basename(csv_file_path)
    file_stem = file_name[:-len(".csv")] if file_name.lower().endswith(".csv") else file_name

    ids = []
    contents = []
    metadatas = []

    for _, row in df.iterrows():
        doc_id = f"{file_stem}_doc_{global_index}"
        global_index += 1

        ids.append(doc_id)
        contents.append(str(row.to_dict()))  # Row data as a string
        metadatas.append({
            "document_id": doc_id,
            "source": csv_file_path,
        })

    return ids, contents, metadatas




In [None]:
# Build the Chroma payload for the configured CSV file.
ids, contents, metadatas = process_csv_file(csv_file_path)

In [None]:
# metadatas

In [None]:
# Current value of the shared row counter (rows processed so far in this kernel).
global_index

In [None]:
# Load the CSV explicitly: `df` otherwise exists only inside process_csv_file,
# so this cell would raise NameError on a fresh kernel.
df = pd.read_csv(csv_file_path)

# NOTE: this rebuilds ids/contents/metadatas with plain "doc_<row index>" IDs,
# replacing the values returned by process_csv_file earlier in the notebook.
ids = []
contents = []
metadatas = []

for index, row in df.iterrows():
    document_data = row.to_dict()
    content = str(document_data)  # Convert the row data to a string format
    doc_id = f"doc_{index}"  # Unique within this file: derived from the row index
    metadata = {
        "document_id": doc_id,
        "source": csv_file_path,
    }

    # Store the data in lists
    ids.append(doc_id)
    contents.append(content)
    metadatas.append(metadata)

In [None]:
# for doc_id, content, metadata in zip(ids, contents, metadatas):
#     print(f"Document ID: {doc_id}")
#     print(f"Content: {content}")
#     print(f"Metadata: {metadata}")
#     print("\n" + "-"*40 + "\n")  

In [None]:
def split_into_chunks(ids, contents, metadatas, batch_size):
    """Lazily yield aligned batches of the three parallel lists.

    Each yielded item is a tuple ``(ids_batch, contents_batch, metadatas_batch)``
    holding at most ``batch_size`` entries; batch boundaries follow ``len(ids)``.
    """
    total = len(ids)
    for start in range(0, total, batch_size):
        window = slice(start, start + batch_size)
        yield ids[window], contents[window], metadatas[window]

In [None]:
# Lazy batch generator over the CSV payload; it can be iterated only once.
chunks = split_into_chunks(ids, contents, metadatas, batch_size)

In [None]:
# Shows the generator object itself; nothing is consumed here.
chunks

In [None]:
def add_chunk_with_retries(ids_chunk, contents_chunk, metadatas_chunk, retries=3, delay=5):
    """Embed one batch and add it to ``csv_collection``, retrying on ``ValueError``.

    Args:
        ids_chunk: IDs of the entries in this batch.
        contents_chunk: Texts of the entries (these are what gets embedded).
        metadatas_chunk: Metadata dicts of the entries.
        retries: Maximum number of attempts.
        delay: Seconds to sleep between attempts.

    Returns:
        True when the batch was added, False when every attempt failed or when
        ``retries`` allows no attempt at all. Exceptions other than
        ``ValueError`` are not caught.
    """
    # Computed on the first attempt that gets that far and reused afterwards,
    # so a failing add() does not trigger another embedding pass.
    embeddings_chunk = None

    for attempt in range(retries):
        try:
            if embeddings_chunk is None:
                embeddings_chunk = embedding_function.embed_documents(contents_chunk)

            csv_collection.add(
                ids=ids_chunk,
                documents=contents_chunk,
                metadatas=metadatas_chunk,
                embeddings=embeddings_chunk
            )
            print(f"Successfully added {len(ids_chunk)} documents to the collection.")
            return True  # Indicate success
        except ValueError as e:
            print(f"Error: {e}")
            if attempt < retries - 1:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print("Max retries reached. Skipping this chunk.")
                return False  # Indicate failure

    # Only reached when retries <= 0: report failure explicitly instead of None.
    return False


In [None]:
# Push the batches one by one; the first batch that still fails after its
# retries aborts the whole ingest.
for ids_chunk, contents_chunk, metadatas_chunk in chunks:
    success = add_chunk_with_retries(ids_chunk, contents_chunk, metadatas_chunk)
    if success:
        continue
    print("Stopping further processing due to failure.")
    break

# Creating Chain

In [None]:
# collection_name="csv_collection"

In [None]:
# Collection the QA chain retrieves from.
# NOTE(review): the Text Handling section writes to "pdf_collection" — confirm
# that "text_collection" is the intended source here.
collection_name="text_collection"

In [None]:
# LangChain vector store over the existing client and the chosen collection,
# using the text embedding model for queries.
vectordb = Chroma(
    collection_name=collection_name, 
    client=client, 
    embedding_function=embedding_function
)

In [None]:
# Retriever with default settings; the next cell reassigns `retriver` with k=4.
retriver=vectordb.as_retriever()

In [None]:
retriver = vectordb.as_retriever(search_kwargs={"k": 4})  # passes k=4 to limit the number of results per query

In [None]:
# Groq chat model used for answering. "llama-3.1-70b-versatile" has been
# retired by Groq; "llama-3.3-70b-versatile" is its listed replacement.
# temperature=0 keeps the answers as deterministic as the model allows.
llm=ChatGroq(
    model="llama-3.3-70b-versatile",
    temperature=0
)

In [None]:
# Retrieval QA chain: the retriever supplies documents, the "stuff" chain type
# is used to combine them, and the retrieved documents are returned with the answer.
qa_chain=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriver,
    return_source_documents=True,
    
)

In [None]:
query = "Why Ai is good?"
# Alternative prompt wrappers; enable exactly one.
# concise_query = f"Answer briefly: {query}"
# concise_query = f"Summarize in a few sentences: {query}"
# concise_query = f"Provide a detailed explanation: {query}"
concise_query = f"List the key points about: {query} in bullet points."
# concise_query = f"Give a brief answer with examples about: {query}"
# concise_query = f"List the pros and cons of GPT briefly: {query}"


result = qa_chain.invoke({"query": concise_query})
answer = result["result"]
print("Answer:", answer)
print("**")

# The retriever may return nothing (e.g. an empty collection), so guard the
# lookup instead of indexing [0] unconditionally.
source_documents = result["source_documents"]
if source_documents:
    print(source_documents[0].metadata.get("source", "<unknown source>"))
else:
    print("No source documents were retrieved for this query.")

# Image Handling

In [None]:
# client = chromadb.PersistentClient(path=db_path)

In [None]:
# image_embedding_function = OpenCLIPEmbeddingFunction()

In [14]:
image_folder="images"  # folder containing the images to index

In [15]:
# Chroma ImageLoader, attached to the image collection as its data_loader.
data_loader = ImageLoader()

In [16]:
# Open or create the image collection with the OpenCLIP embedder and the image data loader.
image_collection = client.get_or_create_collection(
    name='image_collection',
    embedding_function=image_embedding_function,
    data_loader=data_loader
)

#### Function to add images to collection

In [17]:
def add_images_to_collection(folder_path):
    """Add every PNG/JPG/JPEG file found directly in ``folder_path`` to ``image_collection``.

    Each image is stored under its file name as ID. A file that cannot be
    read or added is reported and skipped; the remaining files are still processed.
    """
    allowed_suffixes = ('.png', '.jpg', '.jpeg')

    image_files = []
    for entry in os.listdir(folder_path):
        candidate = os.path.join(folder_path, entry)
        if not os.path.isfile(candidate):
            continue
        if entry.lower().endswith(allowed_suffixes):
            image_files.append(candidate)

    progress = tqdm(image_files, desc="Creating Image Embeddings and Adding to DB")
    for image_path in progress:
        try:
            pixels = np.array(Image.open(image_path))
            image_collection.add(ids=[os.path.basename(image_path)], images=[pixels])
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error processing {image_path}: {e}")

In [None]:
# Embed and store every supported image found in image_folder.
add_images_to_collection(image_folder)

Creating Image Embeddings and Adding to DB:  50%|███████████████████▌                   | 9/18 [00:02<00:02,  4.32it/s]

#### Query Image

In [None]:
# Text-to-image search: fetch the 3 nearest entries of the image collection for the query text.
query_text = "github?"
results = image_collection.query(
    query_texts=[query_text],
    n_results=3,
    include=["distances", "documents", "metadatas"]
)

In [None]:
# Display the raw image query result.
results

In [None]:
# Walk through the hits of the first (only) query and show each image from disk.
for image_id, distance in zip(results['ids'][0], results['distances'][0]):
    print(f"Result ID: {image_id}")
    print(f"Distance: {distance}")

    # IDs are file names, so the file is expected inside image_folder.
    image_path = os.path.join(image_folder, image_id)

    if not os.path.exists(image_path):
        print(f"Image file not found: {image_path}")
    else:
        try:
            image = Image.open(image_path)
            display(image)  # rich rendering via IPython
            print(f"Displaying image: {image_id}")
        except Exception as e:
            print(f"Could not display image: {e}")

    print("\n")  # Add space between results

In [None]:
def decode_image(embedding, shape=None):
    """Build a PIL image from raw pixel data.

    The input must be actual pixel values. A feature vector produced by an
    embedding model is not pixel data and cannot be turned back into a picture.

    Args:
        embedding: Pixel values, either already shaped as (height, width) or
            (height, width, channels), or flat when ``shape`` is given.
        shape: Optional target shape used to reshape flat input.

    Returns:
        The decoded ``PIL.Image.Image``.

    Raises:
        ValueError: If the input is None, cannot be reshaped to ``shape``, or
            is not 2-D/3-D after the optional reshape.
    """
    if embedding is None:
        raise ValueError("No pixel data to decode (got None).")

    image_array = np.asarray(embedding)
    if shape is not None:
        image_array = image_array.reshape(shape)

    if image_array.ndim not in (2, 3):
        raise ValueError(
            f"Expected pixel data with 2 or 3 dimensions, got shape {image_array.shape}; "
            "pass `shape=(height, width, channels)` to reshape flat data."
        )

    return Image.fromarray(np.uint8(image_array))

In [None]:
def query_and_display_images(query_text, n_results=3):
    """Query ``image_collection`` with a text prompt and display the matching images.

    Entries are stored under their file name as ID, so each hit is loaded
    from ``image_folder`` by that name. The query result carries no pixel
    data that could be decoded instead.

    Args:
        query_text: Text prompt to search with.
        n_results: Maximum number of hits to show.
    """
    results = image_collection.query(
        query_texts=[query_text],
        n_results=n_results,
        include=["distances"]
    )

    # A single query was sent, so only the first result set is relevant.
    for image_id, distance in zip(results['ids'][0], results['distances'][0]):
        print(f"Result ID: {image_id}")
        print(f"Distance: {distance}")

        image_path = os.path.join(image_folder, image_id)
        try:
            image = Image.open(image_path)

            # Display the image using IPython's display function
            display(image)

            print(f"Displaying image: {image_id}")
        except Exception as e:
            # Covers missing files as well as unreadable images.
            print(f"Could not display image: {e}")

        print("\n")  # Add space between results

In [None]:
# Run the text-to-image search and show the hits.
query_text = "github?"
query_and_display_images(query_text)

# CSV agent

In [None]:
from langchain_experimental.agents import create_csv_agent

In [None]:
# CSV agent over the monthly Samaa-vs-Geo data, driven by the Groq LLM.
# NOTE(review): allow_dangerous_code=True is an explicit opt-in; presumably it
# lets the agent execute LLM-generated Python, so only run it in a trusted environment.
agent = create_csv_agent(llm, 
                         'samaa_tv-vs-geo_jan-to-june-monthly-data.csv', 
                         verbose=True,
                         allow_dangerous_code=True)

In [None]:
# agent

In [None]:
# Ask the agent to compare the two channels' views.
agent.run("which channel samaa pr geo have higher views and what are reason?")

In [None]:
 # Ask the agent for the most-viewed video per subcategory_news_program.
 agent.run("Highest views video from every subcategory_news_program?")