In [None]:
# Install the third-party packages this notebook imports (LangChain, the Groq
# client, Chroma, sentence-transformers, gdown).
# NOTE(review): no versions are pinned, so a later re-run may resolve to
# different releases; consider pinning, and consider `%pip` so the install is
# guaranteed to target the running kernel's environment.
!pip install --upgrade --quiet langchain langchain-community langchain-groq chromadb
!pip install sentence-transformers langchain-chroma gdown

In [None]:
import os
import gdown
import sqlite3

# Google Drive file ID of the SQLite database to fetch
file_id = '1-6tbviD3f4ofkAG7WeouY5QEOrOi-dcA'
destination = 'data.db'  # Local filename the download is saved under

# Download the file using gdown and keep its return value.
# NOTE(review): gdown is expected to return the output path on success; some
# releases return None on failure while others raise - confirm for the
# installed version.
downloaded_path = gdown.download(f'https://drive.google.com/uc?id={file_id}', destination, quiet=False)

# Judge success by the download result AND a non-empty file on disk. Checking
# only os.path.exists() would report success for a stale or empty 'data.db'
# left behind by an earlier run.
download_ok = (
    bool(downloaded_path)
    and os.path.exists(destination)
    and os.path.getsize(destination) > 0
)

if download_ok:
    print(f"File '{destination}' downloaded successfully.")
else:
    print(f"Failed to download the file '{destination}'.")

In [None]:
import os
import json

from kaggle_secrets import UserSecretsClient

# Pull the Kaggle credentials from the notebook's secret store and expose them
# as environment variables for the `kaggle` command-line tool.
secrets = UserSecretsClient()

os.environ['KAGGLE_USERNAME'] = secrets.get_secret("KAGGLE_USERNAME")
os.environ['KAGGLE_KEY'] = secrets.get_secret("KAGGLE_KEY")

# Dataset descriptor - edit these values for your own account / dataset.
meta = {
    "id": "mohamedmahmod/RAGdata",
    "title": "RAG_data",
    "isPrivate": False,
    "licenses": [{"name": "other"}],
}

# Serialise the descriptor next to the files that will be uploaded.
with open('/kaggle/working/dataset-metadata.json', 'w') as f:
    f.write(json.dumps(meta))



In [None]:
# Create a new Kaggle dataset from the contents of /kaggle/working, described
# by the dataset-metadata.json file written into that folder.
# NOTE(review): `--dir-mode zip` is understood to upload sub-directories as zip
# archives, and dataset visibility may be governed by the CLI's `--public`
# flag rather than the `isPrivate` metadata key - confirm against the Kaggle
# API docs.
!kaggle datasets create -p "/kaggle/working" --dir-mode zip

In [None]:
import sqlite3
import uuid
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings


class Document:
    """Minimal document record: a text body plus a metadata dictionary.

    Attributes are named ``page_content`` and ``metadata`` so instances can be
    handed to code that reads those two attributes.
    """

    def __init__(self, content, metadata=None):
        # Build a fresh dict per instance when the caller supplies none, so
        # instances never share one mutable default.
        if metadata is None:
            metadata = {}
        self.page_content = content
        self.metadata = metadata

def load_data_from_sqlite(db_path, category_id=14):
    """Load question/answer pairs from a SQLite database as ``Document`` chunks.

    Reads the ``q_body`` and ``a_body`` columns of table ``ds5b`` for rows whose
    ``category_id`` matches, and turns each row into one chunk.

    Args:
        db_path: Filesystem path of the SQLite database file.
        category_id: Value the ``category_id`` column must equal. Defaults to
            14, the category this notebook originally hard-coded (what that
            category represents is not visible from this file).

    Returns:
        A list of ``Document`` objects, one per row, whose ``page_content`` is
        ``"Q: <question>\\nA: <answer>"`` and whose ``metadata`` is
        ``{"question": <question>}``. The list is empty when no row matches.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried (for
            example when the ``ds5b`` table does not exist).
    """
    conn = sqlite3.connect(db_path)
    try:
        # Bind the filter value as a parameter instead of baking it into the SQL.
        rows = conn.execute(
            "SELECT q_body,a_body from ds5b where category_id=?",
            (category_id,),
        ).fetchall()
    finally:
        # Always release the connection, even when the query raises.
        conn.close()

    # Each question/answer pair becomes exactly one chunk.
    return [
        Document(content=f"Q: {question}\nA: {answer}", metadata={"question": question})
        for question, answer in rows
    ]

In [None]:
# Read the question/answer chunks from the SQLite file under /kaggle/input/ragdata
# (presumably the "ragdata" dataset attached to this notebook - confirm).
chunks = load_data_from_sqlite('/kaggle/input/ragdata/data.db')


In [None]:
def create_vectorstore(chunks, embedding_function, vectorstore_path):
    """Build a Chroma vector store from ``chunks``, dropping duplicate content.

    Every chunk gets a deterministic UUIDv5 derived from its ``page_content``.
    Chunks whose content (and therefore ID) has already been seen are skipped,
    keeping the first occurrence. Each surviving document is passed to Chroma
    together with its own ID, in the same order.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` string.
        embedding_function: Embedding object forwarded to
            ``Chroma.from_documents`` as ``embedding``.
        vectorstore_path: Directory forwarded to ``Chroma.from_documents`` as
            ``persist_directory``.

    Returns:
        The vector store object returned by ``Chroma.from_documents``.
    """
    # A dict keeps insertion order, so its keys (IDs) and values (chunks) stay
    # aligned pairwise. The previous list(set(...)) had arbitrary order, which
    # attached IDs to the wrong documents.
    unique_chunks_by_id = {}
    for chunk in chunks:
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        # setdefault keeps the first chunk seen for a given ID.
        unique_chunks_by_id.setdefault(chunk_id, chunk)

    # Create Chroma vectorstore from the unique chunks and their matching IDs
    vectorstore = Chroma.from_documents(documents=list(unique_chunks_by_id.values()),
                                        ids=list(unique_chunks_by_id.keys()),
                                        embedding=embedding_function,
                                        persist_directory=vectorstore_path)

    # Persist the vector store for future use.
    # NOTE(review): newer Chroma integrations reportedly persist automatically
    # and may not expose persist(); call it only when it is available.
    if hasattr(vectorstore, "persist"):
        vectorstore.persist()

    return vectorstore

In [None]:
# Embedding model used to vectorise the chunks: the CAMeL-Lab Arabic BERT
# checkpoint, loaded through LangChain's HuggingFaceEmbeddings wrapper.
embedding_function = HuggingFaceEmbeddings(model_name="CAMeL-Lab/bert-base-arabic-camelbert-mix")

# Directory the Chroma store is persisted to: the Kaggle working directory
# (not Google Drive).
vectorstore_path = "/kaggle/working/"

# Build the de-duplicated vector store from the loaded chunks and persist it
vectorstore = create_vectorstore(chunks=chunks, embedding_function=embedding_function, vectorstore_path=vectorstore_path)


In [None]:

# Re-open a persisted Chroma store from /kaggle/input/vectorragdata, reusing
# the same embedding function for queries.
# NOTE(review): this path presumably refers to a previously uploaded dataset
# attached as notebook input - confirm it exists before running.
vectorstore = Chroma(
    embedding_function=embedding_function,
    persist_directory='/kaggle/input/vectorragdata',
)

# Similarity retriever that returns the 5 closest chunks per query
retriever = vectorstore.as_retriever(
    search_kwargs={'k': 5},
    search_type="similarity",
)

# Sample Arabic question used to smoke-test retrieval
query = "ابي لديه السكري وياخذ الانسولين هل عليه خطوره"
relevant_chunks = retriever.invoke(query)

# Show the text of every retrieved question/answer chunk
for chunk in relevant_chunks:
    print(chunk.page_content)


In [None]:
import os

from langchain_groq import ChatGroq

# SECURITY: a Groq API key used to be hard-coded in this cell. A key that has
# been saved in a notebook must be treated as leaked - revoke it in the Groq
# console and issue a new one. The key is now read at runtime from the
# GROQ_API_KEY environment variable, falling back to a Kaggle secret of the
# same name (the same secret store this notebook uses for Kaggle credentials).
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    from kaggle_secrets import UserSecretsClient
    groq_api_key = UserSecretsClient().get_secret("GROQ_API_KEY")

# Chat model used to generate answers.
# NOTE(review): verify that this model id is still served by Groq.
llm = ChatGroq(model="llama-3.1-70b-versatile",
               api_key=groq_api_key,
               temperature=0.6,
               max_tokens=1024)


def answer_question(user_query):
    """Answer ``user_query`` with the LLM, grounded on retrieved chunks.

    The module-level ``retriever`` is queried with the user's question, the
    ``page_content`` of every returned chunk is joined with newlines, and the
    result is embedded in an Arabic prompt template. In English the template
    reads roughly: "Here is some information related to your question in the
    medical field: <chunks> Based on this information, please answer the
    following question: My question is: <query>".

    Args:
        user_query: The question text entered by the user.

    Returns:
        Whatever the module-level ``llm.invoke`` returns for the built prompt.
    """
    # Fetch the supporting chunks and flatten them into one text block.
    retrieved_docs = retriever.invoke(user_query)
    relevant_chunks_text = "\n".join([doc.page_content for doc in retrieved_docs])

    # The template text (including its indentation) is sent to the model as-is.
    prompt = f"""
    إليك بعض المعلومات المتعلقة بسؤالك في مجال الطب:

    {relevant_chunks_text}

    بناءً على هذه المعلومات، يُرجى الإجابة عن السؤال التالي:
    سؤالي هو: {user_query}
    """

    return llm.invoke(prompt)

In [None]:
# Interactive run: read one question from the user, answer it through the
# retrieval + LLM pipeline, and print the text of the reply.
user_query = input("يرجى إدخال سؤالك: ")  # Prompt (Arabic): "Please enter your question: "
output = answer_question(user_query)  # Value returned by llm.invoke inside answer_question
print("الإجابة:", output.content)  # Label (Arabic): "The answer:"; .content is the reply text

In [None]:
# Send the sample question straight to the LLM with no retrieved context in
# the prompt (presumably a baseline to compare with the RAG answer - confirm).
llm.invoke('ابي لديه السكري وياخذ الانسولين هل عليه خطوره').content

In [None]:
import os
import json

from kaggle_secrets import UserSecretsClient

# Pull the Kaggle credentials from the notebook's secret store and expose them
# as environment variables for the `kaggle` command-line tool.
secrets = UserSecretsClient()

os.environ['KAGGLE_USERNAME'] = secrets.get_secret("KAGGLE_USERNAME")
os.environ['KAGGLE_KEY'] = secrets.get_secret("KAGGLE_KEY")

# Descriptor for the vector-store dataset - edit for your own account / dataset.
meta = {
    "id": "mohamedmahmod/VectorRAGdata",
    "title": "VectorRAG_data",
    "isPrivate": False,
    "licenses": [{"name": "other"}],
}

# Overwrites any dataset-metadata.json already present in the working directory.
with open('/kaggle/working/dataset-metadata.json', 'w') as f:
    f.write(json.dumps(meta))


In [None]:
# Create a new Kaggle dataset from the contents of /kaggle/working, described
# by the dataset-metadata.json file currently in that folder.
# NOTE(review): /kaggle/working is also the Chroma persist directory and the
# folder the metadata file is written to, so everything present there is
# presumably included in the upload - confirm that is intended.
!kaggle datasets create -p "/kaggle/working" --dir-mode zip

In [None]:
# Delete chroma.sqlite3 from the working directory.
# NOTE(review): /kaggle/working/ is the Chroma persist directory used earlier,
# so this presumably removes the persisted store's SQLite file - confirm this
# cleanup is intended and runs at the right point.
!rm -rf /kaggle/working/chroma.sqlite3

In [None]:
# List the files in the current working directory.
!ls