In [6]:

from sentence_transformers import SentenceTransformer
import chromadb
import numpy as np
from tqdm import tqdm

class Embedder:
    """Callable that turns text into embeddings using a SentenceTransformer model."""

    def __init__(self):
        """Load the multilingual MPNet paraphrase model used for every encoding call."""
        model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
        self.model = SentenceTransformer(model_name)

    def __call__(self, input):
        """
        Encode ``input`` with the wrapped model and return the result as plain lists.

        :param input: Text (or texts) handed unchanged to ``self.model.encode``.
        :return: The encoder output converted through ``.tolist()``.
        """
        # NOTE(review): the parameter is presumably named `input` to satisfy the
        # ChromaDB embedding-function signature -- confirm before renaming it.
        encoded = self.model.encode(input)
        return encoded.tolist()

class CustomChromaDB:
    """Wrapper around one persistent ChromaDB collection whose texts are embedded with ``Embedder``."""

    def __init__(self, path='/content/drive/MyDrive/AT-lab5/ChromaDB', collection_name="custom_coll"):
        """
        Initialize the ChromaDB client with a specified path and embedding model.

        :param path: Directory handed to ``chromadb.PersistentClient``.
        :param collection_name: Name of the collection to fetch or create.
        """
        self.embedding_model = Embedder()
        self.db_client = chromadb.PersistentClient(path=path)
        self.data_collection = self.db_client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_model
        )

    def upload_data(self, fragments, metadata, batch_size=20000):
        """
        Upload data fragments and their corresponding metadata to the ChromaDB collection in batches.

        :param fragments: List of text fragments to be uploaded.
        :param metadata: List of metadata entries corresponding to each fragment.
        :param batch_size: The number of fragments to process in one batch.
        :raises ValueError: If ``batch_size`` is not positive, or if ``fragments`` and
            ``metadata`` do not have the same length.
        """
        # Validate up front so a bad call fails before anything is written,
        # instead of part-way through the upload.
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
        if len(fragments) != len(metadata):
            raise ValueError(
                f"fragments and metadata must have the same length "
                f"({len(fragments)} != {len(metadata)})"
            )

        # Each fragment's ID is its position in `fragments`, as a string.
        # NOTE(review): IDs restart at "0" on every call, so a second upload into
        # the same collection reuses IDs -- confirm how `add` treats duplicates.
        fragment_ids = [str(index) for index in range(len(fragments))]

        # Stepping by batch_size visits every fragment, including a final short
        # batch, so no separate "remainder" pass is needed afterwards.
        for start in tqdm(range(0, len(fragments), batch_size), desc="Uploading data"):
            end = start + batch_size
            batch_fragments = fragments[start:end]

            # Embeddings are computed explicitly for the batch and passed along
            # with the raw documents.
            self.data_collection.add(
                documents=batch_fragments,
                embeddings=self.embedding_model(batch_fragments),
                metadatas=metadata[start:end],
                ids=fragment_ids[start:end]
            )

        print("Dataset successfully uploaded to ChromaDB.")

    def search(self, text, count=1):
        """
        Search for the most similar documents to the given text in the ChromaDB collection.

        :param text: The query text to search for.
        :param count: The number of results to return.
        :return: The object returned by ``collection.query``, requested with
            distances, embeddings, documents and metadatas included.
        """
        vector = self.embedding_model(text)
        result = self.data_collection.query(
            query_embeddings=vector,
            n_results=count,
            include=['distances', 'embeddings', 'documents', 'metadatas'],
        )
        return result

In [None]:
import requests
import json

def make_post_request(prompt_text, *, url='https://generativelanguage.googleapis.com', timeout=60):
    """
    POST ``prompt_text`` to the generative-language endpoint and return the generated text.

    :param prompt_text: Prompt placed in the single ``contents[0].parts[0].text`` field.
    :param url: Endpoint to POST to.
        NOTE(review): the default is only the API host; presumably the full
        model path and API key have to be supplied here -- confirm.
    :param timeout: Seconds to wait for the server before giving up, so a
        stalled connection cannot hang the caller forever.
    :return: The text of the first candidate, or ``None`` when the request
        fails, returns a non-200 status, or carries no generated text.
    """
    # Set request headers
    headers = {
        'Content-Type': 'application/json',
    }

    # Build the request payload
    payload = {
        "contents": [
            {
                "parts": [
                    {
                        "text": prompt_text
                    }
                ]
            }
        ]
    }

    try:
        # Send the POST request; `json=` lets requests serialize the payload.
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Covers connection errors and timeouts alike.
        print(f"An error occurred during the request: {e}")
        return None

    # Print the response object
    print(response)

    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code}: {response.text}")
        return None

    try:
        # Parse the body once and reuse it for both printing and extraction.
        body = response.json()
    except ValueError as e:
        print(f"An error occurred during the request: {e}")
        return None

    print(body)

    try:
        return body['candidates'][0]['content']['parts'][0]['text']
    except (KeyError, IndexError, TypeError):
        # A 200 response without the expected candidate structure would
        # otherwise surface as an uncaught lookup error.
        print("Response did not contain any generated text.")
        return None

In [None]:
def evaluate_question(question, desired_answer):
    """
    Answer ``question`` from retrieved context and score the answer against ``desired_answer``.

    Uses the module-level ``cdb`` and ``bertscore`` objects and
    ``make_post_request``; all three must already be defined when this runs.

    :param question: Question used both as the search query and in the prompt.
    :param desired_answer: Reference answer the generated answer is scored against.
    :return: The value returned by ``bertscore.compute``, or ``None`` when no
        answer was generated.
    """
    # Fetch the three closest documents and join them into a single context string.
    result = cdb.search(question, 3)
    context = ' '.join(result['documents'][0])
    prompt = f"Context: {context}.Question: {question}"

    answer = make_post_request(prompt)
    if answer is None:
        # make_post_request returns None on failure; scoring None would raise.
        print("No answer was generated; skipping scoring.")
        return None

    metric = bertscore.compute(predictions=[answer], references=[desired_answer], model_type="distilbert-base-uncased")
    print(answer)
    # Surface the score, which was previously computed and then discarded.
    print(metric)
    return metric

In [None]:
# Question / reference-answer pairs to evaluate, in order.
qa_pairs = [
    ('Who was the first person in space?', 'Yuri Gagarin'),
    ('What is the meaning of life?', 'meanless'),
    ('Who is the best basketball player?', 'Michael Jordan'),
    ('What is Artificial Intelligence?', 'think like humans'),
    ('What is a computer?', 'electronic device'),
]
for question, desired_answer in qa_pairs:
    evaluate_question(question, desired_answer)