In [None]:

# Mount Google Drive at /content/drive; the ChromaDB store used later in this
# notebook is persisted under that mount point.
from google.colab import drive
drive.mount('/content/drive')
# Unpack the 20 Newsgroups archive into the current working directory.
# NOTE(review): later cells read '/content/20news-bydate-test', so this presumably
# runs with /content as the working directory — confirm. The archive itself is
# expected at /content/, not on the mounted drive.
!tar -xvf /content/20news-bydate.tar.gz


In [None]:
import re
import os
from tqdm import tqdm

def divide_text_with_overlap(text, part_size=100, overlap_size=20):
    """
    Divide a text into fixed-size parts where consecutive parts share an overlap.

    Only complete parts are returned: a trailing remainder shorter than
    ``part_size`` (including an entire text shorter than ``part_size``) is dropped.

    :param text: The input text to be divided.
    :param part_size: The size of each part in characters; must be positive.
    :param overlap_size: The number of overlapping characters between consecutive
        parts; must be smaller than ``part_size`` so the window always advances.
    :return: A list of text parts, each exactly ``part_size`` characters long.
    :raises ValueError: If ``part_size`` is not positive or ``overlap_size`` is not
        smaller than ``part_size``.
    """
    if part_size <= 0:
        raise ValueError(f"part_size must be positive, got {part_size}")
    if overlap_size >= part_size:
        # A non-positive step would never move the window forward (endless loop).
        raise ValueError(
            f"overlap_size ({overlap_size}) must be smaller than part_size ({part_size})"
        )

    step = part_size - overlap_size  # Distance between the starts of consecutive parts
    last_start = len(text) - part_size  # Last offset that still yields a full-size part
    return [text[start:start + part_size] for start in range(0, last_start + 1, step)]

def retrieve_author_info(content):
    """
    Retrieve the author information from the ``From:`` header of a message.

    The header is located at the start of a line and its value runs to the end of
    that line, plus any folded continuation lines (lines that begin with a space
    or tab). The value is therefore not cut short by colons inside it (e.g. a URL)
    and never spills into the message body.

    :param content: The content from which to extract the author information.
    :return: The author's name or "Unknown" if no non-empty ``From:`` header is found.
    """
    header_match = re.search(
        r'^From:[ \t]*(.*(?:\r?\n[ \t]+\S.*)*)', content, re.MULTILINE
    )
    if not header_match:
        return "Unknown"

    # Unfold continuation lines into a single line before trimming.
    author = re.sub(r'\s*\n\s*', ' ', header_match.group(1)).strip()
    return author or "Unknown"

# Main function for processing the dataset
def handle_dataset(path_to_dataset, part_size=100, overlap_size=20):
    """
    Process the dataset by dividing the text files into fragments and extracting metadata.

    Every sub-directory of ``path_to_dataset`` is treated as one category. Folders
    and files are visited in sorted order so the returned lists (and any ids derived
    from their positions) are the same on every run.

    :param path_to_dataset: The path to the dataset directory.
    :param part_size: The size of each fragment.
    :param overlap_size: The number of characters shared by consecutive fragments.
    :return: A tuple containing two lists: text fragments and corresponding metadata.
    """
    text_fragments = []
    meta_data = []

    # Only directories are categories; stray files in the root are ignored.
    folder_list = sorted(
        entry for entry in os.listdir(path_to_dataset)
        if os.path.isdir(os.path.join(path_to_dataset, entry))
    )

    for folder_name in tqdm(folder_list, desc="Processing folders"):
        current_folder = os.path.join(path_to_dataset, folder_name)

        for file_name in sorted(os.listdir(current_folder)):
            file_location = os.path.join(current_folder, file_name)
            if not os.path.isfile(file_location):
                continue

            # latin1 maps every byte to a character, so decoding cannot fail.
            with open(file_location, 'r', encoding='latin1') as opened_file:
                raw_text = opened_file.read()

            # The file is already closed here; everything below works on the string.
            processed_text = re.sub(r'[^\w\s.,!?-]', '', raw_text.replace('\t', ' ').replace('\n', ' '))  # Remove special characters
            processed_text = re.sub(r'\s+', ' ', processed_text)  # Remove extra spaces
            sections = divide_text_with_overlap(processed_text, part_size, overlap_size)
            # The author is taken from the raw text because cleaning strips header punctuation.
            found_author = retrieve_author_info(raw_text)

            for fragment_id, section in enumerate(sections):
                meta_data.append({
                    'category': folder_name,
                    'fragment_id': f'{file_name}-{fragment_id}',
                    'author': found_author
                })
                text_fragments.append(section)

    return text_fragments, meta_data

# Build the fragment and metadata lists from the extracted test split.
# `fragments` and `metadata` are parallel lists consumed by the upload cell below.
dataset_directory = '/content/20news-bydate-test'
fragments, metadata = handle_dataset(dataset_directory)

In [None]:
# Install the third-party packages used by the embedding and ChromaDB cells.
# NOTE(review): versions are not pinned, and the Pinecone cell further down imports
# `pinecone`, which is not installed here — confirm how that package is provided.
!pip install -U sentence-transformers
!pip install chromadb

In [None]:

from sentence_transformers import SentenceTransformer

class Embedder():
    """Callable wrapper around a SentenceTransformer model that returns plain lists."""

    def __init__(self):
        """Load the multilingual paraphrase MPNet sentence-embedding model."""
        model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
        self.model = SentenceTransformer(model_name)

    def __call__(self, input):
        """
        Encode the given input and convert the result to nested Python lists.

        :param input: The value handed to the model's ``encode`` method.
        :return: The encoded result converted with ``tolist()``.
        """
        encoded = self.model.encode(input)
        return encoded.tolist()

# embedder = Embedder()



In [6]:
import chromadb
import numpy as np
from tqdm import tqdm

class CustomChromaDB:
    """Thin wrapper around a persistent ChromaDB collection with explicit embeddings."""

    def __init__(self, db_path='/content/drive/MyDrive/AT-lab5/ChromaDB',
                 collection_name="custom_coll", embedding_model=None):
        """
        Initialize the ChromaDB client with a specified path and embedding model.

        :param db_path: Directory in which the persistent ChromaDB store lives.
        :param collection_name: Name of the collection to open or create.
        :param embedding_model: Callable that maps a list of texts to a list of
            vectors; a new ``Embedder`` is created when omitted.
        """
        self.embedding_model = embedding_model if embedding_model is not None else Embedder()
        self.db_client = chromadb.PersistentClient(path=db_path)
        self.data_collection = self.db_client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_model
        )

    @staticmethod
    def _batch_bounds(total, batch_size):
        """Return the ``(start, end)`` slice bounds that cover ``total`` items in batches."""
        return [
            (start, min(start + batch_size, total))
            for start in range(0, total, batch_size)
        ]

    def upload_data(self, fragments, metadata, batch_size=20000):
        """
        Upload data fragments and their corresponding metadata to the ChromaDB collection in batches.

        :param fragments: List of text fragments to be uploaded.
        :param metadata: List of metadata entries corresponding to each fragment.
        :param batch_size: The number of fragments to process in one batch.
        :raises ValueError: If ``batch_size`` is smaller than 1 or the two lists
            differ in length.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be at least 1, got {batch_size}")
        if len(fragments) != len(metadata):
            raise ValueError(
                f"fragments ({len(fragments)}) and metadata ({len(metadata)}) must have the same length"
            )

        # Generate unique identifiers for the fragments (their position in the list)
        fragment_ids = [str(index) for index in range(len(fragments))]

        # The final slice is simply shorter, so no separate "remainder" upload is needed.
        batches = self._batch_bounds(len(fragments), batch_size)
        for start, end in tqdm(batches, desc="Uploading data"):
            batch_fragments = fragments[start:end]

            # Add the batch of fragments to the collection
            self.data_collection.add(
                documents=batch_fragments,
                embeddings=self.embedding_model(batch_fragments),
                metadatas=metadata[start:end],
                ids=fragment_ids[start:end]
            )

        print("Dataset successfully uploaded to ChromaDB.")

    def search(self, text, count=1):
        """
        Search for the most similar documents to the given text in the ChromaDB collection.

        :param text: The query text to search for (a single string or a list of strings).
        :param count: The number of results to return.
        :return: A dictionary containing the search results.
        """
        # Always embed a batch so the collection receives a list of query vectors.
        query_texts = [text] if isinstance(text, str) else text
        vectors = self.embedding_model(query_texts)
        result = self.data_collection.query(
            query_embeddings=vectors,
            n_results=count,
            include=['distances', 'embeddings', 'documents', 'metadatas'],
        )
        return result

In [None]:
# Open (or create) the persistent collection and index every fragment with its metadata.
# NOTE(review): upload_data assigns positional ids ("0", "1", ...), so re-running this
# cell against the same persisted collection re-submits the same ids — confirm how
# ChromaDB treats ids that already exist.
cdb = CustomChromaDB()
cdb.upload_data(fragments, metadata)


In [None]:
# Re-open the persisted collection so this cell can run without the upload cell.
cdb = CustomChromaDB()
# Placeholder: fill with entries indexable as (query_text, number_of_results).
questions = [
]
# Running index printed before each query's output.
count = 0
for question in questions:
    # question[0] is the query text, question[1] the number of results to fetch.
    result = cdb.search(question[0], question[1])
    print(count)
    print(question[0])
    print(result['documents'])
    print()
    count +=1

In [None]:
import pinecone
import numpy as np
from tqdm import tqdm

class CustomPinecone():
    """Thin wrapper around a Pinecone index that embeds texts before upserting or querying."""

    def __init__(self, api_key, environment, index_name, embedding_model):
        """
        Connect to Pinecone and open ``index_name``, creating the index if it is missing.

        :param api_key: Pinecone API key.
        :param environment: Pinecone environment name.
        :param index_name: Name of the index to use.
        :param embedding_model: Callable that maps one text to its embedding vector
            (for example an ``Embedder`` instance).
        """
        # Initialize Pinecone client
        pinecone.init(api_key=api_key, environment=environment)

        self.embedding_model = embedding_model

        # Check if the specified index already exists, create it if not
        if index_name not in pinecone.list_indexes():
            pinecone.create_index(index_name, dimension=self._embedding_dimension())

        self.index = pinecone.Index(index_name)

    def _to_vector(self, text):
        """Embed ``text`` and return the vector as a plain Python list."""
        vector = self.embedding_model(text)
        # Array-like results expose tolist(); a model that already returns a list does not.
        return vector.tolist() if hasattr(vector, 'tolist') else list(vector)

    def _embedding_dimension(self):
        """Return the model's vector size, measuring it when no ``dimension`` attribute exists."""
        dimension = getattr(self.embedding_model, 'dimension', None)
        if dimension is None:
            dimension = len(self._to_vector("dimension probe"))
        return dimension

    def upload_data(self, fragments, metadata, batch_size=100):
        """
        Embed every fragment and upsert it with its metadata in batches.

        Each item is stored as ``(id, vector, metadata)`` where the id is the
        fragment's position in ``fragments`` as a string; the fragment text itself
        is not sent to the index.

        :param fragments: List of text fragments to be uploaded.
        :param metadata: List of metadata entries corresponding to each fragment.
        :param batch_size: The number of items sent per upsert call.
        :raises ValueError: If ``batch_size`` is smaller than 1 or the two lists
            differ in length.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be at least 1, got {batch_size}")
        if len(fragments) != len(metadata):
            # zip() would silently drop the surplus entries of the longer list.
            raise ValueError(
                f"fragments ({len(fragments)}) and metadata ({len(metadata)}) must have the same length"
            )

        # Associate metadata with fragments and prepare for upload
        items_to_upload = []
        for i, (fragment, meta) in enumerate(zip(fragments, metadata)):
            items_to_upload.append((str(i), self._to_vector(fragment), meta))

            # Upload data in batches
            if len(items_to_upload) == batch_size:
                self.index.upsert(vectors=items_to_upload)
                items_to_upload = []

        # Upload remaining data
        if items_to_upload:
            self.index.upsert(vectors=items_to_upload)

        print("Dataset successfully uploaded to Pinecone.")

    def search(self, text, top_k=1):
        """
        Query the index for the vectors closest to the embedding of ``text``.

        :param text: The query text to search for.
        :param top_k: The number of matches to request.
        :return: The response object returned by the index query.
        """
        # Generate query vector
        query_vector = self._to_vector(text)
        # Query the index and return results
        results = self.index.query(queries=[query_vector], top_k=top_k, include_metadata=True)
        return results

In [None]:
import os

# CustomPinecone requires all four constructor arguments. Credentials are read from
# the environment so that no API key is stored in the notebook.
cpdb = CustomPinecone(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"],
    index_name=os.environ.get("PINECONE_INDEX_NAME", "custom-coll"),
    embedding_model=Embedder(),
)
# Placeholder: fill with entries indexable as (query_text, number_of_results).
questions = [
]
count = 0
for question in questions:
    result = cpdb.search(question[0], question[1])
    print(count)
    print(question[0])
    # Only ids, vectors and metadata are upserted to Pinecone, so the response has no
    # 'documents' field; print the whole response (matches with their metadata).
    print(result)
    print()
    count +=1