# Creating, Loading, and Querying Our Database

In [2]:
import os
import re
from langchain.embeddings import GooglePalmEmbeddings
from langchain.vectorstores import DeepLake

Creating a Class

In [3]:
class DeepLakeLoader:
    """Build or open a DeepLake vector store from a numbered-passage text file.

    On construction the source file is split into passages. If a dataset
    directory named after the source file already exists under ``deeplake/``
    it is opened read-only; otherwise a new dataset is created from the
    passages. Embeddings are produced with ``GooglePalmEmbeddings``.

    Attributes set in ``__init__``:
        source_data_path: path of the text file given by the caller.
        db_name: base name of ``source_data_path``; used as the dataset name.
        data: list of passages returned by ``split_data``.
        db: the DeepLake vector store returned by ``load_db``/``create_db``.
    """

    def __init__(self, source_data_path) -> None:
        self.source_data_path = source_data_path
        self.db_name = os.path.basename(source_data_path)
        self.data = self.split_data()

        if self.check_if_db_exists():
            self.db = self.load_db()
        else:
            self.db = self.create_db()

    def _dataset_path(self) -> str:
        """Return the relative dataset path shared by load/create/exists."""
        return f'deeplake/{self.db_name}'

    def check_if_db_exists(self) -> bool:
        """Check whether the dataset path is already present on disk.

        Returns:
            True if ``deeplake/<db_name>`` exists relative to the current
            working directory, False otherwise.
        """
        return os.path.exists(self._dataset_path())

    def load_db(self):
        """Load the database if it already exists.

        Returns:
            DeepLake object opened in read-only mode.
        """
        return DeepLake(
            dataset_path=self._dataset_path(),
            embedding_function=GooglePalmEmbeddings(),
            read_only=True,
        )

    def create_db(self):
        """Create the database if it does not already exist.

        Returns:
            DeepLake object populated from ``self.data``.
        """
        return DeepLake.from_texts(
            self.data,
            GooglePalmEmbeddings(),
            dataset_path=self._dataset_path(),
        )

    def split_data(self) -> list:
        """Preprocess the source file by splitting it into a list of passages.

        A new passage starts wherever a number followed by ``". "`` begins
        (e.g. ``"12. "``). Passages shorter than 30 characters are dropped.

        Returns:
            list of passage strings, each still carrying its leading number.
        """
        with open(self.source_data_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # The negative lookbehind keeps the split at the *start* of a number;
        # without it "10. " would also split between "1" and "0", leaving the
        # passage labelled "0. ".
        passages = re.split(r'(?<!\d)(?=\d+\. )', content)

        # The length filter also removes the empty leading fragment produced
        # when the file begins with a passage number.
        return [passage for passage in passages if len(passage) >= 30]

    def query_db(self, query, k=3):
        """Query the database for passages that are similar to the query.

        Args:
            query (str): Query string.
            k (int): Number of results requested from the similarity search.
                Defaults to 3.

        Returns:
            List of the ``page_content`` strings of the matching documents.
        """
        results = self.db.similarity_search(query, k=k)
        return [result.page_content for result in results]