In [5]:
import json, pymongo, os, re
%run ../.env/credentialsDB.py

In [6]:
class Connection:
    """Wrapper around a MongoDB client that exposes the HTS database handles.

    On construction it creates the client and resolves the ``hts`` database
    together with its ``hts_records`` and ``string_dict`` collections.

    The object can also be used as a context manager so the client is closed
    even when the body raises::

        with Connection(uri) as connection:
            ...
    """

    def __init__(self, db_path: str):
        """__init__ function of the class, defines the connection variables

        Args:
            db_path (str): Path to the database connection on MongoDB
        """

        self.client = pymongo.MongoClient(db_path)
        self.db = self.client['hts']
        self.collection_records = self.db['hts_records']
        self.collection_string_dict = self.db['string_dict']

    def closeConnection(self):
        """Close connection function, closes the client created in the Connection class
        """
        self.client.close()

    def __enter__(self):
        """Enter the context manager

        Returns:
            Connection: This same instance, so it can be bound with ``as``
        """
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Leave the context manager, always closing the client

        Returns:
            bool: Always False, so an exception raised in the body keeps propagating
        """
        self.closeConnection()
        return False

def addHTSRecord(folder_path: str, record: str, collection: pymongo.collection.Collection):
    """Specific function used by the createHTSDatabase() in order to add each document into the hts collection

    The file is parsed as JSON and stored as ``{'header': <file name without the
    trailing .json>, 'data': <parsed content>}``. A failed insert is reported on
    stdout and does not raise, so the caller can carry on with the next file.
    Errors while opening or parsing the file are not caught here.

    Args:
        folder_path (str): Path to folder where .json hts chapters are stored (with or without a trailing separator)
        record (str): Name of file to be opened for addition
        collection (pymongo.collection.Collection): Pymongo collection object gathered from the Connection class
    """

    file_path = os.path.join(folder_path, record)

    # Parse first so the file handle is released before talking to the database
    with open(file_path, encoding='utf-8') as sec:
        data = json.load(sec)

    hts_record = {
        # Anchored on purpose: only the trailing extension is removed, never a ".json" in the middle of the name
        'header': re.sub(r'\.json$', '', record),
        'data': data
    }

    try:
        collection.insert_one(hts_record)
    except Exception as e:
        # Report the file name (not the file object) and the cause, matching addStrRecord's message style
        print(f'Error creating record in db for file {record}, error: {e}')

def queryHTSString(chaps: list, hts_collection: pymongo.collection.Collection):
    """Function helper of addStrRecord() that looks up the _id of every HTS document whose 'header' matches one of the given chapters.

    Chapters are queried one by one, in the order given, so the returned ids follow
    that order. A chapter with no matching document contributes nothing, and a
    chapter matching several documents contributes every one of their ids.

    Args:
        chaps (list): List of chapters to be queried in Mongo
        hts_collection (pymongo.collection.Collection): HTS collection already created and populated with chapter information for query

    Returns:
        list: Returns a list of _id values from the hts collection from MongoDB to replace the string_dict list of chapters
    """

    ids = []

    for chap in chaps:

        # find() returns a cursor that has to be iterated to reach the documents.
        # Only _id is needed, so project it and avoid fetching each document's full 'data' field
        cursor = hts_collection.find({ 'header': chap }, { '_id': 1 })

        ids.extend(document['_id'] for document in cursor)

    return ids

def addStrRecord(str_chaps: list, key: str, connection: Connection):
    """Builds one string_dict document for a key word and inserts it into the string_dict collection

    The chapter list is first translated into the matching ids of the HTS
    collection through queryHTSString(). Any exception raised while querying or
    inserting is printed and swallowed, so the caller keeps going.

    Args:
        str_chaps (list): Chapters listed for this key in the original string_dict json file
        key (str): Key word stored in the 'string' field of the new document
        connection (Connection): Connection class object that connects to the MongoDB instance
    """

    try:
        chapter_ids = queryHTSString(str_chaps, connection.collection_records)
        connection.collection_string_dict.insert_one({
            'string': key,
            'chaps': chapter_ids
        })
    except Exception as exc:
        print(f'Error creating record in db for str_dict key: {key}, error: {exc}')


In [7]:
def createHTSDatabase(path_records: str, file_list: list, connection: Connection):
    """Populates the hts_records collection, one document per file name received

    Args:
        path_records (str): Folder holding the individual .json record files
        file_list (list): File names inside path_records, processed in the given order
        connection (Connection): Connection class object that connects to the MongoDB instance
    """
    # Announce each file, then delegate the insert to addHTSRecord()
    for record_name in file_list:
        print(f'Creating document {record_name} in hts_records collection...')
        addHTSRecord(path_records, record_name, connection.collection_records)

    print('Completed hts_records collection!')

def createStringDict(path_string_dict: str, connection: Connection):
    """Populates the string_dict collection from the string_dict.json file

    Every top-level key of the json object becomes one document, created through
    addStrRecord() with the value stored under that key as its chapter list.

    Args:
        path_string_dict (str): Folder (including the trailing separator) that contains string_dict.json
        connection (Connection): Connection class object that connects to the MongoDB instance
    """

    source_path = f'{path_string_dict}string_dict.json'

    with open(source_path, 'r') as handle:

        entries = json.load(handle)

        # One document per key, in the order the keys appear in the file
        for keyword, chapters in entries.items():
            print(f'Creating document for <{keyword}> in string_dict collection...')
            addStrRecord(chapters, keyword, connection)

        print('Completed creating string_dict collection!')


Define the paths and the file list, then run the database creation functions. Do not run this cell if the database has already been created: every record would be inserted again.

In [8]:
# Folders holding the per-record .json files and the string_dict.json file
folder_path = '../db_hts/temp/NEW_test_files/'
string_folder_path = '../db_hts/temp/NEW_test_string_dict/'

# os.listdir() returns entries in arbitrary order and may include things that are not
# record files (e.g. a .ipynb_checkpoints folder); keep only the .json files, sorted so
# the insertion order is reproducible
file_list = sorted(name for name in os.listdir(folder_path) if name.endswith('.json'))

# PATH_DB, USER_DB, PW_DB and CLUSTER_DB are not defined in this notebook; presumably they
# come from the credentials script run in the first cell -- verify before running
connection = Connection(f'{PATH_DB}{USER_DB}:{PW_DB}@{CLUSTER_DB}')

# Always release the client, even when one of the load steps raises
try:
    createHTSDatabase(folder_path, file_list, connection)
    createStringDict(string_folder_path, connection)
finally:
    connection.closeConnection()

Creating document 0101.json in hts_records collection...
Creating document 0102.json in hts_records collection...
Creating document 0103.json in hts_records collection...
Creating document 0104.json in hts_records collection...
Creating document 0105.json in hts_records collection...
Creating document 0106.json in hts_records collection...
Creating document 0201.json in hts_records collection...
Creating document 0202.json in hts_records collection...
Creating document 0203.json in hts_records collection...
Creating document 0204.json in hts_records collection...
Creating document 0205.json in hts_records collection...
Creating document 0206.json in hts_records collection...
Creating document 0207.json in hts_records collection...
Creating document 0208.json in hts_records collection...
Creating document 0209.json in hts_records collection...
Creating document 0210.json in hts_records collection...
Creating document 0301.json in hts_records collection...
Creating document 0302.json in 