In [None]:
import json
import os
from urllib.parse import quote_plus

import pymongo
from dotenv import load_dotenv

load_dotenv()

In [None]:
# Build the MongoDB connection string from environment variables.
# The username and password are percent-escaped: reserved characters such as
# "@", ":", "/" or "%" in raw credentials would otherwise corrupt the URI and
# make it impossible to parse correctly.
# NOTE(review): this assumes the environment holds the *raw* (not already
# percent-encoded) credentials -- confirm against the .env file.
_URI = (
    "mongodb://"
    f"{quote_plus(os.environ['MONGODB_USERNAME'])}:{quote_plus(os.environ['MONGODB_PASSWORD'])}"
    f"@{os.environ['MONGODB_HOST']}:{os.environ['MONGODB_PORT']}"
    f"/{os.environ['MONGODB_DATABASE']}"
)

In [None]:
# Create the client from the connection string, then ask the server for its
# database names; that list is the last expression and so is shown as the
# cell's output.
client = pymongo.MongoClient(host=_URI)
client.list_database_names()

In [None]:
# Handles used by the rest of the notebook: the source collection that is read
# from and the target collection that extracted functions are written to.
# NOTE(review): the database name is hard-coded here while the connection URI
# takes its database from MONGODB_DATABASE -- confirm the two are meant to differ.
db = client.get_database("function_calling")
gorilla_openfunctions = db.get_collection("gorilla_openfunctions")
raw_functions = db.get_collection("raw_functions")

In [None]:
# Unique compound index over the provenance fields: at most one stored document
# per (origin dataset, split, source line, position within that line).
raw_functions.create_index(
    [
        ("origin_dataset", pymongo.ASCENDING),
        ("split", pymongo.ASCENDING),
        ("line_no", pymongo.ASCENDING),
        ("function_number", pymongo.ASCENDING),
    ],
    unique=True,
)

# Unique index over the function value itself, so two documents cannot carry
# the same `function` value regardless of where they came from.
raw_functions.create_index([("function", pymongo.ASCENDING)], unique=True)

In [None]:
# Number of documents currently in the source collection (empty filter = all).
total_functions_in_gorilla = gorilla_openfunctions.count_documents(filter={})
print(f"Total raw functions in gorilla: {total_functions_in_gorilla}")

In [None]:
# Value written to the `origin_dataset` field of every inserted document.
ORIGIN_DATASET = "gorilla_openfunctions"


def _extract_functions(raw_function):
    """Return the function definitions held by one source document.

    A "test" document stores a single function under data["function"]; it is
    wrapped in a one-element list. A "train" document stores its functions
    under data["Functions"] (presumably a list -- it is iterated by the
    caller); that value is returned as stored.

    Raises:
        ValueError: if the document's split is neither "test" nor "train".
    """
    split = raw_function["split"]
    if split == "test":
        return [raw_function["data"]["function"]]
    if split == "train":
        return raw_function["data"]["Functions"]
    raise ValueError(f"Invalid data split: {split!r}")


# Walk the source collection one document at a time and write each contained
# function to `raw_functions` as its own document, tagged with its provenance.
duplicate_count = 0
for raw_function in gorilla_openfunctions.find():
    split = raw_function["split"]
    line_no = raw_function["line"]
    function_data = _extract_functions(raw_function)

    for num, function in enumerate(function_data):
        try:
            raw_functions.insert_one({
                "origin_dataset": ORIGIN_DATASET,
                "split": split,
                "line_no": line_no,
                "function_number": num,
                "function": function,
            })
        except pymongo.errors.DuplicateKeyError:
            # The insert was rejected as a duplicate key; count it and carry
            # on with the next function instead of aborting the whole run.
            duplicate_count += 1

print(f"Total duplicate functions found: {duplicate_count}")

In [None]:
# Number of documents now stored in the target collection (empty filter = all).
total_raw_functions = raw_functions.count_documents(filter={})
print(f"Total raw functions: {total_raw_functions}")