In [None]:
import json
import re
import os
from dotenv import load_dotenv
load_dotenv()

In [None]:
# Read the JSON Lines input whose location is given by the TEST_FILE_PATH
# environment variable (load_dotenv() above may have populated it from .env).
test_file_path = os.environ.get('TEST_FILE_PATH')
if not test_file_path:
    # Fail with an actionable message instead of the opaque TypeError that
    # open(None) would raise.
    raise RuntimeError(
        "TEST_FILE_PATH is not set; define it in the environment or in a .env file"
    )

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(test_file_path, 'r', encoding='utf-8') as f:
    json_lines = f.readlines()

In [None]:
questions = []
functions = []
model_answers = []

# Extract question, function and model answer from each row and store them in
# the parallel lists questions, functions and model_answers.
for i, json_line in enumerate(json_lines, start=1):
    try:
        # Parse the JSON line
        data = json.loads(json_line)

        # Read all three fields BEFORE appending anything, so a row that lacks
        # one of them cannot leave the three lists with different lengths.
        question = data['question']
        function = data['function']
        model_answer = data['model_answer']
    except json.JSONDecodeError:
        print(f"Skipping line {i} due to JSONDecodeError")
        continue
    except (KeyError, TypeError) as exc:
        # KeyError: a required field is absent.
        # TypeError: the line decoded to something that is not a JSON object.
        print(f"Skipping line {i} due to missing field: {exc!r}")
        continue

    questions.append(question)
    functions.append(function)
    model_answers.append(model_answer)

In [None]:
def _split_top_level(text):
    """Split ``text`` on commas that are outside quotes and nested brackets."""
    pieces = []
    current = []
    depth = 0
    quote = None
    for ch in text:
        if quote is not None:
            # Inside a quoted item: only the matching quote character ends it.
            if ch == quote:
                quote = None
        elif ch in '"\'' and not ''.join(current).strip():
            # A quote only opens a quoted item when it is the first
            # non-blank character of that item (so "it's" stays a bare word).
            quote = ch
        elif ch == '[':
            depth += 1
        elif ch == ']' and depth > 0:
            depth -= 1
        elif ch == ',' and depth == 0:
            pieces.append(''.join(current))
            current = []
            continue
        current.append(ch)

    if quote is not None or depth != 0:
        # Unbalanced quotes/brackets: fall back to the plain comma split.
        return text.split(',')

    pieces.append(''.join(current))
    return pieces


def parse_value(value):
    """
    Convert a string captured by re.findall back into a Python value.

    The conversions are tried in this order:
      1. int
      2. float
      3. list, when the text is wrapped in ``[...]``; every item is parsed
         recursively, commas inside quoted items or nested brackets do not
         split, and an empty ``[]`` yields an empty list
      4. str, with surrounding single/double quotes stripped

    Args:
        value: the raw text of one argument value.

    Returns:
        An int, float, list or str as described above.
    """
    try:
        # Try to parse as integer
        return int(value)
    except ValueError:
        pass

    try:
        # Try to parse as float
        return float(value)
    except ValueError:
        pass

    # If it is wrapped in brackets, parse as a list
    if value.startswith('[') and value.endswith(']'):
        inner = value[1:-1]
        if not inner.strip():
            # "[]" is an empty list, not a list holding one empty string.
            return []
        return [parse_value(item.strip()) for item in _split_top_level(inner)]

    # Otherwise, return as a string, removing quotes if necessary
    return value.strip('\'"')

In [None]:
def check_type(value, dtype):
    """
    Refine the generic "number" datatype according to the value's exact type.

    Returns "integer" when dtype is "number" and value is exactly an int,
    "float" when dtype is "number" and value is exactly a float, and dtype
    unchanged in every other case.
    """
    # Anything that is not the generic "number" type is passed through as-is.
    if dtype != "number":
        return dtype

    # Exact type lookup (not isinstance), so e.g. bool is not treated as int.
    refined_names = {int: "integer", float: "float"}
    return refined_names.get(type(value), dtype)

In [None]:
def convert_value(value, value_type):
    """
    Coerce ``value`` to the datatype named by ``value_type``.

    Args:
        value: the already-parsed value (str, int, float, list, ...).
        value_type: one of "string", "integer", "number", "boolean", "array";
            any other name leaves the value untouched.

    Returns:
        The coerced value.

    Raises:
        ValueError: if a non-numeric value is given for "integer"/"number".
    """
    if value_type == "string":
        return str(value)

    if value_type == "integer":
        return int(value)

    if value_type == "number":
        # Values that are already exactly int/float are kept as they are.
        if type(value) in (int, float):
            return value
        try:
            return int(value)
        except ValueError:
            # e.g. "3.5": a "number" may be fractional, so fall back to float.
            return float(value)

    if value_type == "boolean":
        if isinstance(value, bool):
            return value
        # str() first: the value may already have been parsed to an int (1/0),
        # which has no .lower().
        return str(value).lower() in ["true", "1", "yes"]

    if value_type == "array":
        if isinstance(value, (str, int, float)):
            # A scalar becomes a one-element list.
            return [value]
        if isinstance(value, list):
            return value
        return list(value)

    return value

In [None]:
# Pattern of a function call in model_answer: captures a dotted name
# (`word.word`) and, lazily, everything between the following "(" and the
# first ")" after it.
pattern = r'(\w+\.\w+)\((.*?)\)'
# Pattern of one `key=value` parameter inside the call. The value is one of:
#   - a quoted string (single or double quotes, matched lazily),
#   - a bracketed list, matched lazily up to the first "]",
#   - a bare token of word characters, dots and +/- signs, so that negative
#     numbers (-5) and exponents (1e-3) are captured whole instead of being
#     dropped or truncated.
param_pattern = r'(\w+)\s*=\s*(["\'].*?["\']|\[\s*.*?\s*\]|[-+\w\.]+)'

In [None]:
# Initialize an empty list to store the JSON objects
json_data_list = []
# Not populated in this cell; left defined in case another cell refers to it.
json_data_list_indented = []

# Loop over the questions, functions, and model answers,
# apply the transformations and collect one JSON string per row.
for i, (question, function, model_answer) in enumerate(zip(questions, functions, model_answers)):
    # Search the model answer for a `name.name(args)` call
    match = re.search(pattern, model_answer)
    if match is None:
        # Without a recognizable call there is nothing to normalize.
        print(f"Skipping row {i}: no function call found in model_answer")
        continue

    # The function name is the first group
    function_name = match.group(1)

    # The raw argument text is the second group; pull out key=value pairs
    param_matches = re.findall(param_pattern, match.group(2))

    # Declared parameters of the function for this row
    properties = function["parameters"]["properties"]

    # Converted values keyed by parameter name
    parameters = {}

    # "key=value" strings used to rebuild the normalized call
    param_strings = []

    for key, value in param_matches:
        # Parse the value
        parsed_value = parse_value(value)

        schema = properties.get(key)
        if schema is None:
            # The answer uses a parameter the function does not declare:
            # there is no type to convert to, so keep the parsed value.
            print(f"Row {i}: parameter {key!r} is not declared in the function; keeping parsed value")
            converted_value = parsed_value
        else:
            # Convert the value based on the type info
            converted_value = convert_value(parsed_value, schema["type"])

            # Narrow "number" to integer/float. This rewrites the type inside
            # `function` in place, and that object is what gets written out
            # under "functions" below.
            schema["type"] = check_type(converted_value, schema["type"])

        # Add the converted value to the parameters dictionary
        parameters[key] = converted_value

        # Create a string in the format "key=value" for this parameter and add it to the list
        param_string = f'{key}="{converted_value}"' if isinstance(converted_value, str) else f"{key}={converted_value}"
        param_strings.append(param_string)

    # Join the list of strings with commas and surround it with parentheses
    param_string = "(" + ", ".join(param_strings) + ")"

    # Combine the function name and the parameter string
    normal_answer = function_name + param_string

    data = {
        "user_query": question,
        "functions": [function],
        "model_answer_openai": {
            "api_call": function_name,
            "parameters": parameters
        },
        "model_answer_normal": normal_answer
    }

    # Convert the dictionary to a JSON string and add it to the list
    json_data_list.append(json.dumps(data))

# Write one JSON object per line
with open('output_cleaned_better.jsonl', 'w', encoding='utf-8') as f:
    for json_data in json_data_list:
        f.write(json_data + '\n')
