In [11]:
import getpass
import json
import os
import random

import scenic
import tiktoken
from langchain.chains import LLMChain
from langchain_chroma import Chroma
from langchain_core.example_selectors import SemanticSimilarityExampleSelector
from langchain_core.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
)
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from scenic.simulators.carla import CarlaSimulator

from JsonFormatter import convert_file
from OutputParser import Scenic_output

In [4]:
# Basic configuration for the OpenAI-compatible chat endpoint.
#
# SECURITY: the API key is never written into the notebook. It is taken from the
# OPEN_AI_API_KEY environment variable, or prompted for (without echo) when that
# variable is unset. Any key that was previously committed in this cell must be
# treated as compromised and rotated.
if not os.environ.get("OPEN_AI_API_KEY"):
    os.environ["OPEN_AI_API_KEY"] = getpass.getpass("OpenAI API key: ")

model_name = "gpt-3.5-turbo"
model = ChatOpenAI(
    model_name=model_name,
    max_tokens=2048,
    temperature=0,  # deterministic decoding
    openai_api_key=os.environ["OPEN_AI_API_KEY"],
    openai_api_base="https://apikeyplus.com/v1"
)
# Variant of the model whose replies are parsed into a Scenic_output object.
structured_model = model.with_structured_output(Scenic_output)

In [5]:
# Some useful functions
###############################################
# Calculate the number of tokens in a message.#
###############################################
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301", include_final_response_prefix=True):
    """Estimate how many tokens a list of chat messages occupies.

    Args:
        messages: iterable of dicts; every value of every dict is tokenized
            and counted, and a "name" key adds a per-name adjustment.
        model: model name used to choose the tokenizer and the per-message
            overheads. Falls back to the "cl100k_base" encoding when tiktoken
            does not know the name.
        include_final_response_prefix: when True, 3 extra tokens are added for
            the priming of the assistant reply.

    Returns:
        The total token count as an int.

    Raises:
        NotImplementedError: if `model` contains neither "gpt-3.5" nor "gpt-4".
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: use the generic chat encoding instead.
        encoding = tiktoken.get_encoding("cl100k_base")

    # NOTE(review): the 4 / -1 overheads are applied to every model whose name
    # contains "gpt-3.5" -- confirm they still match the snapshot in use.
    if "gpt-3.5" in model:
        # every message follows <|start|>{role/name}\n{content}<|end|>\n,
        # and when a name is present the role is omitted
        per_message, per_name = 4, -1
    elif "gpt-4" in model:
        per_message, per_name = 3, 1
    else:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md \
        for information on how messages are converted to tokens.""")

    # every reply is primed with <|start|>assistant<|message|>
    total = 3 if include_final_response_prefix else 0
    for message in messages:
        total += per_message
        total += sum(len(encoding.encode(value)) for value in message.values())
        if "name" in message:
            total += per_name
    return total

################################################
# Convert the example to chat messages         #
################################################
# Example are in json formats and messages is in the format of the chat
def example_to_chat_messages(example):
    """Render one stored example as a pair of few-shot chat messages.

    Args:
        example: mapping with the keys 'docstring', 'map_and_model',
            'constants', 'monitors', 'behaviors', 'spatial_relations',
            'scenario' and 'background'.

    Returns:
        A two-element list: an "example_user" message holding the scenario
        description and an "example_assistant" message holding the sectioned
        program inside a code fence. Both use the "system" role.

    Raises:
        KeyError: if any of the keys listed above is missing.
    """
    user_content = '""" Scenario description\n%s\n"""' % example['docstring']

    # (section header, example key) pairs, in the order they are rendered.
    section_fields = (
        ('##Map and Model##', 'map_and_model'),
        ('##Constants##', 'constants'),
        ('##Moniters##', 'monitors'),
        ('##Defining Agent Behaviors##', 'behaviors'),
        ('##Spatial Relations##', 'spatial_relations'),
        ('##Scenario Specifications##', 'scenario'),
        ('##Background Activities##', 'background'),
    )
    rendered_sections = ''.join(
        '%s\n%s\n' % (header, example[field]) for header, field in section_fields
    )
    assistant_content = '```' + rendered_sections + '```'

    user_message = {"role": "system", "name": "example_user", "content": user_content}
    assistant_message = {"role": "system", "name": "example_assistant", "content": assistant_content}
    return [user_message, assistant_message]

################################################
# Get the number of tokens for the example     #
################################################
# We need to construct the chat messages from the example and then calculate the number of tokens
def get_example_num_tokens(example):
    """Count the tokens of one few-shot example.

    The example is first rendered with example_to_chat_messages and then
    measured with num_tokens_from_messages for the notebook's `model_name`,
    without the reply-priming tokens.
    """
    chat_messages = example_to_chat_messages(example)
    return num_tokens_from_messages(
        chat_messages,
        model_name,
        include_final_response_prefix=False,
    )

################################################
# Convert the input to chat messages           #
################################################
# Input is a string or a dictionary with docstring
# If the input is a sole string, then we should directly add it to Scenario Description
# If the input is a dictionary, then we should extract the docstring from it
def input_to_chat_messages(example):
    """Wrap a scenario description in a single "user" chat message.

    Args:
        example: either the description itself (a str) or a mapping whose
            'docstring' entry holds the description.

    Returns:
        A one-element list containing the user message.
    """
    if isinstance(example, str):
        docstring = example
    else:
        docstring = example['docstring']

    content = '""" Scenario description\n%s\n"""' % docstring
    return [{"role": "user", "content": content}]

################################################
# Get the number of tokens for the input       #
################################################
# We need to construct the chat messages from the input and then calculate the number of tokens
def get_input_num_tokens(example, include_final_response_prefix=True):
    """Count the tokens of the user input message.

    The input is rendered with input_to_chat_messages and measured with
    num_tokens_from_messages for the notebook's `model_name`; the
    `include_final_response_prefix` flag is forwarded unchanged.
    """
    chat_messages = input_to_chat_messages(example)
    return num_tokens_from_messages(
        chat_messages,
        model_name,
        include_final_response_prefix=include_final_response_prefix,
    )

A formatted request to the LLM should contain the following fields:
- _System Message_
    - This field will determine the role of the message in the system. It will be used to determine the type of message that is being sent.
- _Example Message_
    - This field will contain the format of the message that is being sent.
- _Prompt Message/Input Message_
    - This field will contain the prompt message that is being sent to the model. Based on the prompt message, the model will modify some of the field values in the Example Message and generate new scenarios.
- _Response Message_
    - This field will contain the response message that is generated by the model. This message will be generated based on the Example Message and Prompt Message.
- _RAG Status_
    - This field will contain the RAG status of the message. This will be used to determine the quality of the response generated by the model.

In [8]:
# System message: the instructions that frame every request sent to the model.
system_message_str = """You are a helpful agent that generates specifications for car driving scenarios in the Scenic language
Scenic is a domain-specific probabilistic programming language for modeling the environments of cyber-physical systems like robots and autonomous cars. A Scenic program defines a distribution over scenes,\
    configurations of physical objects and agents; sampling from this distribution yields concrete scenes which can be simulated to produce training or testing data. Scenic can also define (probabilistic)\
    policies for dynamic agents, allowing modeling scenarios where agents take actions over time in response to the state of the world.

Your task is to generate Scenic scenarios, each according to its corresponding description in English included as a docstring. Write each scenario in a separate code box. Follow the examples below:"""

# The instructions are sent with the "system" role only for gpt-4 model names;
# every other model receives them as a "user" message.
if model_name.startswith("gpt-4"):
    system_role = "system"
else:
    system_role = "user"

system_message = dict(role=system_role, content=system_message_str)

# Token cost of the instructions alone (no reply-priming tokens).
system_message_len = num_tokens_from_messages(
    [system_message],
    model_name,
    include_final_response_prefix=False,
)

print(system_message_len)

155


In [None]:
# Example message information
#
# Step 1: convert every .scenic file found in the sub-directories of the NHTSA
# scenario tree into a formatted JSON file.
folder_path = '../data_scenic/NHTSA_Scenarios'
output_dir = '../data_json/formatted'

os.makedirs(output_dir, exist_ok=True)

for root, dirs, _ in os.walk(folder_path):
    for dir_name in sorted(dirs):
        # Join with the directory currently being walked, not with folder_path:
        # os.walk also yields nested directories, and prefixing those with the
        # top-level folder produces paths that do not exist.
        temp_path = os.path.join(root, dir_name)
        for file_name in sorted(os.listdir(temp_path)):
            # Build the full path of the file.
            file_path = os.path.join(temp_path, file_name)
            # Only regular files with the .scenic extension are converted.
            if file_name.endswith('.scenic') and os.path.isfile(file_path):
                convert_file(file_path, output_dir)

# Step 2: load the formatted JSON documents as few-shot examples.
examples = []

formatted_dir = output_dir

# Sorted so the order of `examples` is the same on every run.
for file_name in sorted(os.listdir(formatted_dir)):
    file_path = os.path.join(formatted_dir, file_name)
    if not os.path.isfile(file_path):
        continue  # skip sub-directories
    with open(file_path, "r") as f:
        scenic_doc = json.load(f)  # Load the example from the JSON file
    # Drop bookkeeping fields that are not part of the prompt; tolerate files
    # that do not carry them.
    scenic_doc.pop('has_docstring', None)
    scenic_doc.pop('name', None)
    examples.append(scenic_doc)


In [12]:
# Prompt message information/Input message information


We will use LangChain's few-shot example support to give the LLM relevant in-context examples (the model itself is not retrained).

In [None]:
# Template that renders ONE stored example as a (human, ai) message pair.
# Chat templates only accept real message roles ("human", "ai", "system", ...);
# the example fields are template variables, not roles. The section headers
# (including the "Moniters" spelling) mirror example_to_chat_messages and
# output_to_string so that all prompts use the same layout.
examples_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", '""" Scenario description\n{docstring}\n"""'),
        (
            "ai",
            '```##Map and Model##\n{map_and_model}\n'
            '##Constants##\n{constants}\n'
            '##Moniters##\n{monitors}\n'
            '##Defining Agent Behaviors##\n{behaviors}\n'
            '##Spatial Relations##\n{spatial_relations}\n'
            '##Scenario Specifications##\n{scenario}\n'
            '##Background Activities##\n{background}\n```',
        ),
    ]
)

# One text per example for the vector store; str() guards against non-string fields.
to_vectorize = [" ".join(str(value) for value in example.values()) for example in examples]

# Use the same key variable and endpoint as the chat model; OpenAIEmbeddings()
# alone would look for OPENAI_API_KEY, which this notebook does not set.
embeddings = OpenAIEmbeddings(
    openai_api_key=os.environ["OPEN_AI_API_KEY"],
    openai_api_base=model.openai_api_base,
)
vectorstore = Chroma.from_texts(to_vectorize, embeddings, metadatas=examples)

# Picks the k stored examples most similar to the incoming description.
example_selector = SemanticSimilarityExampleSelector(vectorstore=vectorstore, k=2)

# Exactly one of `examples` / `example_selector` may be given; the selector is
# queried with the prompt's input variables (here: the scenario description).
few_shot_prompt = FewShotChatMessagePromptTemplate(
    input_variables=["docstring"],
    example_selector=example_selector,
    example_prompt=examples_prompt,
)

final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_message_str),
        few_shot_prompt,
        # The actual request, formatted like the example descriptions.
        ("human", '""" Scenario description\n{docstring}\n"""'),
    ]
)

# max_tokens and temperature are already configured on `model`.
chain = final_prompt | model


def generate_scenario(description):
    """Run the few-shot chain for one English scenario description.

    Args:
        description: the scenario description to turn into a Scenic program.

    Returns:
        Whatever `chain.invoke` returns for the prompt built from `description`.
    """
    return chain.invoke({"docstring": description})

In [13]:
def output_to_string(output: Scenic_output):
    """Flatten a structured model reply into one sectioned program string.

    Each of the seven fields of `output` is written as its header line
    followed by the field's text and a newline, in a fixed order.
    """
    sections = (
        ('##Map and Model##', output.map_and_model),
        ('##Constants##', output.constants),
        ('##Moniters##', output.monitors),
        ('##Defining Agent Behaviors##', output.behaviors),
        ('##Spatial Relations##', output.spatial_relations),
        ('##Scenario Specifications##', output.scenario),
        ('##Background Activities##', output.background),
    )
    return ''.join('%s\n%s\n' % section for section in sections)

def compile_scenic(scenic_str):
    """Compile Scenic source text and sample one scene from it.

    Args:
        scenic_str: the Scenic program as a string.

    Returns:
        A `(scene, numIters)` tuple as produced by `scenario.generate`.
    """
    # Re-seed Python's RNG from a fresh source so each call samples anew.
    random.seed()
    compiled_scenario = scenic.scenarioFromString(
        scenic_str,
        model='scenic.simulators.carla',
        mode2D=True,
    )
    sampled_scene, iterations_used = compiled_scenario.generate(maxIterations=1000)
    return sampled_scene, iterations_used

def simulate(scene, numIters, simulator=None):
    """Simulate `scene` and print the position of every object at each time step.

    Args:
        scene: the sampled scene to simulate (e.g. from compile_scenic).
        numIters: forwarded to the simulator's `simulate` call as `maxIterations`.
        simulator: optional ready-made simulator object exposing
            `simulate(scene, maxIterations=...)`. When omitted, a CarlaSimulator
            is created for this call and destroyed afterwards.

    Returns:
        True if the simulator returned a simulation, False otherwise.
    """
    owns_simulator = simulator is None
    if owns_simulator:
        # NOTE(review): assumes the scene's global parameters carry `carla_map`
        # and `map`, as set up by Scenic's CARLA world model -- confirm.
        simulator = CarlaSimulator(
            carla_map=scene.params.get('carla_map'),
            map_path=scene.params.get('map'),
        )

    try:
        simulation = simulator.simulate(scene, maxIterations=numIters)
        if not simulation:
            print('Simulation failed')
            return False

        for i, state in enumerate(simulation.result.trajectory):
            # A state holds one position per object; do not assume exactly two
            # objects, since generated scenarios may contain any number.
            positions = [str(position) for position in state]
            ego_text = positions[0] if positions else 'n/a'
            others_text = ', '.join(positions[1:]) or 'none'
            print(f'Time step {i}: ego at {ego_text}; other objects at {others_text}')
        return True
    finally:
        # Only tear down a simulator this function created itself.
        if owns_simulator:
            simulator.destroy()