In [19]:
import os
import json
import tiktoken
import scenic
import random
from langchain_openai import ChatOpenAI, OpenAI, OpenAIEmbeddings
from langchain_core.pydantic_v1 import BaseModel, Field
from OutputParser import Scenic_output
from langchain_core.prompts.few_shot import FewShotPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_chroma import Chroma
from langchain.chains import LLMChain
from langchain_core.example_selectors import SemanticSimilarityExampleSelector
from langchain_core.documents.base import Document
from langchain_community.embeddings import LlamaCppEmbeddings
# from scenic.simulators.carla import CarlaSimulator

In [20]:
# Basic Configurations for OpenAI
# SECURITY: the API key is deliberately NOT stored in the notebook. It is taken from the
# OPENAI_API_KEY environment variable, and only requested interactively when that variable
# is missing or empty. Any key that was ever committed in this file must be treated as
# leaked and rotated.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass  # local import: only needed for the interactive fallback
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

# OpenAI-compatible endpoint and model used for every request in this notebook
base_url = "https://api.deepseek.com/v1"
model_name = "deepseek-coder"
model = ChatOpenAI(
    model_name=model_name,
    max_tokens=2048,
    temperature=0,
    openai_api_key=os.environ["OPENAI_API_KEY"],
    openai_api_base=base_url,
)
# Variant of the model whose replies are parsed into the Scenic_output schema
structured_model = model.with_structured_output(Scenic_output)

In [21]:
# Some useful functions
###############################################
# Calculate the number of tokens in a message.#
###############################################
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301", include_final_response_prefix=True):
    """Count the tokens consumed by a list of chat messages.

    Args:
        messages: iterable of dicts; every value of every dict is tokenized.
        model: model name, used to pick the tiktoken encoding and the
            per-message / per-name overheads.
        include_final_response_prefix: when true, 3 extra tokens are added
            for the priming of the assistant reply.

    Returns:
        The total token count as an int.

    Raises:
        NotImplementedError: if ``model`` contains none of "gpt-3.5",
            "gpt-4" or "deepseek".
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken does not know this model name: fall back to cl100k_base
        encoding = tiktoken.get_encoding("cl100k_base")

    if "gpt-3.5" in model:
        # every message follows <|start|>{role/name}\n{content}<|end|>\n,
        # and the role is omitted when a name is present
        per_message, per_name = 4, -1
    elif "gpt-4" in model or "deepseek" in model:
        per_message, per_name = 3, 1
    else:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md \
        for information on how messages are converted to tokens.""")

    total = 0
    for message in messages:
        total += per_message
        total += sum(len(encoding.encode(value)) for value in message.values())
        if "name" in message:
            total += per_name

    # every reply is primed with <|start|>assistant<|message|>
    return total + 3 if include_final_response_prefix else total

################################################
# Convert the example to chat messages         #
################################################
# Example are in json formats and messages is in the format of the chat
def example_to_chat_messages(example):
    """Turn one JSON-style example into a pair of few-shot chat messages.

    The first message carries the scenario description (``example['docstring']``)
    under the name "example_user"; the second carries the seven code sections
    under the name "example_assistant". Both use the "system" role.

    Args:
        example: mapping with the keys 'docstring', 'map_and_model', 'constants',
            'monitors', 'behaviors', 'spatial_relations', 'scenario' and 'background'.

    Returns:
        A list of two message dicts with "role", "name" and "content" keys.
    """
    user_content = '""" Scenario description\n%s\n"""' % example['docstring']

    # Sections are substituted in the same order as the headings of the template below.
    section_keys = ('map_and_model', 'constants', 'monitors', 'behaviors',
                    'spatial_relations', 'scenario', 'background')
    assistant_template = '```##Map and Model##\n%s\n##Constants##\n%s\n##Moniters##\n%s\n##Defining Agent Behaviors##\n%s\n##Spatial Relations##\n%s\n##Scenario Specifications##\n%s\n##Background Activities##\n%s\n```'
    assistant_content = assistant_template % tuple(example[key] for key in section_keys)

    return [
        {"role": "system", "name": "example_user", "content": user_content},
        {"role": "system", "name": "example_assistant", "content": assistant_content},
    ]

################################################
# Get the number of tokens for the example     #
################################################
# We need to construct the chat messages from the example and then calculate the number of tokens
def get_example_num_tokens(example):
    """Return the token count of one example once rendered as chat messages.

    The reply-priming tokens are not included.
    """
    chat_messages = example_to_chat_messages(example)
    return num_tokens_from_messages(
        chat_messages,
        model_name,
        include_final_response_prefix=False,
    )

################################################
# Convert the input to chat messages           #
################################################
# Input is a string or a dictionary with docstring
# If the input is a sole string, then we should directly add it to Scenario Description
# If the input is a dictionary, then we should extract the docstring from it
def input_to_chat_messages(example):
    """Wrap a scenario description into a single "user" chat message.

    Args:
        example: either the description itself (a str) or a mapping that
            holds it under the 'docstring' key.

    Returns:
        A one-element list containing the user message dict.
    """
    if isinstance(example, str):
        description = example
    else:
        description = example['docstring']

    content = '""" Scenario description\n%s\n"""' % description
    return [{"role": "user", "content": content}]

################################################
# Get the number of tokens for the input       #
################################################
# We need to construct the chat messages from the input and then calculate the number of tokens
def get_input_num_tokens(example, include_final_response_prefix=True):
    """Return the token count of an input once rendered as a user chat message.

    Args:
        example: a description string or a mapping with a 'docstring' key.
        include_final_response_prefix: forwarded to num_tokens_from_messages.
    """
    chat_messages = input_to_chat_messages(example)
    return num_tokens_from_messages(
        chat_messages,
        model_name,
        include_final_response_prefix=include_final_response_prefix,
    )

A formatted request to the LLM should contain the following fields:
- _System Message_
    - This field will determine the role of the message in the system. It will be used to determine the type of message that is being sent.
- _Example Message_
    - This field will contain the format of the message that is being sent.
- _Prompt Message/Input Message_
    - This field will contain the prompt message that is being sent to the model. Based on the prompt message, the model will modify some of the field values in the Example Message and generate new scenarios.
- _Response Message_
    - This field will contain the response message that is generated by the model. This message will be generated based on the Example Message and Prompt Message.
- _RAG Status_
    - This field will contain the RAG status of the message. This will be used to determine the quality of the response generated by the model.

In [22]:
# System message information
# NOTE: a trailing backslash inside the triple-quoted string joins the next line WITHOUT
# inserting any whitespace, so each continued line must end with an explicit space before
# the backslash; otherwise words get glued together in the prompt
# (e.g. "scenes,configurations", "(probabilistic)policies").
system_message_str = """You are a helpful agent that generates specifications for car driving scenarios in the Scenic language
Scenic is a domain-specific probabilistic programming language for modeling the environments of cyber-physical systems like robots and autonomous cars. A Scenic program defines a distribution over scenes, \
configurations of physical objects and agents; sampling from this distribution yields concrete scenes which can be simulated to produce training or testing data. Scenic can also define (probabilistic) \
policies for dynamic agents, allowing modeling scenarios where agents take actions over time in response to the state of the world.\

Your task is to generate Scenic scenarios, each according to its corresponding description in English included as a docstring. \
Please refer all the codes to the given few-shot examples and reference guide syntax. Some self-made APIs are not implemented in Scenic language:"""

system_role = "system"
system_message = {"role": system_role, "content": system_message_str}
# Token cost of the system message alone (no reply-priming tokens)
system_message_len = num_tokens_from_messages([system_message], model_name, include_final_response_prefix=False)
print(system_message_len)

167


In [23]:
# Example message information
from JsonFormatter import convert_file

folder_path = '../data_scenic/NHTSA_Scenarios'
output_dir = '../data_json/formatted'

# Convert every .scenic file that lives in a sub-directory of folder_path to JSON.
# os.walk descends recursively and reports each visited directory as `root`; the
# sub-directory names must therefore be joined onto `root` (not onto folder_path),
# otherwise directories nested more than one level deep resolve to non-existent paths.
for root, sub_dirs, _ in os.walk(folder_path):
    for sub_dir in sub_dirs:
        scenario_dir = os.path.join(root, sub_dir)
        for file_name in os.listdir(scenario_dir):
            # Full path to the file
            file_path = os.path.join(scenario_dir, file_name)
            # Only regular files with the .scenic extension are converted
            if file_name.endswith('.scenic') and os.path.isfile(file_path):
                convert_file(file_path, output_dir)

examples = []

formatted_dir = output_dir

# sorted() makes the example order reproducible across runs (os.listdir order is arbitrary)
for file_name in sorted(os.listdir(formatted_dir)):
    file_path = os.path.join(formatted_dir, file_name)
    if not os.path.isfile(file_path):
        continue  # skip stray sub-directories instead of failing in open()
    with open(file_path, "r") as f:
        scenic_doc = json.load(f)  # Load the example from the JSON file
    # Bookkeeping fields are dropped when present; a missing key is not an error.
    scenic_doc.pop('has_docstring', None)
    scenic_doc.pop('name', None)
    # Missing sections are stored as empty strings so every value is a str
    for key, value in scenic_doc.items():
        if value is None:
            scenic_doc[key] = ""
    examples.append(scenic_doc)


In [73]:
# Prompt message information/Input message information
# Natural-language description of the scenario the model is asked to generate
input_message_str = """
Ego car turning left into another road in a intersaction. Only one car in this scenario.
"""

# Token count of the raw description (reply-priming tokens excluded)
input_message = dict(docstring=input_message_str)
input_message_len = num_tokens_from_messages(
    messages=[input_message],
    model=model_name,
    include_final_response_prefix=False,
)

We will use LangChain's few-shot example features to prompt our LLM.

In [25]:
# Load the scenario examples
# Each example is embedded as the space-joined text of all its fields; the example dict
# itself is stored as metadata so the selector can return it unchanged.
to_vectorize = [" ".join(example.values()) for example in examples]

# The GGUF model location is machine-specific. It can be overridden with the
# LLAMA_EMBEDDING_MODEL_PATH environment variable; the original path stays the default.
llama_model_path = os.environ.get(
    "LLAMA_EMBEDDING_MODEL_PATH",
    "/home/loring/Documents/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q4_K_M.gguf",
)

embedding = LlamaCppEmbeddings(
    model_path=llama_model_path,
    n_gpu_layers=32,
    n_batch=512,
)

example_vs = Chroma.from_texts(
    embedding=embedding,
    texts=to_vectorize,
    metadatas=examples,
    collection_name="example",
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /home/loring/Documents/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 ll

In [26]:
# Load the reference document as knowledge database
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Knowledge-base locations
source = "../database"
source_path = "../document/output-1.json"  # JSON file read by load_json()

# Chunking parameters consumed by split_documents()
CHUNK_SIZE, CHUNK_OVERLAP = 1000, 20

def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the record's title and HTML body into the document metadata.

    Args:
        record: one JSON record; its "title" and "html" entries are read
            (missing entries yield None).
        metadata: metadata dict to extend; it is modified in place.

    Returns:
        The same ``metadata`` object, now holding "title" and "content".
    """
    metadata.update(
        title=record.get("title"),
        content=record.get("html"),
    )
    return metadata

def load_json():
    """Load the reference documentation from ``source_path`` with JSONLoader.

    Every element selected by the jq schema ".[]" becomes one document whose
    text is taken from the "html" key; metadata_func adds the metadata.

    Returns:
        Whatever ``JSONLoader.load()`` returns for that file.
    """
    json_loader = JSONLoader(
        file_path=source_path,
        jq_schema=".[]",
        content_key="html",
        metadata_func=metadata_func,
    )
    return json_loader.load()

def split_documents(loaded_docs):
    """Split documents with a RecursiveCharacterTextSplitter.

    Uses the module-level CHUNK_SIZE and CHUNK_OVERLAP settings.

    Args:
        loaded_docs: documents to split.

    Returns:
        The result of ``splitter.split_documents(loaded_docs)``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    return text_splitter.split_documents(loaded_docs)

# Load the reference documents; they are indexed as loaded
# (split_documents is defined above but not applied here).
chunk_docs = load_json()

# Index the documents in their own Chroma collection, using the same embedding model
docs_vs = Chroma.from_documents(
    collection_name="docs",
    embedding=embedding,
    documents=chunk_docs,
)



llama_print_timings:        load time =     116.89 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     128.57 ms /   512 tokens (    0.25 ms per token,  3982.20 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =     155.57 ms /   513 tokens

llama_print_timings:        load time =     116.89 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     128.20 ms /   512 tokens (    0.25 ms per token,  3993.73 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =     180.15 ms /   513 tokens

llama_print_timings:     

In [27]:
# Show the first loaded reference document as a quick sanity check of the loader output.
print(chunk_docs[0])

page_content="latest\n\nINTRODUCTION\n\nGetting Started with Scenic\nNotes on Installing Scenic\nWhat’s New in Scenic\n\nTUTORIALS\n\nScenic Fundamentals\nDynamic Scenarios\nComposing Scenarios\n\nLANGUAGE AND TOOL REFERENCE\n\nSyntax Guide\nLanguage Reference\nCommand-Line Options\nUsing Scenic Programmatically\nDeveloping Scenic\nScenic Internals\nHow Scenic is Compiled\nGuide to the Scenic Parser & Compiler\nScenic Grammar\nScenic Modules\nscenic.core\nscenic.domains\nscenic.domains.driving\nscenic.formats\nscenic.simulators\nscenic.syntax\n\nLIBRARIES AND SIMULATORS\n\nScenic Libraries\nSupported Simulators\nInterfacing to New Simulators\n\nGENERAL INFORMATION\n\nPublications Using Scenic\nCredits\n Scenic Internals scenic.domains scenic.domains.driving scenic.domains.driving.actions\n Edit on GitHub\nscenic.domains.driving.actions\uf0c1\n\nActions for dynamic agents in the driving domain.\n\nThese actions are automatically imported when using the driving domain.\n\nThe RegulatedCo

In [28]:
# Template used to render each retrieved few-shot example
examples_prompt = PromptTemplate(
    input_variables=[
        'docstring', 'map_and_model', 'constants', 'monitors', 'behaviors', 'spatial_relations', 'scenario'
    ],
    template="#Scenario description#\n{docstring}\n\n##Map and Model##\n{map_and_model}\n##Constants##\n{constants}\n##Moniters##\n{monitors}\n##Defining Agent Behaviors##\n{behaviors}\n##Spatial Relations##\n{spatial_relations}\n##Scenario Specifications##\n{scenario}\n"
)

# Template used to render each retrieved reference document
doc_prompt = PromptTemplate(
    input_variables=['title', 'content'],
    template="You can reference the instructions in the following documentation: \n{title} \n{content}"
)

# Similarity-based selectors: 5 scenario examples and 2 reference documents
example_selector = SemanticSimilarityExampleSelector(vectorstore=example_vs, k=5)
doc_selector = SemanticSimilarityExampleSelector(vectorstore=docs_vs, k=2)
selected_reference = doc_selector.select_examples({"docstring":"syntax"})
print(selected_reference)

# Prefix = system message followed by the rendered reference documents
doc_str = system_message_str + "\n"
for reference in selected_reference:
    doc_str += doc_prompt.format(title=reference["title"], content=reference["content"])
    doc_str += "\n"

# FewShotPromptTemplate joins prefix, examples and suffix into ONE template and then
# formats it with the input variables. Any literal "{" or "}" coming from the
# documentation text would be read as a template variable and make formatting fail,
# so the braces of the already-rendered prefix are escaped. Text without braces is
# left unchanged by this step.
few_shot_prefix = doc_str.replace("{", "{{").replace("}", "}}")

few_shot_prompt = FewShotPromptTemplate(
    example_prompt=examples_prompt,
    example_selector=example_selector,
    prefix=few_shot_prefix,
    suffix="Generate scenarios based on following description: {docstring}",
    input_variables=["docstring"],
)

# Final prompt message is composed of system message, example message and input message
final_prompt = few_shot_prompt
print(final_prompt)


llama_print_timings:        load time =     116.89 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =      60.93 ms /     2 tokens (   30.46 ms per token,    32.83 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =      61.29 ms /     3 tokens


[{'content': "latest\n\nINTRODUCTION\n\nGetting Started with Scenic\nNotes on Installing Scenic\nWhat’s New in Scenic\n\nTUTORIALS\n\nScenic Fundamentals\nDynamic Scenarios\nComposing Scenarios\n\nLANGUAGE AND TOOL REFERENCE\n\nSyntax Guide\nLanguage Reference\nCommand-Line Options\nUsing Scenic Programmatically\nDeveloping Scenic\nScenic Internals\nHow Scenic is Compiled\nGuide to the Scenic Parser & Compiler\nScenic Grammar\nScenic Modules\nscenic.core\nscenic.domains\nscenic.domains.driving\nscenic.formats\nscenic.simulators\nscenic.syntax\n\nLIBRARIES AND SIMULATORS\n\nScenic Libraries\nSupported Simulators\nInterfacing to New Simulators\n\nGENERAL INFORMATION\n\nPublications Using Scenic\nCredits\n Scenic Internals scenic.domains scenic.domains.driving scenic.domains.driving.actions\n Edit on GitHub\nscenic.domains.driving.actions\uf0c1\n\nActions for dynamic agents in the driving domain.\n\nThese actions are automatically imported when using the driving domain.\n\nThe RegulatedCo

In [74]:
# Combine the chat model with the few-shot prompt and run it on the scenario description
chain = LLMChain(prompt=final_prompt, llm=model)
result = chain.run(input_message_str)


llama_print_timings:        load time =     116.89 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     134.28 ms /    25 tokens (    5.37 ms per token,   186.18 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =     140.16 ms /    26 tokens


In [75]:

# Show the text returned by the chain, before any post-processing.
print(result)

 ```python
#Scenario description#
""" Scenario Description
Ego car turning left into another road in an intersection. Only one car in this scenario.
"""

##Map and Model##

param map = localPath('../../../assets/maps/CARLA/Town01.xodr')  # or other CARLA map that definitely works
param carla_map = 'Town01'
model scenic.simulators.carla.model


##Constants##

EGO_MODEL = "vehicle.lincoln.mkz_2017"
EGO_SPEED = 5


##Moniters##

##Defining Agent Behaviors##

behavior EgoBehavior():
    try:
        do FollowLaneBehavior(target_speed=EGO_SPEED)
    interrupt when atIntersection():
        take TurnLeftAction()


##Spatial Relations##

intersection = Uniform(*filter(lambda i: i.is4Way or i.is3Way, network.intersections))

egoManeuver = Uniform(*filter(lambda m: m.type is ManeuverType.LEFT_TURN, intersection.maneuvers))
egoInitLane = egoManeuver.startLane
egoSpawnPt = new OrientedPoint in egoInitLane.centerline


##Scenario Specifications##

ego = new Car at egoSpawnPt,
    with blueprint EG

In [31]:
def output_to_string(output: Scenic_output):
    """Render a structured Scenic_output as one Scenic source string.

    The seven sections are emitted in a fixed order, each under its
    ``##...##`` heading and followed by a newline.

    Args:
        output: object exposing map_and_model, constants, monitors, behaviors,
            spatial_relations, scenario and background attributes.

    Returns:
        The assembled text.
    """
    section_values = (
        output.map_and_model,
        output.constants,
        output.monitors,
        output.behaviors,
        output.spatial_relations,
        output.scenario,
        output.background,
    )
    layout = '##Map and Model##\n%s\n##Constants##\n%s\n##Moniters##\n%s\n##Defining Agent Behaviors##\n%s\n##Spatial Relations##\n%s\n##Scenario Specifications##\n%s\n##Background Activities##\n%s\n'
    return layout % section_values

import traceback
from scenic.core.errors import ScenicSyntaxError
def compile_scenic(scenic_str):
    """Try to compile a Scenic program and sample one scene from it.

    Args:
        scenic_str: Scenic source code.

    Returns:
        One of three things, depending on the outcome:
        * ``(scene, numIters)`` when compilation and sampling succeed;
        * the reply of ``model.invoke`` when a ScenicSyntaxError is raised
          (the error text followed by the source is printed and sent to the LLM);
        * the exception object itself for any other exception (it is returned,
          not raised).
    """
    try:
        random.seed()  # re-seed from a fresh entropy source before sampling
        compiled = scenic.scenarioFromString(scenic_str, model='scenic.simulators.carla', mode2D=True)
        sampled_scene, iterations = compiled.generate(maxIterations=1000)
        return sampled_scene, iterations
    except ScenicSyntaxError as syntax_error:
        repair_query = str(syntax_error) + scenic_str
        print(repair_query)
        return model.invoke(repair_query)
    except Exception as other_error:
        return other_error


In [41]:
import re

# CARLA towns a generated scenario may be pinned to
maps = [
    "Town01",
    "Town02",
    "Town03",
    "Town04",
    "Town05",
    "Town10HD",
]

# Car blueprints a `MODEL = ...` constant may be replaced with
carModels = [
    "vehicle.audi.a2",
    "vehicle.audi.etron",
    "vehicle.audi.tt",
    "vehicle.bmw.grandtourer",
    "vehicle.chevrolet.impala",
    "vehicle.citroen.c3",
    "vehicle.dodge.charger_police",
    "vehicle.jeep.wrangler_rubicon",
    "vehicle.lincoln.mkz_2017",
    "vehicle.mercedes.coupe",
    "vehicle.mini.cooper_s",
    "vehicle.ford.mustang",
    "vehicle.nissan.micra",
    "vehicle.nissan.patrol",
    "vehicle.seat.leon",
    "vehicle.tesla.model3",
    "vehicle.toyota.prius",
    "vehicle.volkswagen.t2",
]

def modify_output(scenic_code_str):
    """Post-process LLM-generated Scenic code so that it targets a local CARLA setup.

    Steps, in order:
      1. every line containing ``localPath`` (and ending in a newline) becomes a
         ``param map`` line pointing at a randomly chosen town from ``maps``;
      2. every line containing ``carla_map`` (and ending in a newline) becomes a
         ``param carla_map`` line naming the same town;
      3. ``model scenic.<anything>`` is rewritten to the CARLA world model;
      4. a line starting with ``MODEL = `` gets a random blueprint from
         ``carModels`` (prefixed names such as ``EGO_MODEL`` and pedestrian models
         are not touched);
      5. Markdown code fences are removed.

    The closing fence is removed whether or not it is followed by a newline. The
    last line is removed ONLY when it is a fence: the previous implementation
    dropped the last line unconditionally, which deleted a real line of code
    whenever the reply ended with a newline after the fence or had no closing
    fence at all.

    Args:
        scenic_code_str: raw text returned by the model.

    Returns:
        The cleaned Scenic source.
    """
    town_str = random.choice(maps)
    car_model = random.choice(carModels)

    map_line = "param map = localPath('../../assets/maps/CARLA/" + town_str + ".xodr')\n"
    carla_map_line = "param carla_map = '" + town_str + "'\n"

    output_str = scenic_code_str
    # Callable replacements insert the text verbatim (no escape processing by re.sub).
    # The map file path is relative and must exist inside the Scenic checkout.
    output_str = re.sub(r'^.*localPath.*\n', lambda _m: map_line, output_str, flags=re.MULTILINE)
    output_str = re.sub(r'^.*carla_map.*\n', lambda _m: carla_map_line, output_str, flags=re.MULTILINE)
    # use carla as simulator
    output_str = re.sub(r'^(\s*)model scenic.[\S]*', r'\1model scenic.simulators.carla.model',
                        output_str, flags=re.MULTILINE)
    # modify the model name (pedestrian not included!!!)
    output_str = re.sub(r'^(\s*)MODEL = [\S]*',
                        lambda m: m.group(1) + "MODEL = '" + car_model + "'",
                        output_str, flags=re.MULTILINE)
    # Drop a closing fence at the very end, together with the newline before it and
    # any trailing whitespace after it.
    output_str = re.sub(r'\n[^\n]*```[^\n]*\s*\Z', '', output_str)
    # Drop the remaining fence lines (e.g. the opening ```python line).
    output_str = re.sub(r'^.*```.*\n', '', output_str, flags=re.MULTILINE)
    return output_str

In [76]:
# Post-process the chain output with modify_output and show the resulting Scenic code.
modified_result = modify_output(result)
print(modified_result)

#Scenario description#
""" Scenario Description
Ego car turning left into another road in an intersection. Only one car in this scenario.
"""

##Map and Model##

param map = localPath('../../assets/maps/CARLA/Town07.xodr')
param carla_map = 'Town07'
model scenic.simulators.carla.model


##Constants##

EGO_MODEL = "vehicle.lincoln.mkz_2017"
EGO_SPEED = 5


##Moniters##

##Defining Agent Behaviors##

behavior EgoBehavior():
    try:
        do FollowLaneBehavior(target_speed=EGO_SPEED)
    interrupt when atIntersection():
        take TurnLeftAction()


##Spatial Relations##

intersection = Uniform(*filter(lambda i: i.is4Way or i.is3Way, network.intersections))

egoManeuver = Uniform(*filter(lambda m: m.type is ManeuverType.LEFT_TURN, intersection.maneuvers))
egoInitLane = egoManeuver.startLane
egoSpawnPt = new OrientedPoint in egoInitLane.centerline


##Scenario Specifications##

ego = new Car at egoSpawnPt,
    with blueprint EGO_MODEL,
    with behavior EgoBehavior()

require (distanc

In [77]:
# Index of the result file to create
i = 9
output_file_path = ''.join(['../results/single_actor/sa_result', str(i), '.scenic'])

try:
    # Mode 'x' creates the file and fails if it already exists,
    # so an earlier result is never overwritten.
    with open(output_file_path, 'x') as result_file:
        result_file.write(modified_result)
except FileExistsError:
    print(f"File {output_file_path} already exists")