In [None]:
%%capture
%pip install dspy-ai
%pip install pinecone-client

In [35]:
# `os` must be imported in this cell: in notebook order it runs before any
# other cell that imports it, so a fresh kernel would otherwise hit NameError.
import os

import openai

# Credentials are read from the environment; both are None when unset.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY

In [46]:
import os
from pinecone import Index
import pinecone

# Width of the vectors stored in the index (the embedding cell prints 768).
DIMENSION = 768

def pinecone_init(
    index_name : str = 'scenic',
    verbose : bool = False,
    environment : str = 'gcp-starter',
    dimension : int = DIMENSION,
    metric : str = 'dotproduct',
) -> Index:
    """Connect to Pinecone and return a handle to ``index_name``, creating it if needed.

    Args:
        index_name: Name of the index to open (created when not already listed).
        verbose: When True, print the index statistics after connecting.
        environment: Pinecone environment passed to ``pinecone.init``.
        dimension: Vector dimension used only when the index has to be created.
        metric: Similarity metric used only when the index has to be created.

    Returns:
        The ``pinecone.Index`` handle for ``index_name``.
    """
    # Environment variable wins; the module-level PINECONE_API_KEY is the fallback.
    api_key = os.getenv('PINECONE_API_KEY', PINECONE_API_KEY)
    pinecone.init(
        api_key=api_key,
        environment=environment,
    )

    # Only create the index on first use so existing vectors are kept.
    if index_name not in pinecone.list_indexes():
        pinecone.create_index(
            name=index_name,
            metric=metric,
            dimension=dimension,
        )

    index = pinecone.Index(index_name)
    if verbose:
        print(f"Index statistics: \n{index.describe_index_stats()}")
    return index

# Open (or create) the 'scenic' index once and print its statistics; later
# cells read this module-level handle.
PINECONE_INDEX = pinecone_init(index_name='scenic', verbose=True)

Index statistics: 
{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [28]:
# 1. Collect all scenic programs in a list
# 2. Collect all scenic documents (which are rst files) in a list
#   We will chunk the documents by the rst heading separator which is a line of dashes

import os
import re

def get_scenic_programs(
    scenic_dir : str = '../../Scenic',
) -> list:
    """Recursively collect the paths of all ``.scenic`` files under ``scenic_dir``.

    Args:
        scenic_dir: Root directory to search.

    Returns:
        Sorted list of file paths (each joined onto the directory it was found
        in). Empty when nothing matches or the directory does not exist.
    """
    programs = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(scenic_dir)
        for file in files
        if file.endswith('.scenic')
    ]
    # os.walk yields entries in arbitrary filesystem order; sorting keeps the
    # list (and any ids derived from list positions) stable across runs.
    return sorted(programs)

def get_scenic_documents(
    scenic_dir : str = '../../Scenic',
) -> list:
    """Recursively collect the paths of all ``.rst`` files under ``scenic_dir``.

    Args:
        scenic_dir: Root directory to search.

    Returns:
        Sorted list of file paths (each joined onto the directory it was found
        in). Empty when nothing matches or the directory does not exist.
    """
    documents = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(scenic_dir)
        for file in files
        if file.endswith('.rst')
    ]
    # os.walk yields entries in arbitrary filesystem order; sorting keeps the
    # list (and any ids derived from list positions) stable across runs.
    return sorted(documents)

def chunk_rst_file(
    rst_file : str,
) -> list:
    """Split one reStructuredText file into chunks at dash separator lines.

    A separator is a line consisting solely of ten or more ``-`` characters.
    Only that dash line is removed; the text on the line before it stays at the
    end of the preceding chunk.

    Args:
        rst_file: Path of the ``.rst`` file to read.

    Returns:
        The chunks in file order, with whitespace-only chunks dropped (they
        carry no content worth embedding). Remaining chunks are not stripped.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(rst_file, 'r', encoding='utf-8') as f:
        rst = f.read()
    chunks = re.split(r'^-{10,}$', rst, flags=re.MULTILINE)
    return [chunk for chunk in chunks if chunk.strip()]

def get_and_chunk_scenic_documents(
    scenic_dir : str = '../../Scenic',
) -> list:
    """Find every ``.rst`` file under ``scenic_dir`` and return all of their chunks.

    Files are located with ``get_scenic_documents`` and each one is split with
    ``chunk_rst_file``; the per-file chunk lists are concatenated in that order
    into a single flat list.
    """
    return [
        chunk
        for rst_path in get_scenic_documents(scenic_dir)
        for chunk in chunk_rst_file(rst_path)
    ]

In [24]:
from typing import List
from transformers import AutoTokenizer, AutoModel
import numpy as np
import torch

# Pick the compute device in order of preference: first CUDA GPU, then Apple
# MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Mean pooling: average the token embeddings of each sequence, counting only
# the positions that the attention mask marks as real tokens.
def mean_pooling(
    model_output,
    attention_mask,
):
    """Return the attention-mask-weighted mean of the token embeddings.

    Args:
        model_output: Indexable model output whose element 0 holds the token
            embeddings.
        attention_mask: Mask that is expanded along a new last axis to the
            token-embedding shape; zero entries are excluded from the mean.

    Returns:
        Sum of the masked token embeddings over axis 1 divided by the mask
        count, with the count clamped to at least 1e-9 to avoid dividing by zero.
    """
    token_embeddings = model_output[0]  # first element holds all token embeddings
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = torch.sum(token_embeddings * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / token_counts

# Load the tokenizer and encoder for the sentence-embedding checkpoint and move
# the encoder onto the selected device. get_embedding() reads `tokenizer` and
# `model` as module-level globals.
model_name = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

def get_embedding(
    text : str,
) -> List[float]:
    """Embed ``text`` with the module-level tokenizer/model and return one vector.

    The text is tokenized (padded, truncated), run through the model without
    gradient tracking, mean-pooled over tokens and L2-normalized.

    Returns:
        The first row of the normalized embedding batch as a list of floats.
    """
    batch = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        output = model(**batch.to(device))

    pooled = mean_pooling(output, batch['attention_mask'].to(device))
    unit_vectors = torch.nn.functional.normalize(pooled, p=2, dim=1)
    # Only the first row is returned, even if the batch holds several rows.
    return unit_vectors.cpu().numpy().tolist()[0]

# Sanity check: print the embedding length for a couple of sample sentences.
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "The five boxing wizards jump quickly.",
]

for sentence in sentences:
    embedding = get_embedding(sentence)
    print(len(embedding))
    

Using device: cuda:0
768
768


In [47]:
import time
from tqdm.auto import trange

def pinecone_upsert(
    docs : list,
    index : Index,
    start : int = 0,
    batch_size : int = 100,
) -> None:
    """Embed ``docs`` and upsert them into ``index`` in bounded batches.

    Each document is embedded with ``get_embedding`` and stored under the id
    ``str(start + position)`` with metadata ``{'text': doc}``.

    Args:
        docs: Texts to embed and store.
        index: Pinecone index handle to upsert into.
        start: Offset added to each document's position to form its id.
        batch_size: Maximum number of vectors sent per ``index.upsert`` call.
            NOTE(review): Pinecone limits the size of one upsert request;
            confirm the default against the current service limits.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    vectors = []
    for i in trange(len(docs)):
        doc = docs[i]
        vectors.append((str(i + start), get_embedding(doc), {'text' : doc}))

        # Flush as soon as a batch is full so no single request grows unbounded.
        if len(vectors) >= batch_size:
            index.upsert(vectors)
            vectors = []

    # Send the final partial batch; nothing is sent for empty input.
    if vectors:
        index.upsert(vectors)

# Index the documentation chunks first; their ids are 0 .. len(scenic_docs) - 1.
scenic_docs = get_and_chunk_scenic_documents()
pinecone_upsert(
    docs=scenic_docs,
    index=PINECONE_INDEX,
)

# get_scenic_programs() returns file *paths*. Read each file so that the
# program source, not its path, is what gets embedded and stored as 'text'.
# NOTE(review): Pinecone caps per-vector metadata size; confirm that the
# largest program fits.
scenic_programs = get_scenic_programs()
scenic_program_texts = []
for program_path in scenic_programs:
    with open(program_path, 'r') as f:
        scenic_program_texts.append(f.read())

pinecone_upsert(
    docs=scenic_program_texts,
    index=PINECONE_INDEX,
    # Offset kept as in the original; it leaves id len(scenic_docs) unused.
    start=len(scenic_docs) + 1
)



100%|██████████| 187/187 [00:01<00:00, 100.44it/s]
100%|██████████| 193/193 [00:01<00:00, 165.54it/s]


In [77]:
from typing import Optional, Union
import dspy
from dsp.utils import dotdict

# PINECONE_INDEX is created by the earlier `pinecone_init(...)` cell and is
# reused here as-is. The former `PINECONE_INDEX = index` read a module-level
# name `index` that no cell defines, which raises NameError on a fresh kernel.

class PineconeRM(dspy.Retrieve):
    """
    Retrieve model backed by a Pinecone index.

    Queries are embedded with the notebook's ``get_embedding`` and matched
    against the index; each match's ``metadata['text']`` becomes a passage.

    Example:
        self.retrieve = PineconeRM(PINECONE_INDEX, k=num_passages)
    """
    def __init__(self, index, k=3):
        """Store the Pinecone ``index`` handle and the default number of matches ``k``."""
        super().__init__(k=k)
        self.index = index

    def forward(self, query_or_queries : Union[str, List[str]], k: Optional[int] = None) -> List[dotdict]:
        """Return the top matches for one query or for each query in a list.

        Args:
            query_or_queries: A single query string or a list of query strings.
            k: Matches to fetch per query; falls back to ``self.k`` when falsy.

        Returns:
            A list of ``dotdict`` objects with a ``long_text`` field, in match
            order, with the results of successive queries concatenated.
        """
        if isinstance(query_or_queries, str):
            queries = [query_or_queries]
        else:
            queries = list(query_or_queries)
        k = k if k else self.k

        passages = []
        # get_embedding returns a single vector per call, so embed and query
        # one string at a time; passing the whole list would silently keep
        # only the first query.
        for query in queries:
            query_embedding = get_embedding(query)
            results_dict = self.index.query(query_embedding, top_k=k, include_metadata=True)
            passages.extend(
                dotdict({"long_text": result['metadata']['text']})
                for result in results_dict['matches']
            )
        return passages

# Language model and retriever used by every DSPy module below.
turbo = dspy.OpenAI(model='gpt-3.5-turbo-1106')
pinecone_scenic_rm = PineconeRM(PINECONE_INDEX, k=7)

# NOTE(review): the argument-less configure() call looks redundant next to the
# call that follows it — confirm before removing.
dspy.settings.configure()
dspy.settings.configure(lm=turbo, rm=pinecone_scenic_rm)

In [56]:
# We will make our own question and answer dataset pairs
# we read in the existing programs from the outputs/predict_few_shot folder
# Anything before "param map" is the question, anything after including "param map" is the answer

import os
from typing import List

# Upper bound on the number of files build_dataset() reads from the folder.
num_samples = 200

class Example:
    """Plain question/answer pair with a readable text rendering."""

    def __init__(self, question : str, answer : str):
        self.question, self.answer = question, answer

    def __str__(self):
        # Question and answer each start on their own line under a label.
        return 'Question: \n{}\nAnswer: \n{}'.format(self.question, self.answer)

    def __repr__(self):
        # The repr is deliberately the same text as str().
        return str(self)

def build_dataset(path : str, num_samples : int) -> List[dspy.Example]:
    """Build question/answer examples from the program files in ``path``.

    Each file is split at the first occurrence of ``'param map'``: the text
    before it (with every ``#`` removed) is the question, and the marker plus
    everything after it is the answer.

    Args:
        path: Directory containing the program files.
        num_samples: Maximum number of directory entries to consider, taken
            from the sorted listing.

    Returns:
        A list of ``dspy.Example`` objects with ``question`` marked as the
        input. Entries that are not regular files, or that lack the marker,
        are skipped, so the list may be shorter than ``num_samples``.
    """
    separator = 'param map'
    dataset = []
    # os.listdir order is arbitrary; sort so the same files are picked each run.
    for file in sorted(os.listdir(path))[:num_samples]:
        file_path = os.path.join(path, file)
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r') as f:
            program = f.read()

        # partition() splits at the first marker only, so a later 'param map'
        # in the program no longer truncates the answer, and a missing marker
        # is detectable instead of raising IndexError.
        description, found, remainder = program.partition(separator)
        if not found:
            continue

        question = description.replace('#', '')
        answer = separator + remainder

        dspy_example = dspy.Example(question=question, answer=answer)
        dataset.append(dspy_example.with_inputs('question'))
    return dataset

# Build the dataset from the few-shot prediction outputs and show its first
# example together with its input and label keys.
dataset = build_dataset('../outputs/predict_few_shot', num_samples)
print("An example from the dataset:")
train_example = dataset[0]
print(train_example)
print(f"For this dataset, training examples have input keys {train_example.inputs().keys()} and label keys {train_example.labels().keys()}")

# We can split the dataset into train and test sets 80/20
import random

# Shuffle with a dedicated, seeded generator so the same dataset order always
# produces the same train/test split (and the global RNG state is untouched).
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(dataset)
split_percentage = 0.8
split_index = int(len(dataset) * split_percentage)
train_data = dataset[:split_index]
test_data = dataset[split_index:]

An example from the dataset:
Example({'question': '  A Cruise autonomous vehicle (“AV”), operating in driverless autonomous mode, was traveling northbound on Pennsylvania Avenue\n between 23rd Street and 25th Street when the AV approached a double-parked semi-trailer truck in the AV’s lane of travel. The AV began\n to maneuver around and to the left of the semi-trailer truck and came to a stop for a stop sign at the intersection with 23rd Street. At the\n same time, the semi-trailer truck proceeded forward and, shortly thereafter, made contact with the AV. This caused damage to the front\n passenger side bumper fascia of the AV. The parties exchanged information and there were no reported injuries.\n\n', 'answer': 'param map = localPath(\'../../../assets/maps/CARLA/Town01.xodr\')\nparam carla_map = \'Town01\'\nmodel scenic.simulators.carla.model\n\nAV_MODEL = "vehicle.lincoln.mkz_2017"\nTRUCK_MODEL = "vehicle.volvo.xc90"\nSPEED = 10\nAV_BRAKING_THRESHOLD = 5\nCONTACT_THRESHOLD = 2\n\nb

In [57]:
# Signature for the retrieval-free predictor: one input (the crash report) and
# five output fields that together describe a Scenic program.
# NOTE: a later cell defines another class named `GenScenic`; once that cell
# runs, this definition is shadowed.
# NOTE(review): DSPy appears to use the class docstring as the instruction text
# sent to the LM, so it is left untouched here — confirm before editing it.
class GenScenic(dspy.Signature):
    """
    Write Scenic probablistic programs given a natural language description 
    of an autonomous vehicle crash report.
    """
    question = dspy.InputField()  # natural-language crash report
    param_definitions = dspy.OutputField(desc='Necessary imports and parameter definitions')
    scene_setup = dspy.OutputField(desc='Objects that should be placed in the scene')
    behaviors = dspy.OutputField(desc='Dynamics of the objects in the scene')
    placement = dspy.OutputField(desc='Placement of the dynamic objects in the scene')
    post_conditions = dspy.OutputField(desc='Requires and Terminates clauses')

In [37]:
# Baseline: a plain predictor over GenScenic with no retrieval and no compiled
# demonstrations, run on the first test example.
generate_ans = dspy.Predict(GenScenic)

pred = generate_ans(question=test_data[0].question)

print(f"Question:\n {test_data[0].question}")
print(f"Predicted Answer:\n {pred}")

# Show the most recent LM call recorded for `turbo`.
turbo.inspect_history(n=1)

Question:
   SCENARIO DESCRIPTION
 A Cruise autonomous vehicle ("Cruise AV"), operating in autonomous mode was turning from Southbound on Folsom Street to Westbound
 on 15th Street. When the Cruise AV stopped to yield for an oncoming vehicle, a Chevrolet Silverado that was traveling behind made contact
 with the rear bumper of the Cruise AV. This caused damage to the rear bumper of the Cruise AV. At the time of the incident, the driver of
 the Cruise AV reported a minor injury, but emergency services were not called.


Predicted Answer:
 Prediction(
    param_definitions='Necessary imports and parameter definitions:\n- `Cruise AV`: The autonomous vehicle operated by Cruise.\n- `Chevrolet Silverado`: The vehicle that made contact with the Cruise AV.\n- `Southbound on Folsom Street`: The direction in which the Cruise AV was traveling before turning.\n- `Westbound on 15th Street`: The direction in which the Cruise AV was turning.\n- `rear bumper of the Cruise AV`: The specific part of the

In [58]:
# Signature for the retrieval-augmented pipeline: retrieved context plus the
# crash report in, a single Scenic program out.
# NOTE: this redefines `GenScenic` from an earlier cell; every later use of the
# name refers to this version.
# NOTE(review): DSPy appears to use the class docstring as the instruction text
# sent to the LM, so it is left untouched here — confirm before editing it.
class GenScenic(dspy.Signature):
    """
    Write Scenic probablistic programs given a natural language description 
    of an autonomous vehicle crash report.
    """
    context = dspy.InputField(desc="may contain relevant information on how to construct a scenic program")
    question = dspy.InputField()  # natural-language crash report
    answer = dspy.OutputField(desc="Scenic program")

class RAG(dspy.Module):
    """Retrieve-then-generate module: fetch passages, then write a Scenic program."""

    def __init__(self, num_passages : int = 3):
        """Create the retriever (``num_passages`` results) and the generator."""
        super().__init__()

        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenScenic)

    def forward(self, question : str) -> dspy.Prediction:
        """Answer ``question`` using retrieved passages as context.

        Returns:
            A ``dspy.Prediction`` with ``context`` (the retrieved passages) and
            ``answer`` (the generated program).
        """
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)

In [93]:
from dspy.teleprompt import BootstrapFewShot, BootstrapFinetune
from dspy.evaluate import answer_exact_match, auto_evaluation, answer_passage_match

# Metric used by the teleprompters: exact match between the predicted and the
# reference answer.
def validate_context_and_answer(example, prediction, trace=None):
    """Return the result of ``answer_exact_match(example, prediction)``.

    ``trace`` is accepted but not used.
    NOTE(review): despite the name, the retrieved context is not inspected and
    the program is not checked for validity — only the answer is compared.
    """
    is_exact_match = answer_exact_match(example, prediction)
    return is_exact_match

few_show_teleprompt = BootstrapFewShot(metric=validate_context_and_answer)

# Compile the RAG program on the training split. This line had been commented
# out, which left `compiled_few_shot` undefined on a fresh kernel.
compiled_few_shot = few_show_teleprompt.compile(RAG(), trainset=train_data)

# Preview the compiled program on the first test example.
pred = compiled_few_shot(test_data[0].question)
print(f"Question:\n {test_data[0].question}")
print(f"Predicted Answer using Few Shot and RAG:\n {pred}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")

# Folder that receives one text file per test example for the few-shot run;
# created if it does not exist yet.
output_dir = '../outputs/dspy_rag_few_shot'
os.makedirs(output_dir, exist_ok=True)

def clean_str(s : str) -> str:
    """Turn literal backslash-n and backslash-t sequences into real newlines and tabs."""
    # Newlines are handled first, then tabs.
    for escaped, actual in (('\\n', '\n'), ('\\t', '\t')):
        s = s.replace(escaped, actual)
    return s

# Run the compiled few-shot program on every test example and save the
# question, the predicted program and the truncated contexts to example_<i>.txt.
for i, example in enumerate(test_data):
    pred = compiled_few_shot(example.question)
    with open(os.path.join(output_dir, f'example_{i}.txt'), 'w') as f:
        truncated_contexts = [clean_str(c[:200]) + '...' for c in pred.context]
        f.write(
            f"Question:\n {clean_str(example.question)}\n"
            f"Predicted Answer using Few Shot and RAG:\n {clean_str(pred.answer)}\n"
            f"Retrieved Contexts (truncated): {truncated_contexts}\n"
        )

# Show the first demonstration attached to each predictor of the compiled
# program. A predictor can end up with no demos, so guard the index access
# instead of letting demos[0] raise IndexError.
for name, parameter in compiled_few_shot.named_predictors():
    print(name)
    if parameter.demos:
        print(parameter.demos[0])
    else:
        print("(no demos)")
    print()

Question:
   A Cruise autonomous vehicle (“Cruise AV”), operating in autonomous mode, was traveling on southeast bound 10th Street between Market
 and Jessie Streets when another vehicle changing into the same lane made contact with the left rear corner of the Cruise AV, damaging the
 Cruise AV’s left rear bumper and wheel well. There were no injuries and police were not called.


Predicted Answer using Few Shot and RAG:
 Prediction(
    answer="param map = localPath('../../../assets/maps/CARLA/Town01.xodr') param carla_map = 'Town01' model scenic.simulators.carla.model EGO_SPEED = 10 LANE_CHANGE_SPEED = 8 OTHER_VEHICLE_SPEED = 12 behavior CruiseAVBehavior(): try: do FollowLaneBehavior(EGO_SPEED) interrupt when withinDistanceToAnyObjs(self, 5): do LaneChangeBehavior(target_speed=LANE_CHANGE_SPEED) behavior OtherVehicleBehavior"
)
generate_answer
Example({'question': '  On April 12, 2023 at 5:29 PM PST a Waymo Autonomous Vehicle (“Waymo AV”) operating in Santa Monica, California was in 

In [85]:
# Rewrite every file in output_dir in place, replacing literal backslash-n
# sequences with real newlines.
for file in os.listdir(output_dir):
    file_path = os.path.join(output_dir, file)
    with open(file_path, 'r') as f:
        text = f.read()
    unescaped = text.replace('\\n', '\n')
    with open(file_path, 'w') as f:
        f.write(unescaped)

In [94]:
# Same pipeline as the few-shot cell, but compiled with BootstrapFinetune.
# NOTE: the output recorded for this cell ends in KeyboardInterrupt, so the
# compile step was not run to completion in the saved notebook.
finetune_teleprompt = BootstrapFinetune(metric=validate_context_and_answer)

compiled_finetune = finetune_teleprompt.compile(RAG(), trainset=train_data)

# Preview the compiled program on the first test example.
pred = compiled_finetune(test_data[0].question)
print(f"Question:\n {test_data[0].question}")
print(f"Predicted Answer using Finetune and RAG:\n {pred}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")

# Folder for the finetune run; note that this rebinds the module-level
# `output_dir` used by earlier cells.
output_dir = '../outputs/dspy_rag_finetune'
os.makedirs(output_dir, exist_ok=True)

# Run the finetuned program on every test example and save the question, the
# predicted program and the truncated contexts to example_<i>.txt.
for i, example in enumerate(test_data):
    pred = compiled_finetune(example.question)
    with open(os.path.join(output_dir, f'example_{i}.txt'), 'w') as f:
        truncated_contexts = [clean_str(c[:200]) + '...' for c in pred.context]
        f.write(
            f"Question:\n {clean_str(example.question)}\n"
            f"Predicted Answer using Few Shot and Finetuning:\n {clean_str(pred.answer)}\n"
            f"Retrieved Contexts (truncated): {truncated_contexts}\n"
        )

# Show the first demonstration attached to each predictor of the finetuned
# program. A predictor can end up with no demos, so guard the index access
# instead of letting demos[0] raise IndexError.
for name, parameter in compiled_finetune.named_predictors():
    print(name)
    if parameter.demos:
        print(parameter.demos[0])
    else:
        print("(no demos)")
    print()



  1%|▏         | 2/158 [00:06<08:54,  3.42s/it]


KeyboardInterrupt: 

In [None]:
# Chain-of-thought predictor over GenScenic, run on the first test example.
# NOTE(review): the GenScenic in scope at this point declares a `context`
# input, but only `question` is passed here — confirm this is intended.
generate_ans_with_chain_of_thought = dspy.ChainOfThought(GenScenic)

pred = generate_ans_with_chain_of_thought(question=test_data[0].question)

# Drop the rationale's first sentence (everything up to the first period).
# If there is no period, show the whole rationale rather than raising
# IndexError as split('.', 1)[1] would.
_first_sentence, _period, _rest = pred.rationale.partition('.')
thought = _rest.strip() if _period else pred.rationale.strip()

print(f"Question:\n {test_data[0].question}")
print(f"Thought: {thought}")
print(f"Predicted Answer:\n {pred}")