In [None]:
import os
import openai
from tqdm import tqdm
import pandas as pd
import numpy as np

from IPython import display

cwd = os.getcwd()

## Main GPT Functions

In [None]:
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
openai.api_key = OPENAI_API_KEY

def run_gpt(messages, this_model="gpt-3.5-turbo"):
    response = openai.ChatCompletion.create(
        model=this_model,
        messages=messages
    )
    return response


def gpt_oneshot(input_prompt, directive="You are a helpful assistant.", verbose=False):
    message_hist = [{"role": "system", "content": directive},  # add directed
                    {"role": "user", "content": input_prompt}]  # init
    response = run_gpt(message_hist)["choices"][0]["message"]["content"]
    if verbose:
        print("chat_gpt: ", response, '\n')
#     message_hist.append({"role": "system", "content": response})
    return response

In [None]:
# https://platform.openai.com/docs/guides/vision
def gpt_image_oneshot(image_link, new_prompt="Please respond with a question for which the answer is the code snippet on this page.",
                        verbose=False):
    # "This image contains a block of Verilog code and text relating to it. "+...
    new_content = [{"type": "text", "text": "This image contains some information about Verilog. "+new_prompt},
                    {"type": "image_url", "image_url": {"url": image_link}}]
    message_hist = [{"role": "system", "content": "You are a helpful assistant that gives information on images of code."},
                    {"role": "user", "content": new_content}]  # init
    if verbose:
        print("Asked:", new_content[0]["text"])
#         print("With Image:", new_content[1]["image_url"]["url"])
    response = run_gpt(message_hist, "gpt-4-vision-preview")["choices"][0]["message"]["content"]
    if verbose: print("\nResponded With:", response)
    return response
# TODO account for timout issue
# TODO account for copyright

# test_image1 = cwd+"/testpage.jpg"
# base64_image = encode_image(image_path)
# my_link3 = f"data:image/jpeg;base64,{base64_image}"
# rep = gpt_image_oneshot(my_link3, "Please respond with a question for which the answer is the code snippet on this page.", True)

# print()
# for code_snip in get_code(rep):
#     print(code_snip)
#     print()

# display.Image(file_dir)

### Cleaning Functions

In [None]:
def get_failed_request(df, identifier="I'm sorry", setna=True, verbose=True):
    inds = set()
    messages = set()
    failed_count = 0
    for i, row in df.iterrows():
        for col in df.columns:
            if identifier in str(row[col]):
                inds.add(i)
                messages.add(row[col])
                failed_count += 1
                if setna:
                    df.loc[i, col] = np.nan
    if verbose:
        copy_count = sum([1 for mes in messages if "copyright" in mes])
        print("found", len(inds), "rows with bad requests and a total of", failed_count, "cells of bad calls")
        print(len(messages), "of which where unique and", copy_count, "of which were identified as copyright related")
        if len(messages) > 0:
            print("\nan example of this is:\n", list(messages)[0])
    return df, failed_count, messages, inds

### Helpers

In [None]:
# helpers
import base64

# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def get_code(text):
    if text is None or len(text) == 0:
        return []
    this_code = []
    for i, code_chunk in enumerate(text.split('```')):  # pieces of code are denoted by ``` so we split
        if i%2 == 1:  # every other chunk is a piece of code in this case
            this_code.append(code_chunk[7:])  # Note: code declarations also have 'python\n' denoting the language, since we dont need this, we omit the first 7 chars
    return this_code


## Code Extraction Functions

In [None]:
bad_message = "NO CODE"
ask_code = "If there is code on this page the please write it. Otherwise say \""+bad_message+"\""
ask_caption = "This image also contains some Verilog code in it. Give a short caption associated with this code."

indicators = ["```"]
def has_code(message):
    message = message.strip()
    if bad_message in message:
        return False  # instant signal
    for ind in indicators:
        if ind in message:
            return True
    return False

def extract_code(image_path, atmps=3, verbose=False):
    """ takes in the link of a local image and gets information about code on it """
    base64_image = encode_image(image_path)
    image_link = f"data:image/jpeg;base64,{base64_image}"
    
    attempts = 0
    raw_code = ""  # get gpt to extract code
    while attempts < atmps:
        try:
            raw_code = gpt_image_oneshot(image_link, ask_code)
            attempts = atmps
        except Exception as e:
            attempts += 1
            if verbose: print("Encountered error:", e)
    # if there was some output we save it
    if raw_code == "":
        print("*couldnt get code after", atmps, "attempts")
    else:
        processed_code = ""
        caption = None
        is_code = has_code(raw_code)
        if is_code:
            processed_code = '\n'.join(get_code(raw_code))
            attempts = 0
            while attempts < atmps:
                try:
                    caption = gpt_image_oneshot(image_link, ask_caption)
                    attempts = atmps
                except Exception as e:
                    attempts += 1
                    if verbose: print("Encountered error:", e)
        has_finished = is_code and (caption != "")  # signifier that we have consistent output on both fronts
        return [image_path, has_finished, raw_code, processed_code, caption]

### Pipeline

In [88]:
# find file directories
data_dir = "Data/example-code/verilog examples/Textbook SVA/Book-Verilog/paginated/"
ims = [data_dir+file for file in os.listdir(data_dir) if 'jpg' in file]
print("Found", len(ims), "files\n")

# process them
results = []
for im in tqdm(ims[3637:]):  # not last left off at 3637
    results.append(extract_code(im, 5))

Found 4822 files



100%|██████████| 1185/1185 [2:19:22<00:00,  7.06s/it] 


### Cleaning & Saving

In [89]:
df = pd.DataFrame(results, columns=["image_directory", "isComplete", "raw_code", "code", "caption"])
print("Found", len(df[df["isComplete"]==True]), "/", len(df), "complete rows")
df["hasAssert"] = df["code"].apply(lambda x: True if 'assert' in str(x).lower() else False)
print("Found", len(df[df["hasAssert"]==True]), "/", len(df), "asserted rows")
df

Found 648 / 1185 complete rows
Found 193 / 1185 asserted rows


Unnamed: 0,image_directory,isComplete,raw_code,code,caption,hasAssert
0,Data/example-code/verilog examples/Textbook SV...,True,The image contains the following Verilog/Syste...,\nclass driver;\n virtual arb_ifc arb; ...,"""Example of a SystemVerilog class with a virtu...",False
1,Data/example-code/verilog examples/Textbook SV...,False,NO CODE,,,False
2,Data/example-code/verilog examples/Textbook SV...,False,NO CODE,,,False
3,Data/example-code/verilog examples/Textbook SV...,True,The image contains Verilog code snippets. Here...,\nlogic signed [11:0] a;\na = 12'shFF; // GOTC...,"""Common pitfalls in Verilog related to signed ...",False
4,Data/example-code/verilog examples/Textbook SV...,False,NO CODE,,,False
...,...,...,...,...,...,...
1180,Data/example-code/verilog examples/Textbook SV...,True,There is a snippet of Verilog code in the imag...,\nalways @(posedge clock)\n $read_test_vect...,Verilog PLI: Utilizing the `misctf` routine fo...,False
1181,Data/example-code/verilog examples/Textbook SV...,True,The image contains two sections of Verilog cod...,\nmodule m_illegal_disable_nesting(logic reset...,"""Example of Verilog code demonstrating illegal...",True
1182,Data/example-code/verilog examples/Textbook SV...,True,```verilog\nalways @(s1global_clock)\n o <=...,\nalways @(s1global_clock)\n o <= a;\nendmo...,"Caption: ""Excerpt from a book discussing Veril...",False
1183,Data/example-code/verilog examples/Textbook SV...,False,NO CODE,,,False


In [90]:
# df["raw_code"] = df["raw_code"].apply(lambda x: np.nan if x.strip()==bad_message else x)
df, failed_count, messages, inds = get_failed_request(df)
# print("\n")
# df, failed_count2, messages2, inds2 = get_failed_request(df, bad_message)

df.isna().sum()
# print(df.iloc[0]["code"])

found 0 rows with bad requests and a total of 0 cells of bad calls
0 of which where unique and 0 of which were identified as copyright related


image_directory      0
isComplete           0
raw_code             0
code                 0
caption            537
hasAssert            0
dtype: int64

In [91]:
save_dir = "Data/example-code/verilog examples/fulltextbook-gptresponse2.csv"
df.to_csv(save_dir, index=False)
print("SAVED TO", save_dir)

SAVED TO Data/example-code/verilog examples/fulltextbook-gptresponse2.csv


In [106]:
df = pd.read_csv("Data/example-code/verilog examples/fulltextbook-gptresponse.csv")
df

Unnamed: 0,image_directory,isComplete,raw_code,code,caption,hasAssert
0,Data/example-code/verilog examples/Textbook SV...,True,"The image contains Verilog code, which I'll wr...","\nimport ""DPI-C"" function chandle counter7_new...",Verilog code example for testing a 7-bit count...,False
1,Data/example-code/verilog examples/Textbook SV...,True,"The image contains a snippet of Verilog code, ...",\nproperty abc;\n @(posedge clk) a |=> b[=2:5...,"""Explaining Verilog Non-consecutive Repetition...",False
2,Data/example-code/verilog examples/Textbook SV...,True,The image contains code written in the Verilog...,\nfunction void Environment::build();\n // ...,"Caption: ""Example of Verilog code for building...",False
3,Data/example-code/verilog examples/Textbook SV...,False,NO CODE,,,False
4,Data/example-code/verilog examples/Textbook SV...,False,NO CODE,,,False
...,...,...,...,...,...,...
4817,Data/example-code/verilog examples/Textbook SV...,True,There is a snippet of Verilog code in the imag...,\nalways @(posedge clock)\n $read_test_vect...,Verilog PLI: Utilizing the `misctf` routine fo...,False
4818,Data/example-code/verilog examples/Textbook SV...,True,The image contains two sections of Verilog cod...,\nmodule m_illegal_disable_nesting(logic reset...,"""Example of Verilog code demonstrating illegal...",True
4819,Data/example-code/verilog examples/Textbook SV...,True,```verilog\nalways @(s1global_clock)\n o <=...,\nalways @(s1global_clock)\n o <= a;\nendmo...,"Caption: ""Excerpt from a book discussing Veril...",False
4820,Data/example-code/verilog examples/Textbook SV...,False,NO CODE,,,False


In [112]:
tester = df.sample().iloc[0]
print("Caption:\n", tester["caption"], "\nCode:\n", tester["code"])

Caption:
 Caption: "Examples of formal and actual argument specification in Verilog, demonstrating default value assignment and position-based connections." 
Code:
 
logic req, gnt;

property pr1 (enb=1'b1, logic pa, logic pb);
    @(posedge clk) enb |-> pa ##2 pb;
endproperty

reqGnt: assert property (pr1(cStart,req,gnt));


reqGnt: assert property ( pr1 ( .pa(req), .pb(gnt) ));


reqGnt: assert property ( pr1 ( , .req,gnt));


property pr1 (int dSize, csig, enb=1'b1, logic pa, logic pb);

logic [dSize:0] Ldata;

@(csig, Ldata=data) enb |-> pa ##2 pb;

endproperty

reqGnt: assert property (pr1('d31,posedge clk,cStart, req, gnt));



In [108]:
save_dir = "Data/example-code/verilog examples/fulltextbook-gptresponse.csv"
df.to_csv(save_dir, index=False)
print("SAVED TO", save_dir)

SAVED TO Data/example-code/verilog examples/fulltextbook-gptresponse.csv


## Supervised Query Function

In [None]:
# oneshot code
image_to_code = "Please respond with only the code present in this image."
image_to_question = "Please respond with a question for which the answer is the code snippet on this page."
image_to_figure = "Respond with the figure associated with the code present in this image."
code_to_description = "You are a helpful assistant that describes pieces of code."

def info_from_image(image_path, atmps=3, verbose=False, ext='jpeg'):
    """ takes in the link of a local image and gets information about code on it """
    if ext == "jpeg":
        base64_image = encode_image(image_path)
        image_link = f"data:image/{ext};base64,{base64_image}"
    elif ext == "pdf":  # https://stackoverflow.com/questions/3715493/encoding-an-image-file-with-base64
        with open(image_path, "rb") as image_file:
            base64_image = base64.b64encode(image_file.read())
            image_link = f"data:image/{ext};base64,{base64_image}"
    errors = []
    
    attempts = 0
    raw_code = None  # get gpt to extract code
    while attempts < atmps:
        try:
            raw_code = gpt_image_oneshot(image_link, image_to_code)
            attempts = atmps
        except Exception as e:
            attempts += 1
            errors.append(str(e))
    if raw_code == None:
        print("couldnt get code after", atmps, "attempts")
        print(errors[-1])
    processed_code = "" if raw_code == None else '\n'.join(get_code(raw_code))
    
    attempts = 0
    figure = None  # get gpt to ask a question relating to the code
    while attempts < atmps:
        try:
            figure = gpt_image_oneshot(image_link, image_to_figure)
            attempts = atmps
        except Exception as e:
            attempts += 1
            errors.append(str(e))
    if figure == None:
        print("couldnt get figure after", atmps, "attempts")
        print(errors[-1])
    
    attempts = 0
    question = None  # get gpt to ask a question relating to the code
    while attempts < atmps:
        try:
            question = gpt_image_oneshot(image_link, image_to_question)
            attempts = atmps
        except Exception as e:
            attempts += 1
            errors.append(str(e))
    if question == None:
        print("couldnt get question after", atmps, "attempts")
        print(errors[-1])
    
    attempts = 0
    description = None  # get gpt to describe the code
    if raw_code != None:
        desc_prompt = "Write a short description for what this piece of Verilog code does:\n" + processed_code
        while attempts < atmps:
            try:
                description = gpt_oneshot(desc_prompt, code_to_description)
                attempts = atmps
            except Exception as e:
                attempts += 1
                errors.append(str(e))
        if description == None:
            print("couldnt get description after", atmps, "attempts")
            print(errors[-1])
    
    return [image_path, raw_code, processed_code, figure, question, description]

### Small Test Case

tester_im = cwd+"/testpage.jpg"
results = info_from_image(tester_im, 10, True)
display.Image(tester_im)

In [None]:
print(results[4])
print()
print(results[5])
print()
print(results[3])
print()
print(results[2])

## Running Analysis of all Pages

In [None]:
# MANUAL SPLITTER: https://pdfcandy.com/blog/how-to-split-pdf-on-mac.html
# MANUAL CONVERTER: https://png2jpg.com/

# find files
book = "LogicDesignAndVerification-Thomas"
# images_dir = "/Data/example-code/verilog examples/Textbook SVA/" + book
images_dir = cwd+"/Data/example-code/verilog examples/pages"
all_images = [images_dir+"/"+file for file in os.listdir(images_dir) if ".jpg" in file]
print("loaded", len(all_images), "jpg images")

# TODO: finalize auto-split & convert
# from pdf2image import convert_from_path
# pages_dir = images_dir+"/pages"
# all_pages = [cwd+pages_dir+"/"+file for file in os.listdir(cwd+pages_dir) if ".pdf" in file]
# page_count = 0
# def pdf_to_jpegs(imdir, verbose=True):
#     """ convert from multi-page pdf to single-page jpeg """
    
#     if verbose: print("...converting", imdir)
#     # https://stackoverflow.com/questions/46184239/extract-a-page-from-a-pdf-as-a-jpeg
#     pages = convert_from_path(imdir)  # TODO: fix
#     for page in pages:
#         newpage_dir = pages_dir + "/" + book + "-p" + str(page_count) + ".jpg"
#         page.save(newpage_dir, 'JPEG')
#         all_pages.append(newpage_dir)
#         page_count += 1
        
# for im in all_images:

In [None]:
all_results = []
for im in tqdm(all_images):
    results = info_from_image(im, 10, True)
    all_results.append(results)

### Cleaning Results

In [None]:
df = pd.DataFrame(all_results, columns=["Path", "raw_code", "Code", "Figure", "Question", "Description"])

# recover author
def get_origin(im):
    if "creenshot" in im:
        return "Mehta"
    else:
        return "Thomas" 
df["Book"] = df["Path"].apply(lambda x: get_origin(x))  

# cleaning
df, failed_count, messages, inds = get_failed_request(df)
df.isna().sum()

In [None]:
df

### Saving Results

In [None]:
save_dir = "Data/example-code/verilog examples/pages-gptresponse.csv"
df.to_csv(save_dir)
print("SAVED TO", save_dir)

# df = pd.read_csv(save_dir)

In [None]:
# Future Work Notes

# TODO: use this as a baseline for future examples
# TODO: assertion-specific dataset and 
# TODO: later, then a more general datas
# raw code datasets --> snippers can be wrong and have bugs - llms are fine-tuned on prompts and responses
# creation of the first "high-quality" prompt-response dataset with sva --> 

# RAW PYTHON, RAW SVA, SUPERVISED SVA (from textbooks)
# Note; should be qualitative analysis with examples --> primarily this is the dataset, the crux is that this is one benefit of using the dataset
# conclude with some basic evaluation of the usefulness of the dataset

## Short Analysis

In [None]:
df = pd.read_csv("Data/example-code/verilog examples/pages-gptresponse.csv")
df.head()

In [None]:
ind = 3
print(df.iloc[ind]["Code"])

In [None]:
display.Image(df.iloc[ind]["Path"])