In [None]:
import sys

sys.path.append("C:/Users/vansari/Documents/PhotonicAI")
import pandas as pd
import textgrad as tg
from dotenv import load_dotenv

load_dotenv(dotenv_path="../../.env")
import yaml

# One engine is used both for the critic ("loss") call below and as TextGrad's
# backward engine.
llm_engine = tg.get_engine("gpt-4o")
tg.set_backward_engine(llm_engine)

# Only papers with fewer pages than this are sent to the model.
MAX_PAGES = 20


def build_problem_text(article):
    """Return the extraction prompt with ``article`` appended as the input text.

    The continuation lines of the literal deliberately keep their original
    leading whitespace so that the prompt sent to the model is unchanged.
    """
    return f"""Is this a single academic article, and not a dissertation or collection of papers (single_article)?
        Is the main topic of this article about integrated photonic circuits (topic_photonic)?
        If yes, find the photonic components that are used on the chip.
        Return a concise list of these photonic components, if any (components_list).
        For each component, try to extract: brief spec,
        and the number of optical input (N) and output (M) ports denoted by NxM, e.g. 1x2.
        Finally, is there an enough information to understand and desrcibe how the on-chip components
        are interconnected to form the photonic circuit (circuit_complete)?
        Answer in YAML following the template:
        single_article: True/False
        topic_photonic: True/False
        components_list:
        - a 1x1 modulator with MHz speed
        - a 1x2 component ...
        ...
        circuit_complete: True/False


        INPUT ARTICLE:
        {article}
        """


# System prompt of the critic that evaluates the generated YAML. It does not
# depend on the paper, so it is built once instead of once per iteration.
# (Continuation-line indentation is part of the original prompt text.)
photonic_critic_prompt = """You are a smart language model expert in photonic integrated circuits.
        This YAML file should be an accurate summary of the photonic components presented in the input article.
        Evaluate components_list in YAML based on:
        - it should only represent photonic components on the chip, and not off the chip.
        - does it accurately represent the photonic components?
        - is any photonic component missing?
        - this should be only a list of photonic components.
        - does YAML follow the provided template?
        Also evaluate circuit_complete boolean:
        - is it correct? can you understand and describe the connection between items in components_list?
        You do not propose a new YAML file, only evaluate the existing YAML file critically and give very concise feedback."""
#
# TODO: check if output is a valid yaml
#
loss_system_prompt = tg.Variable(
    photonic_critic_prompt,
    requires_grad=False,
    role_description="system prompt to the loss function",
)

format_string = """Problem: {problem}\nCurrent YAML code: {code}"""
fields = {"problem": None, "code": None}
formatted_llm_call = tg.autograd.FormattedLLMCall(
    engine=llm_engine,
    format_string=format_string,
    fields=fields,
    system_prompt=loss_system_prompt,
)


def loss_fn(problem: tg.Variable, code: tg.Variable) -> tg.Variable:
    """Ask the critic to evaluate ``code`` (the YAML answer) against ``problem``.

    Returns the critic's response as a ``tg.Variable`` on which ``backward()``
    can be called.
    """
    inputs = {"problem": problem, "code": code}

    return formatted_llm_call(
        inputs=inputs,
        response_role_description=f"evaluation of the {code.get_role_description()}",
    )


# The forward model is stateless across papers, so a single instance is reused.
model = tg.BlackboxLLM("gpt-4o")

df = pd.read_parquet("db/AMF_papers.parquet")
df["TextGrad_Nodes_preOptimize"] = None
df["TextGrad_Nodes_postOptimize"] = None

for idx, row in df.iterrows():
    # Written as `not (x < MAX_PAGES)` rather than `x >= MAX_PAGES` so that a
    # missing (NaN) page count is skipped, exactly like the original condition.
    if not (row["N_pages"] < MAX_PAGES):
        continue
    print(idx, "=======================================")

    # Use the row already yielded by iterrows(); a second df.loc[idx] lookup is
    # redundant and would return several rows if index labels were duplicated.
    article = row["text_full"]

    problem_text = build_problem_text(article)
    problem = tg.Variable(
        problem_text, role_description="the parsing problem", requires_grad=False
    )

    # First-pass answer from the model, stored before any optimisation.
    code = model(problem)
    df.at[idx, "TextGrad_Nodes_preOptimize"] = code.value

    # Mark the answer as the variable to optimise.
    code.set_role_description("The yaml code to optimize")
    code.requires_grad = True

    # One critique -> backward -> TGD step per paper.
    loss = loss_fn(problem, code)
    loss.backward()
    optimizer = tg.TGD(parameters=[code])
    optimizer.step()

    print(code.value)
    df.at[idx, "TextGrad_Nodes_postOptimize"] = code.value

    # NOTE(review): results are only kept in memory; persisting is disabled.
    # df.to_parquet('db/AMF_papers.parquet')

# Analyzing output of TextGrad

In [None]:
import sys

sys.path.append("C:/Users/vansari/Documents/PhotonicAI")
sys.path.append("/Users/vahid/Downloads/PhotonicsAI_Project")
import pandas as pd
import textgrad as tg

# Imported here as well so this cell does not depend on an earlier cell having
# been run (previously a missing `yaml` name was silently swallowed below).
import yaml
from dotenv import load_dotenv

load_dotenv(dotenv_path="../../.env")
from PhotonicsAI.Photon import llm_api, utils


def strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence.

    Handles an opening line of ``` or ```<language> and a closing ```.
    Text without any fence is returned unchanged. This replaces
    ``str.strip("```yaml")``, which treats its argument as a *set of
    characters* and can therefore also eat leading/trailing `y`, `a`, `m`, `l`
    characters of the payload.
    """
    candidate = text.strip()
    has_open = candidate.startswith("```")
    has_close = candidate.endswith("```")
    if not (has_open or has_close):
        return text
    if has_open:
        # Drop the whole opening fence line, including any language tag.
        _, _, candidate = candidate.partition("\n")
    if candidate.endswith("```"):
        candidate = candidate[: -len("```")]
    return candidate


df = pd.read_parquet("db/AMF_papers.parquet")
df.info()

# Number of rows whose optimised output could not be used as a YAML mapping.
n_unparsable = 0

# Maps DataFrame index -> parsed YAML mapping of the optimised model output.
paper_nodes = {}
for idx, row in df.iterrows():
    raw_nodes = row["TextGrad_Nodes_postOptimize"]
    if not pd.notna(raw_nodes):
        continue
    try:
        parsed = yaml.safe_load(strip_code_fence(raw_nodes))
    except yaml.YAMLError:
        # Only genuine YAML syntax problems are counted; anything else
        # (e.g. a programming error) should surface instead of being hidden.
        n_unparsable += 1
        continue
    if isinstance(parsed, dict):
        paper_nodes[idx] = parsed
    else:
        # Valid YAML but not a mapping (plain text, list, empty document):
        # later cells index the result by key, so treat it as unusable.
        n_unparsable += 1

print("---------------------\n")
print("NOT yaml parsable: ", n_unparsable)

In [None]:
# Load the template definitions from the YAML file.
with open("../../Photon/templates.yaml") as templates_file:
    templates_dict = yaml.safe_load(templates_file)

# Block-style YAML rendering of the templates as loaded from the file
# (taken before the design-library entries are merged in below).
templates_str = yaml.dump(templates_dict, default_flow_style=False)

# adding components to templates_dict:
# one entry per item returned by the docstring search, keyed by module name.
db_docs = utils.search_directory_for_docstrings("../../KnowledgeBase/DesignLibrary")
templates_dict.update({doc["module_name"]: doc["docstring"] for doc in db_docs})

# searching PDK for components

In [None]:
# Parallel lists: templates_id_list[n] is the key of templates_list[n].
templates_id_list = list(templates_dict.keys())
templates_list = list(templates_dict.values())


# dict_keys(['single_article', 'topic_photonic', 'components_list', 'circuit_complete'])

# A paper is searched only if all three flags are the YAML boolean True.
# The values come from LLM-written YAML, so they may be missing, None or a
# string such as "True/False"; `is True` skips those papers instead of raising
# (the previous bitwise `&` raised TypeError/KeyError in those cases).
required_flags = ("single_article", "topic_photonic", "circuit_complete")

c = 0
for i, k in paper_nodes.items():
    if not (isinstance(k, dict) and all(k.get(flag) is True for flag in required_flags)):
        continue

    c += 1
    print("index: ", i)
    # One entry per component, in the same order as components_list.
    k["match_list"] = []
    k["match_scores"] = []
    k["match_comment"] = []
    # An empty `components_list:` key parses to None; treat it as no components.
    for comp in k.get("components_list") or []:
        comp = str(comp)
        r_llm = llm_api.llm_search(comp, templates_list)
        print(f"{comp} -----> {r_llm}")

        k["match_list"].append(r_llm.match_list)
        k["match_scores"].append(r_llm.match_scores)
        k["match_comment"].append(r_llm.match_comment)
    print("=======================")

print("papers with searched components: ", c)

In [None]:
# Earlier spot checks of individual template ids (kept for reference):
# print(templates_id_list[13])
# print(templates_id_list[6])
# print(templates_id_list[7])
# print(templates_id_list[28])

# Show every column of the paper stored under index label 80.
inspected_row = df.loc[80]
print(inspected_row)

In [None]:
# Print every component of the papers whose three flags are all the YAML
# boolean True, then the total number of components printed.
# The flags come from LLM-written YAML; `is True` skips papers where a flag is
# missing or not a boolean instead of raising, as the bitwise `&` did.
required_flags = ("single_article", "topic_photonic", "circuit_complete")

c = 0
for i, k in paper_nodes.items():
    if isinstance(k, dict) and all(k.get(flag) is True for flag in required_flags):
        # An empty `components_list:` key parses to None; treat it as empty.
        for comp in k.get("components_list") or []:
            c += 1
            print(comp)
print(c)

In [None]:
def matched_components(node):
    """Return the per-component match lists stored on ``node``, or None.

    NOTE(review): this cell originally read only "components_matched", a key
    that the search cell in this notebook does not write (it writes
    "match_list"). "match_list" is assumed to be the equivalent per-component
    list and is used as a fallback - confirm against the data in the pickle.
    """
    if "components_matched" in node:
        return node["components_matched"]
    return node.get("match_list")


# Flags come from LLM-written YAML; only the boolean True counts.
required_flags = ("single_article", "topic_photonic", "circuit_complete")

# Index labels of papers for which every component has at least one match.
idx_complete_comp = []

for idx, k in paper_nodes.items():
    if not (isinstance(k, dict) and all(k.get(flag) is True for flag in required_flags)):
        continue
    matches = matched_components(k)
    if matches is None:
        # No match data stored for this paper: it cannot be called complete.
        continue
    if all(len(ll) > 0 for ll in matches):
        idx_complete_comp.append(idx)

c = len(idx_complete_comp)
print(c)

for i in idx_complete_comp:
    components = paper_nodes[i].get("components_list") or []
    matches = matched_components(paper_nodes[i])
    if len(matches) != len(components):
        # Previously a bare `except: pass` hid this situation silently.
        print(
            f"paper {i}: {len(components)} components but {len(matches)} match entries; "
            "printing the pairs that exist"
        )
    for comp, comp_matches in zip(components, matches):
        print(comp)
        print(comp_matches)
    print("===========================")

In [None]:
import pickle

# Persist the current paper_nodes mapping so it can be restored without
# re-running the cells that build it.
with open("db/AMF_nodes_postTextGrad.pkl", "wb") as pickle_file:
    pickle_file.write(pickle.dumps(paper_nodes))

In [None]:
import pickle

# Restore paper_nodes from the pickle written by this notebook.
# Unpickling can execute arbitrary code, so only load files you created yourself.
with open("db/AMF_nodes_postTextGrad.pkl", "rb") as pickle_file:
    paper_nodes = pickle.loads(pickle_file.read())