# comparing pre- vs post- textgrad optimization

We prompted gpt-4o with papers shorter than 20 pages and asked it to return this YAML:
```yaml
single_article: True/False
topic_photonic: True/False
components_list:
- a 1x1 modulator with MHz speed
- a 1x2 component ...
...
circuit_complete: True/False
```



Summary table:
```
all papers          374
processed papers    321

                                            pre         post
valid yaml                                  312         308

single_article & topic_photonic             303         302
circuit_complete                            262         217

total photonic components                   1206        1052
valid photonic components                   79%         82%

```

---> Human


In [None]:
import sys

sys.path.append("C:/Users/vansari/Documents/PhotonicAI")
import pandas as pd
from dotenv import load_dotenv

load_dotenv(dotenv_path="../../.env")
import yaml

from PhotonicsAI.Photon import llm_api

df = pd.read_parquet("db/AMF_papers.parquet")


def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence.

    Removes a trailing ``` and a leading ``` together with an optional
    one-word language tag (e.g. ``yaml``).  Unlike ``str.strip("```yaml")``,
    which strips a *set of characters*, this never removes payload characters
    that merely happen to be one of `` ` ``, ``y``, ``a``, ``m`` or ``l``.
    """
    fence = "```"
    text = text.strip()
    if text.endswith(fence):
        text = text[: -len(fence)]
    if text.startswith(fence):
        header, _, body = text[len(fence):].partition("\n")
        tag = header.strip()
        # A bare word (or nothing) after the fence is a language tag;
        # anything else is payload that must be kept.
        text = body if (not tag or tag.isalpha()) else f"{header}\n{body}"
    return text


def _parse_nodes_column(frame, column):
    """Parse every non-null cell of *column* as YAML.

    Args:
        frame: DataFrame holding the raw LLM answers.
        column: Name of the column to parse.

    Returns:
        A ``(parsed, processed, failed)`` tuple: ``parsed`` maps the row index
        to the loaded YAML object, ``processed`` is the number of non-null
        cells, and ``failed`` is how many of those could not be parsed.
    """
    parsed = {}
    processed = 0
    failed = 0
    for idx, raw in frame[column].items():
        if pd.isna(raw):
            continue
        processed += 1
        try:
            parsed[idx] = yaml.safe_load(_strip_code_fence(raw))
        except (AttributeError, yaml.YAMLError):
            # AttributeError: the cell is not a string; YAMLError: malformed YAML.
            failed += 1
    return parsed, processed, failed


# m: all rows, n: rows with a pre-optimization answer,
# i / j: pre / post answers that failed to parse.
m = len(df)
pre_nodes, n, i = _parse_nodes_column(df, "TextGrad_Nodes_preOptimize")
post_nodes, _post_processed, j = _parse_nodes_column(df, "TextGrad_Nodes_postOptimize")

print("all amf papers", m)
print("amf papers processed by textgrad (shorter than 20 pages)", n)

print("PRE: failed to parse component lists", i)
print("POST: failed to parse component lists", j)

In [None]:
print("counting the booleans:")

# Tally the parsed answers whose ``circuit_complete`` flag is truthy, first for
# the pre-optimization answers and then for the post-optimization ones.
# Alternative filters (swap into the condition to count a different subset):
#   k['single_article'] & k['topic_photonic']
#   k['single_article'] & k['topic_photonic'] & k['circuit_complete']
c = sum(1 for answer in pre_nodes.values() if answer["circuit_complete"])
print("pre", c)

c = sum(1 for answer in post_nodes.values() if answer["circuit_complete"])
print("post", c)

In [None]:
from pydantic import BaseModel
from rich.progress import track

print("counting valid photonic compoenents")


class ComponentsResponse(BaseModel):
    """Structured LLM verdict on one paper's component list."""

    # True when the model accepts the whole list as photonic components.
    valid_photonic_components: bool


sys_prompt1 = """Is this yaml list of one or many strings.
Each item in the list should describe an integrated photonic component.
It should NOT be a electronic device or component.
If correct, answer with valid_photonic_components.
"""
# Disabled extra prompt rule:
# It should NOT be a photonic device or component off the chip.

c = 0  # components that belong to lists the LLM judged valid
cc = 0  # all components of the papers that pass the three flags
for k, v in track(pre_nodes.items()):
    # Logical ``and`` (not bitwise ``&``): short-circuits and treats a null
    # flag as false instead of raising TypeError.
    if not (v["single_article"] and v["topic_photonic"] and v["circuit_complete"]):
        continue

    components = v["components_list"]
    # Named explicitly so the parse-failure counter ``j`` is not overwritten.
    components_yaml = yaml.dump(components)
    r = llm_api.callgpt_pydantic(components_yaml, sys_prompt1, ComponentsResponse)
    # ``v`` is the same dict object as ``pre_nodes[k]``, so this annotates it in place.
    v["valid_photonic_components"] = r.valid_photonic_components
    if r.valid_photonic_components:
        c += len(components)
    else:
        # Show rejected lists for manual inspection.
        print(components_yaml)
        print("---------------------")
    cc += len(components)

print("PRE: total photonic components:", cc)
print("PRE: valid photonic components:", c)

In [None]:
# Same validation pass as for the pre-optimization answers, applied to the
# post-optimization ones.
cc = 0  # all components of the papers that pass the three flags
c = 0  # components that belong to lists the LLM judged valid

for k, v in track(post_nodes.items()):
    # Logical ``and`` (not bitwise ``&``): short-circuits and treats a null
    # flag as false instead of raising TypeError.
    if not (v["single_article"] and v["topic_photonic"] and v["circuit_complete"]):
        continue

    components = v["components_list"]
    # Named explicitly so the parse-failure counter ``j`` is not overwritten.
    components_yaml = yaml.dump(components)
    r = llm_api.callgpt_pydantic(components_yaml, sys_prompt1, ComponentsResponse)
    # ``v`` is the same dict object as ``post_nodes[k]``, so this annotates it in place.
    v["valid_photonic_components"] = r.valid_photonic_components
    if r.valid_photonic_components:
        c += len(components)
    else:
        # Show rejected lists for manual inspection.
        print(components_yaml)
        print("---------------------")
    cc += len(components)

print("POST: total photonic components:", cc)
print("POST: valid photonic components:", c)

In [None]:
from langchain_openai import OpenAIEmbeddings

# Flatten the component descriptions of every paper that passes all three
# flags into one list of strings per run.  The flags are combined with logical
# ``and`` (not bitwise ``&``) so a null flag counts as false instead of
# raising TypeError.
all_pre_nodes = [
    str(component)
    for paper in pre_nodes.values()
    if paper["single_article"] and paper["topic_photonic"] and paper["circuit_complete"]
    for component in paper["components_list"]
]

all_post_nodes = [
    str(component)
    for paper in post_nodes.values()
    if paper["single_article"] and paper["topic_photonic"] and paper["circuit_complete"]
    for component in paper["components_list"]
]


embeddings_model = OpenAIEmbeddings()

# One embedding per component string, in the same order as the input lists.
all_pre_nodes_embeddings = embeddings_model.embed_documents(all_pre_nodes)
all_post_nodes_embeddings = embeddings_model.embed_documents(all_post_nodes)

print(len(all_post_nodes_embeddings))
print(all_post_nodes_embeddings[0])

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Cluster the post-optimization component embeddings.  No fixed cluster count
# is requested (n_clusters=None); a distance threshold of 1.5 is used instead.
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
clustering_model.fit(all_post_nodes_embeddings)
cluster_assignment = clustering_model.labels_

# Group the component strings by the cluster label they were assigned.
post_clustered = {}
for sentence, label in zip(all_post_nodes, cluster_assignment):
    post_clustered.setdefault(label, []).append(sentence)

# Dump every cluster, then the total number of clusters.
for label, members in post_clustered.items():
    print("==========================", label)
    print("\n".join(members))
    print("==========================")

print("===============")
print(len(post_clustered))

In [None]:
# Cluster the pre-optimization component embeddings.  No fixed cluster count
# is requested (n_clusters=None); a distance threshold of 1.5 is used instead.
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
clustering_model.fit(all_pre_nodes_embeddings)
cluster_assignment = clustering_model.labels_

# Group the component strings by the cluster label they were assigned.
pre_clustered = {}
for sentence, label in zip(all_pre_nodes, cluster_assignment):
    pre_clustered.setdefault(label, []).append(sentence)

# Dump every cluster, then the total number of clusters.
for label, members in pre_clustered.items():
    print("==========================", label)
    print("\n".join(members))
    print("==========================")

print("===============")
print(len(pre_clustered))