In [None]:
import sys

import pandas as pd
import yaml
from dotenv import load_dotenv

from PhotonicsAI.Photon import llm_api, utils

# Load environment variables from the project-level .env file before calling the LLM helper.
load_dotenv(dotenv_path="../../.env")
df = pd.read_parquet("db/AMF_papers.parquet")

# Start from an empty result column; only papers processed below get a value.
df["PaperEntities1"] = None

for idx, paper in df.iterrows():
    # Only papers with fewer than 20 pages are sent for entity extraction.
    if not (paper["N_pages"] < 20):
        continue

    print(idx, "===============")

    # Run the extraction on the paper's full text and store the result in place.
    entities = llm_api.papers_entity_extraction(df.loc[idx, "text_full"])
    df.at[idx, "PaperEntities1"] = entities

In [None]:
# Print a summary of the frame as a quick sanity check before saving.
df.info()

# Write the frame (including the new column) to the papers parquet file.
df.to_parquet(path="db/AMF_papers.parquet")

# compare 4o to o1-preview and o1-mini

In [None]:
load_dotenv(dotenv_path="../../.env")
df = pd.read_parquet("db/AMF_papers.parquet")

# Single paper used for the side-by-side model comparison.
idx = 223
paper = df.loc[idx]
article = paper["text_full"]
print("--> filename: ", paper["filename"])

# Entity-extraction prompt; the article text is appended at the end.
prompt = f"""Is this a single academic article, and not a dissertation or collection of papers (single_article)?
Is the main topic of this article about integrated photonic circuits (topic_photonic)?
If yes, find the photonic components that are used on the chip.
Return a concise list of these photonic components, if any (components_list).
For each component, try to extract: brief spec,
and the number of optical input (N) and output (M) ports denoted by NxM, e.g. 1x2.
Do not parse specifications and descriptive modifiers of a component as separate components.
Finally, is there an enough information to understand and desrcibe how the on-chip components
are interconnected to form the photonic circuit (circuit_complete)?
Answer in YAML following the template:
single_article: True/False
topic_photonic: True/False
components_list:
  - a 1x1 modulator with MHz speed
  - a 1x2 component ...
  ...
circuit_complete: True/False

INPUT_ARTICLE:
{article}
"""


# (banner, API helper, model name) for each model under comparison, in print order.
model_runs = [
    ("------------------- 4o: ", llm_api.call_openai, "gpt-4o"),
    ("------------------- o1-preview: ", llm_api.call_openai_reasoning, "o1-preview"),
    ("------------------- o1-mini: ", llm_api.call_openai_reasoning, "o1-mini"),
]

# Send the same prompt to every model and print each raw response under its banner.
for banner, call_model, model_name in model_runs:
    print(banner)
    r = call_model(prompt, model=model_name)
    print(r)

# Search PDK for components

In [None]:
# NOTE(review): these are machine-specific absolute paths; they only help on the
# machines where these directories exist.
for project_root in (
    "C:/Users/vansari/Documents/PhotonicAI",
    "/Users/vahid/Downloads/PhotonicsAI_Project",
):
    sys.path.append(project_root)

import pandas as pd
from dotenv import load_dotenv

load_dotenv(dotenv_path="../../.env")
from PhotonicsAI.Photon import llm_api

# Parse the YAML template catalogue into a dict, and keep a block-style YAML dump of it.
with open("../../Photon/templates.yaml") as templates_file:
    templates_dict = yaml.safe_load(templates_file)
templates_str = yaml.dump(templates_dict, default_flow_style=False)

# Extend the catalogue with the design-library docstrings, keyed by module name
# (an entry with the same key as an existing template replaces it).
db_docs = utils.search_directory_for_docstrings("../../KnowledgeBase/DesignLibrary")
templates_dict.update({doc["module_name"]: doc["docstring"] for doc in db_docs})

####################################

# Reload the paper table and add an (initially empty) column for the search results.
df = pd.read_parquet("db/AMF_papers.parquet")

df["PaperEntities1_retrieved_components"] = None

# The candidate descriptions are identical for every search, so build the list once.
template_descriptions = list(templates_dict.values())

for idx, row in df.iterrows():
    entities = row["PaperEntities1"]

    # Skip papers that have no extracted entities.
    if not pd.notna(entities):
        continue

    # Only search for papers flagged as a single article, on-topic, and with a
    # completely described circuit.
    if not (
        entities["single_article"]
        and entities["topic_photonic"]
        and entities["circuit_complete"]
    ):
        continue

    # One search per extracted component; each result is stored as a plain dict.
    retrieved_components = [
        llm_api.llm_search(component, template_descriptions).dict()
        for component in entities["components_list"]
    ]
    df.at[idx, "PaperEntities1_retrieved_components"] = retrieved_components

    print(idx, "===============")
    print(df.loc[idx]["PaperEntities1_retrieved_components"])

    # Checkpoint only after a paper actually gained results, so an interrupted run
    # keeps its progress without rewriting the file for unchanged rows.
    df.to_parquet("db/AMF_papers.parquet")

# Final write so the new column is persisted even when no paper qualified.
df.to_parquet("db/AMF_papers.parquet")

# counting how many papers have exact matches

In [None]:
import sys

import pandas as pd
import yaml
from dotenv import load_dotenv

# Load the .env file before importing the project modules, as the original cell did.
load_dotenv(dotenv_path="../../.env")

from PhotonicsAI.Photon import llm_api, utils

# Parse the YAML template catalogue into a dict, and keep a block-style YAML dump of it.
with open("../../Photon/templates.yaml") as templates_file:
    templates_dict = yaml.safe_load(templates_file)
templates_str = yaml.dump(templates_dict, default_flow_style=False)

# Extend the catalogue with the design-library docstrings, keyed by module name.
db_docs = utils.search_directory_for_docstrings("../../KnowledgeBase/DesignLibrary")
for doc in db_docs:
    templates_dict[doc["module_name"]] = doc["docstring"]

# Catalogue keys in insertion order, for looking up an entry by its position.
templates_keys = list(templates_dict)


df = pd.read_parquet("db/AMF_papers.parquet")

# Cumulative tallies: a paper counts as "exact" when every component's first score
# is exact, as "partial" when every first score is exact or partial, and as "poor"
# when every first score is exact, partial or poor.
# NOTE: all() on an empty list is True, so a paper whose retrieved list is empty
# is counted in all three tallies.
exact_match_counts = 0
partial_match_counts = 0
poor_match_counts = 0
for idx, row in df.iterrows():
    retrieved = row["PaperEntities1_retrieved_components"]
    if retrieved is None:
        continue

    # First-listed score of every retrieved component.
    scores = [comp["match_scores"][0] for comp in retrieved]

    if all(score == "exact" for score in scores):
        exact_match_counts += 1
    if all(score in ("exact", "partial") for score in scores):
        partial_match_counts += 1
    if all(score in ("exact", "partial", "poor") for score in scores):
        poor_match_counts += 1

    print()
    print("============================= idx:", idx)

    # Show which catalogue entry each exactly-matched search phrase resolved to.
    # Retrieved results and search phrases are paired by position.
    search_phrases = row["PaperEntities1"]["components_list"]
    for retrieved_items, search_phrase in zip(retrieved, search_phrases):
        score_0 = retrieved_items["match_scores"][0]
        if score_0 == "exact":
            # match_list holds positions into the catalogue; take the first one.
            comp_0 = templates_keys[retrieved_items["match_list"][0]]
            print(f"{comp_0} ({score_0}) <-- {search_phrase}")

print("exact_match_counts", exact_match_counts)
print("partial_match_counts", partial_match_counts)
print("poor_match_counts", poor_match_counts)

In [None]:
# Fixed query used for every search below.
c = "Waveguide: Incorporates loss due to propagation, phase shift, and time delay."

# Run the identical search 20 times and print each result
# (presumably to check run-to-run consistency -- confirm intent).
for attempt in range(20):
    r = llm_api.llm_search(c, list(templates_dict.values()))

    print("scores: ", r.match_scores)
    # Translate the returned positions into catalogue key names.
    matched_names = [templates_keys[position] for position in r.match_list]
    print("components: ", matched_names)
    print("-------------------")