# AMF papers

In [None]:
import os

import fitz  # PyMuPDF
import pandas as pd

# Build `df`: one row per PDF in `directory`, holding the file name, the page
# count, the text of the leading pages (`header_text`) and the whole text.
directory = "db/AMF/"
header_pages = 3  # number of leading pages whose text goes into `header_text`

# os.listdir returns entries in arbitrary order; sort so that the row index of
# a given paper is the same on every run and every machine.
filenames = [
    {"filename": filename}
    for filename in sorted(os.listdir(directory))
    if filename.endswith(".pdf")
]

records = []
for entry in filenames:
    filepath = os.path.join(directory, entry["filename"])

    # The context manager closes the document even when text extraction raises.
    with fitz.open(filepath) as doc:
        page_texts = [
            doc.load_page(page_num).get_text() for page_num in range(len(doc))
        ]

    records.append(
        {
            "filename": entry["filename"],
            # Text of the first `header_pages` pages only.
            "header_text": "".join(page_texts[:header_pages]),
            "N_pages": len(page_texts),
            "full_text": "".join(page_texts),
        }
    )

# Explicit columns keep the frame's schema stable even when no PDF was found.
df = pd.DataFrame(
    records, columns=["filename", "header_text", "N_pages", "full_text"]
)

# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print().
df.info()

In [None]:
import matplotlib.pyplot as plt

# Histogram of page counts with one bin per integer page count, drawn on a
# logarithmic x axis.
hist_fig, hist_ax = plt.subplots()
page_counts = df["N_pages"]
hist_ax.hist(page_counts, bins=range(1, page_counts.max() + 2))
hist_ax.set_xscale("log")
hist_ax.set_xlabel("Number of Pages")
hist_ax.set_ylabel("Frequency")

In [None]:
import os
import sys

# Location of the PhotonicsAI checkout that provides `llm_api`. Set the
# PHOTONICSAI_ROOT environment variable to point elsewhere; the fallback is the
# path this notebook was originally written against.
PHOTONICSAI_ROOT = os.environ.get(
    "PHOTONICSAI_ROOT", "/Users/vahid/Downloads/PhotonicsAI_Project"
)
# Guard against piling up duplicate entries when the cell is re-run.
if PHOTONICSAI_ROOT not in sys.path:
    sys.path.append(PHOTONICSAI_ROOT)

from tqdm.notebook import tqdm

from PhotonicsAI.Photon import llm_api

# Ask the model for each paper's title, feeding it only the header text.
# Alternative prompt kept for reference:
# sys_prompt = 'Identify the main topics in this document.'
sys_prompt = "Identify and return the title of this document without modifications."

# Start from an empty column and fill it row by row, so answers already
# received stay in `df` if a later call fails.
df["llm_title"] = ""
for index, header_text in tqdm(df["header_text"].items(), total=len(df)):
    df.at[index, "llm_title"] = llm_api.call_openai(
        header_text, sys_prompt=sys_prompt, model="gpt-4o-mini"
    )

In [None]:
# Classify every paper into one coarse document category from its header text.
# The backslash continuations sit inside the string literal, so the indentation
# of the continued lines is part of the prompt that is sent.
sys_prompt = (
    "Classify this document into one of the following categories: [research_article, \
                review_article, dissertation, collection_of_articles, whitepaper_or_news_article, none]. \
                Only respond with one of the categories and no explanation."
)

# Fill the column row by row so partial results survive a failed call.
df["llm_cat0"] = ""
for index, header_text in tqdm(df["header_text"].items(), total=len(df)):
    df.at[index, "llm_cat0"] = llm_api.call_openai(
        header_text, sys_prompt=sys_prompt, model="gpt-4o-mini"
    )

In [None]:
%matplotlib inline
# Horizontal bar chart of how many papers fall into each category, with the
# count written next to each bar.
cat0_counts = df["llm_cat0"].value_counts()
bar_fig, ax = plt.subplots(figsize=(4, 3))
cat0_counts.plot(kind="barh", ax=ax)
ax.set_xlabel("Frequency")

# Bars are drawn in the order of `cat0_counts`, so the enumeration position is
# the bar's y coordinate and the count is where the bar ends on the x axis.
for bar_position, count in enumerate(cat0_counts):
    ax.annotate(
        str(count),
        xy=(count, bar_position),
        xytext=(5, 0),  # nudge the text 5 points to the right of the bar end
        textcoords="offset points",
        ha="left",
        va="center",
    )

In [None]:
# For research articles only, ask the model for up to four comma-separated
# labels naming the devices/circuits/systems the paper develops.
# The prompt previously read "Rerurn a list ..."; the instruction verb is now
# spelled correctly. The backslash continuations sit inside the string literal,
# so the indentation of the continued lines is part of the prompt.
sys_prompt = (
    "This is the first couple pages of a scientific article, in the topic of integrated photonics.\
                Extract the main devices/circuits/systems developed or built in this article.\
                Return a list of these extracted labels, comma separated. You can only select a maximum of four labels.\
                Only respond with these labels and no other explanation."
)

# Rows that are not research articles keep the empty-string placeholder.
df["labels"] = ""
for index, row in tqdm(df.iterrows(), total=len(df)):
    if row["llm_cat0"] != "research_article":
        continue
    df.at[index, "labels"] = llm_api.call_openai(
        row["header_text"], sys_prompt=sys_prompt, model="gpt-4o-mini"
    )

# Optional checkpoint of the intermediate results:
# df.to_parquet('db/AMF_papers.parquet')

In [None]:
# Split each comma-separated answer into individual labels, one label per row.
df_exploded = df["labels"].str.split(",").explode().str.strip()

# Rows that were never labelled hold "" (and a stray comma also yields ""),
# which would otherwise be counted as the most frequent "label". Drop them.
df_exploded = df_exploded[df_exploded != ""]

label_counts = df_exploded.value_counts()
label_counts.head(15)

In [None]:
# For research articles only, ask whether the paper is about a photonic
# circuit; the prompt requests a brief circuit title, or NO otherwise.
# The backslash continuations sit inside the string literal, so the indentation
# of the continued lines is part of the prompt that is sent.
sys_prompt = (
    "This is beginning of a scientific article in the topic of integrated photonics.\
                Is it about photonic circuit? Circuits have multiple interconnected components working together in an integrated system.\
                Examples of a single component: a modulator or a detector. Circuit examples: a transceiver or a multiplexer.\
                If article is about a circuit, answer with the brief title of circuit. This should be a high level title without detail specifications.\
                If not, answer with NO.\
                Do not provide any other explanation."
)

# Rows that are not research articles keep the empty-string placeholder.
df["circuit"] = ""
for index, row in tqdm(df.iterrows(), total=len(df)):
    if row["llm_cat0"] != "research_article":
        continue
    df.at[index, "circuit"] = llm_api.call_openai(
        row["header_text"], sys_prompt=sys_prompt, model="gpt-4o-mini"
    )

# Optional checkpoint of the intermediate results:
# df.to_parquet('db/AMF_papers.parquet')

In [None]:
# Frequency of each answer in the `circuit` column.
# Each cell is a single string, so there is nothing to explode; the name
# `df_exploded` is kept only because the notebook already uses it.
circuit_answers = df["circuit"]

# "" marks rows that were never sent to the model (not research articles);
# leave them out so they do not dominate the counts. "NO" is a real answer and
# is kept.
df_exploded = circuit_answers[circuit_answers != ""]

label_counts = df_exploded.value_counts()
label_counts.head(20)
# df_exploded.unique()  # handy for eyeballing the distinct answers

In [None]:
# Reduce each detected circuit title to its main function
# (e.g. "4×100 Gb/s DWDM Optical Transceiver" -> "transceiver").
# The backslash continuations sit inside the string literal, so the indentation
# of the continued lines is part of the prompt that is sent.
sys_prompt = "This is a title of photonic circuit.\
                Remove any specifications/details and return the main function. \
                For example, [4×100 Gb/s DWDM Optical Transceiver] to [transceiver]; \
                [Polarization-Insensitive Multimode Antisymmetric Waveguide Bragg Grating Filter] to [Bragg Grating Filter].\
                Do not provide any explanation."

df["circuit_distilled"] = ""
for index, row in tqdm(df.iterrows(), total=len(df)):
    circuit_title = row["circuit"]
    # Skip rows that were never queried ("") or judged not to be a circuit ("NO").
    if circuit_title in ("", "NO"):
        continue
    # The prompt describes its input as a circuit *title*, so send the title
    # rather than the full article text.
    df.at[index, "circuit_distilled"] = llm_api.call_openai(
        circuit_title, sys_prompt=sys_prompt, model="gpt-4o-mini"
    )

In [None]:
# For every paper identified as describing a circuit, send the full article
# text and ask for a YAML description (title, components, assembly
# instructions, brief summary) following the example embedded in the prompt.
sys_prompt = """This is a scientific article about an integrated photonic circuit.
Based on the input article, extract the title of the implemented circuit,
the primitive components (these are on-chip components only),
the instructions how to assemble the components into the circuit layout,
and a short summary of how the chip is modelled or measured.
Answer in YAML following the template below.
If the paper discusses more than one circuit (e.g. two distinct variations, or a transmitter circuit and a receiver circuit),
add them as separate entries in the YAML list.
Do not add yaml quote to the output text.

Example YAML:
Circuit1:
    title: wavelength division demultiplexer
    components:
        - MZI_1 with a delta length of 200 micrometer
        - MZI_2 with a delta length of 100 micrometer
        - MZI_3 with a delta length of 100 micrometer
    assembly_instructions: |
        Take three MZIs each with one input ports and two output ports.
        Connect one output port of MZI_1 to the input port of MZI_2.
        Connect the other output port of MZI_1 to the the input port of MZI_3.
    brief_summary: |
        The silicon chip is fabricated using deep etch process.
        The chip was measured using a 1550 nm pulsed laser source."""

df["PreTemplate"] = ""
for index, row in tqdm(df.iterrows(), total=len(df)):
    # Skip rows that were never queried ("") or judged not to be a circuit ("NO").
    if row["circuit"] in ("", "NO"):
        continue
    # Unlike the earlier passes, this one uses the whole article and "gpt-4o".
    df.at[index, "PreTemplate"] = llm_api.call_openai(
        row["full_text"], sys_prompt=sys_prompt, model="gpt-4o"
    )

In [None]:
# Persist the frame, including all LLM-derived columns, as a parquet file.
df.to_parquet(path="db/AMF_papers.parquet")

In [None]:
# Spot-check the YAML produced for one paper (row label 238).
print(df.at[238, "PreTemplate"])
# Open questions to capture in a future version of the prompt:
# - details on the layout
# - what is the stack? standard SOI or SiN
# - what spatial and polarization mode is used? standard TE, or is it multimode?