## Rename PDF files

In [None]:
import os

directory = "db/AMF_selected"

# Sort the listing so the numbering is reproducible: os.listdir() returns
# entries in arbitrary order.
pdf_files = sorted(f for f in os.listdir(directory) if f.endswith(".pdf"))

# Phase 1: park every PDF under a temporary name that can never equal a final
# "NNN.pdf" name. Renaming straight to the target could overwrite a file that
# already carries that name but has not been processed yet (e.g. when this
# cell is run a second time, or when the folder already holds "001.pdf").
staged = []
for i, filename in enumerate(pdf_files, start=1):
    temp_name = f".renaming_{i:03}.tmp"
    temp_path = os.path.join(directory, temp_name)
    if os.path.exists(temp_path):
        # Leftover from an interrupted run; refuse to clobber it.
        raise FileExistsError(f"Temporary file already exists: {temp_path}")
    os.rename(os.path.join(directory, filename), temp_path)
    staged.append((temp_name, f"{i:03}.pdf"))  # zero-padded: 001.pdf, 002.pdf, ...

# Phase 2: all original names are gone now, so the final names are free.
for temp_name, new_name in staged:
    os.rename(os.path.join(directory, temp_name), os.path.join(directory, new_name))

print("Renaming completed.")

## Load texts into a DataFrame

In [None]:
import os

import fitz  # PyMuPDF
import pandas as pd

directory = "db/AMF_selected/"

# One row per PDF found in the directory.
filenames = [
    {"filename": filename}
    for filename in os.listdir(directory)
    if filename.endswith(".pdf")
]
df = pd.DataFrame(filenames)

# Pre-create the output columns so each row can be filled in place below.
df["text_header"] = ""
df["N_pages"] = 0
df["text_full"] = ""

# Number of leading pages whose text is kept as the "header" excerpt.
header_pages = 3

for index, row in df.iterrows():
    filepath = os.path.join(directory, row["filename"])

    # The context manager closes the document even if text extraction raises,
    # which a trailing doc.close() call would not.
    with fitz.open(filepath) as doc:
        page_texts = [
            doc.load_page(page_num).get_text() for page_num in range(len(doc))
        ]

    df.at[index, "N_pages"] = len(page_texts)
    # Text of the first `header_pages` pages only.
    df.at[index, "text_header"] = "".join(page_texts[:header_pages])
    # Text of every page, concatenated in page order.
    df.at[index, "text_full"] = "".join(page_texts)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line.
df.info()

## Get paper titles with an LLM

In [None]:
import sys

sys.path.append("/Users/vahid/Downloads/PhotonicsAI_Project")
from tqdm.notebook import tqdm

from PhotonicsAI.Photon import llm_api

# Alternative prompt kept for reference:
# sys_prompt = 'Identify the main topics in this document.'
sys_prompt = "Identify and return the title of this document without modifications."

# Send each paper's header text to the model and store the reply next to it.
df["title_4omini"] = ""
header_texts = df["text_header"]
for idx, header_text in tqdm(header_texts.items(), total=len(df)):
    df.at[idx, "title_4omini"] = llm_api.call_openai(
        header_text, sys_prompt=sys_prompt, model="gpt-4o-mini"
    )

In [None]:
# Persist the DataFrame so later cells can reload it from disk instead of
# rebuilding it.
df.to_parquet("db/AMF_SelectedPapers.parquet")

## Extract pre-templates with an LLM (1)

In [None]:
import sys

import pandas as pd

sys.path.append("/Users/vahid/Downloads/PhotonicsAI_Project")
from PhotonicsAI.Photon import llm_api

# df.sort_values(by='filename', inplace=True)
# df.reset_index(drop=True, inplace=True)
# df.to_parquet('db/AMF_SelectedPapers.parquet')


# Reload the table written to this path by the earlier save cell, so this
# section can run without repeating the extraction.
parquet_path = "db/AMF_SelectedPapers.parquet"
df = pd.read_parquet(parquet_path)

# Quick check: show which file sits at index label 0.
first_row = df.loc[0]
print(first_row["filename"])

In [None]:
# System prompt asking the model to describe each circuit in the article as
# YAML: a title, the on-chip primitive components, assembly instructions and a
# brief summary, with one entry per circuit. The text is sent verbatim, so any
# edit to it changes what the model is asked.
# NOTE(review): not referenced by any call in the visible cells — confirm
# whether it is still needed.
sys_prompt0 = """This is a scientific article about an integrated photonic circuit.
Based on the input article, extract the title of the implemented circuit,
the primitive components (these are on-chip components only),
the instructions how to assemble the components into the circuit layout,
and a short summary of how the chip is modelled or measured.
Answer in YAML following the template below.
If the paper discusses more than one circuit (e.g. two distinct variations, or a transmitter circuit and a receiver circuit),
add them as separate entries in the YAML list.
Do not add yaml quote to the output text.

Example YAML:
Circuit1:
    title: wavelength division demultiplexer
    components:
        - MZI_1 with a delta length of 200 micrometer
        - MZI_2 with a delta length of 100 micrometer
        - MZI_3 with a delta length of 100 micrometer
    assembly_instructions: |
        Take three MZIs each with one input ports and two output ports.
        Connect one output port of MZI_1 to the input port of MZI_2.
        Connect the other output port of MZI_1 to the the input port of MZI_3.
    brief_summary: |
        The silicon chip is fabricated using deep etch process.
        The chip was measured using a 1550 nm pulsed laser source."""


# System prompt that asks the model to act as a filter: keep passages about the
# on-chip circuit (material stack, layout, building blocks, connections) and
# drop everything else, without rephrasing. The text is sent verbatim, so any
# edit to it changes what the model is asked.
sys_prompt1 = """This scientific article includes some information about a photonic integrated circuit.

(Keep Category) I am only interested in the photonic integrated circuit and
    - the used material stack,
    - the layout design,
    - it's building blocks and primitive components,
    - how the individual components are connected on chip.

(Discard Category) However, the paper probably also include other information that I am not interested in e.g.
    - detail methods of nanofabrication,
    - engineering of a larger system,
    - engineering of any system off-chip,
    - how the experiemnt and the measurement were carried out,
    - author information, acknowledgements, and references,
    - etc.

Read the text. Discard any text from Discard Category and keep text related to Keep Category. Do not rephrase anything, just filter it.
"""


# Author's note (presumably row indices worth inspecting — confirm): 5, 9, 30
# print(df.loc[4]['filename'])

# Run the filtering prompt on the full text of the row labelled 0 and show
# the model's reply.
article_text = df.loc[0]["text_full"]
r = llm_api.call_openai(article_text, sys_prompt=sys_prompt1, model="gpt-4o-mini")
print(r)

In [None]:
# System prompt asking for a YAML list of the photonic components found on the
# chip, if the article describes a photonic integrated circuit at all. The text
# is sent verbatim, so any edit to it changes what the model is asked.
sys_prompt1 = """This is an academic article. Read the text.
Does it include any information about a photonic integrated circuit?
If yes, find the photonic components that are on the photonic chip.
Return a consise list of these photonic components (if any).
Output the list in a YAML format. Do not add yaml quote to the output text.
"""

# Walk the column positionally. Pairing range(df.shape[0]) with df.loc[i]
# treats a positional counter as an index label, which raises KeyError or
# visits rows in the wrong order whenever the index is not exactly 0..n-1
# (e.g. after filtering or sorting without reset_index).
for i, text_full in enumerate(df["text_full"]):
    r = llm_api.call_openai(
        text_full, sys_prompt=sys_prompt1, model="gpt-4o-mini"
    )
    print(i, "=======================================")
    print(r)

In [None]:
# Same sweep over every article, this time with the "gpt-4o" model. It uses
# whatever sys_prompt1 is currently defined in the session.
# Walk the column positionally: pairing range(df.shape[0]) with df.loc[i]
# treats a positional counter as an index label and breaks whenever the index
# is not exactly 0..n-1.
for i, text_full in enumerate(df["text_full"]):
    r = llm_api.call_openai(text_full, sys_prompt=sys_prompt1, model="gpt-4o")
    print(i, "=======================================")
    print(r)

In [None]:
from openai import OpenAI
from pydantic import BaseModel

# System prompt for the structured-output call below. Unlike the YAML variant,
# it does not ask for a particular output format. The text is sent verbatim,
# so any edit to it changes what the model is asked.
sys_prompt1 = """This is an academic article. Read the text.
Does it include any information about a photonic integrated circuit?
If yes, find the photonic components that are on the photonic chip.
Return a consise list of these photonic components (if any).
"""


# Response schema handed to the parse call as `response_format`.
# Documented with comments rather than a docstring: pydantic copies a class
# docstring into the generated JSON schema, which would change the request.
class ComponentsResponse(BaseModel):
    # Answer to the prompt's yes/no question about a photonic integrated circuit.
    contains_photonic_circuit: bool
    # Component names returned by the model; printed one per line by the caller.
    components_list: list[str]


client = OpenAI()

# Walk the column positionally. Pairing range(df.shape[0]) with df.loc[i]
# treats a positional counter as an index label, which raises KeyError or
# visits rows in the wrong order whenever the index is not exactly 0..n-1.
for i, text_full in enumerate(df["text_full"]):
    print(i, "=======================================")

    # Ask for a reply that is parsed straight into ComponentsResponse.
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[
            {"role": "system", "content": sys_prompt1},
            {"role": "user", "content": text_full},
        ],
        response_format=ComponentsResponse,
    )

    message = completion.choices[0].message
    if message.parsed:
        print(message.parsed.contains_photonic_circuit)
        print("\n".join(message.parsed.components_list))
    else:
        # No parsed payload: show the refusal field instead.
        print(message.refusal)

In [None]:
import fitz  # PyMuPDF
from openai import OpenAI
from pydantic import BaseModel

filepath = "db/AMF_selected/test2.pdf"


# Collect the text of every page in page order. The context manager closes the
# document even if extraction raises, and joining once avoids building the
# string through repeated concatenation.
with fitz.open(filepath) as doc:
    text_full = "".join(
        doc.load_page(page_num).get_text() for page_num in range(len(doc))
    )


# System prompt for the structured-output call: asks whether the article's main
# topic is an integrated photonic circuit and, if so, for its on-chip
# components. The text is sent verbatim, so any edit to it changes what the
# model is asked.
sys_prompt1 = """This is a text extracted from an article. Read the text.
Is the main topic of this article about an integrated photonic circuit?
If yes, find the photonic components that are on the photonic chip.
Return a consise list of these photonic components (if any).
"""


# Response schema handed to the parse call as `response_format`.
# Documented with comments rather than a docstring: pydantic copies a class
# docstring into the generated JSON schema, which would change the request.
class ComponentsResponse(BaseModel):
    # Answer to the prompt's question of whether the main topic is an
    # integrated photonic circuit.
    topic_photonic: bool
    # Component names returned by the model; printed one per line by the caller.
    components_list: list[str]


client = OpenAI()

# System prompt first, then the extracted article text as the user turn.
chat_messages = [
    {"role": "system", "content": sys_prompt1},
    {"role": "user", "content": text_full},
]

# Request a reply that is parsed straight into ComponentsResponse.
completion = client.beta.chat.completions.parse(
    model="gpt-4o-2024-08-06",
    messages=chat_messages,
    response_format=ComponentsResponse,
)

message = completion.choices[0].message
parsed_reply = message.parsed
if not parsed_reply:
    # No parsed payload: show the refusal field instead.
    print(message.refusal)
else:
    print(parsed_reply.topic_photonic)
    print("\n".join(parsed_reply.components_list))

In [None]:
# Bare expression as the last line of a notebook cell: shows the parsed
# response object of the most recent `message` as the cell output.
message.parsed