## Load PDF texts into a DataFrame

In [None]:
import os

import fitz  # PyMuPDF
import pandas as pd

directory = "db/AMF_selected/"

# One record per PDF found directly inside `directory`; os.listdir does not
# descend into subfolders.
filenames = [
    {"filename": entry} for entry in os.listdir(directory) if entry.endswith(".pdf")
]
df = pd.DataFrame(filenames)


# Number of leading pages whose text is also stored separately in "text_header".
header_pages = 3

# Pre-create the output columns so every row has a value of the right type.
df["text_header"] = ""
df["N_pages"] = 0
df["text_full"] = ""
for index, row in df.iterrows():
    filepath = os.path.join(directory, row["filename"])

    # The context manager closes the document even when text extraction raises,
    # so a broken PDF no longer leaks an open file handle.
    with fitz.open(filepath) as doc:
        page_texts = [page.get_text() for page in doc]

    df.at[index, "N_pages"] = len(page_texts)
    # Join once instead of growing strings page by page.
    df.at[index, "text_header"] = "".join(page_texts[:header_pages])
    df.at[index, "text_full"] = "".join(page_texts)

# Persist the extracted texts; a later cell reloads this same parquet file.
df.to_parquet("db/AMF_SelectedPapers.parquet")

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()

## Extract paper titles with an LLM

In [None]:
import sys

sys.path.append("C:/Users/vansari/Documents/PhotonicAI")
from tqdm.notebook import tqdm

from PhotonicsAI.Photon import llm_api

# Alternative prompt, kept for reference:
# sys_prompt = 'Identify the main topics in this document.'
sys_prompt = "Identify and return the title of this document without modifications."

# Every row starts with an empty title; the loop below fills them in one by one.
df["title_4omini"] = ""
header_texts = df["text_header"]
for index, header_text in tqdm(header_texts.items(), total=len(header_texts)):
    # Only the header text (not the full document) is sent to the model.
    df.at[index, "title_4omini"] = llm_api.call_openai(
        header_text, sys_prompt=sys_prompt, model="gpt-4o-mini"
    )

In [None]:
# Overwrite the parquet file with the current DataFrame (run after the title
# cell to persist the "title_4omini" column).
df.to_parquet("db/AMF_SelectedPapers.parquet")

## extract components

In [None]:
import sys

import pandas as pd
import yaml
from pydantic import BaseModel

sys.path.append("C:/Users/vansari/Documents/PhotonicAI")
from PhotonicsAI.Photon import llm_api


class ComponentsResponse(BaseModel):
    """Schema handed to ``llm_api.callgpt_pydantic`` for one document.

    The field names match the labels given in parentheses in ``sys_prompt1``.
    """

    # Whether the document is a single academic article.
    single_article: bool
    # Whether the article's main topic is integrated photonic circuits.
    topic_photonic: bool
    # Photonic components used on the chip, if any.
    components_list: list[str]


df = pd.read_parquet("db/AMF_SelectedPapers.parquet")

# Result columns start as None; rows skipped below keep that value.
result_fields = ("single_article", "topic_photonic", "components_list")
for field in result_fields:
    df[field] = None

sys_prompt1 = """Is this a single academic article (single_article)?
Is the main topic of this article about integrated photonic circuits (topic_photonic)?
If yes, find the photonic components that are used on the chip.
Return a concise list of these photonic components, if any (components_list).
"""

# Only documents with fewer than 20 pages are sent to the LLM.
short_texts = df.loc[df["N_pages"] < 20, "text_full"]
for idx, full_text in short_texts.items():
    print(idx, "=======================================")
    r = llm_api.callgpt_pydantic(full_text, sys_prompt1, ComponentsResponse)
    print(yaml.dump(r.dict()))
    # Copy each parsed field into the column of the same name.
    for field in result_fields:
        df.at[idx, field] = getattr(r, field)

df.to_parquet("db/AMF_SelectedPapers.parquet")

In [None]:
# Preview the first rows of the DataFrame.
df.head()