## load texts into dataframe

In [None]:
import os

import fitz  # PyMuPDF
import pandas as pd

# Build a DataFrame with one row per selected PDF, holding the page count,
# the text of the leading pages ("header") and the text of the whole document.
directory = "db/AMF_selected/"

# Number of leading pages whose text is stored separately in `text_header`.
header_pages = 3

# Collect the PDF files and order them by the integer in front of the first
# dot of the file name; a non-numeric prefix raises ValueError here.
filenames = [
    {"filename": filename}
    for filename in os.listdir(directory)
    if filename.endswith(".pdf")
]
filenames.sort(key=lambda x: int(x["filename"].split(".")[0]))

# Keep only a fixed subset, addressed by position in the sorted listing.
# Raises IndexError if the directory holds fewer than 31 PDFs.
filenames = [filenames[i] for i in [0, 3, 4, 5, 9, 24, 30]]


df = pd.DataFrame(filenames)
df["text_header"] = ""
df["N_pages"] = 0
df["text_full"] = ""
for index, row in df.iterrows():
    filepath = os.path.join(directory, row["filename"])

    # The context manager closes the document even when extraction fails,
    # which a trailing doc.close() would not.
    with fitz.open(filepath) as doc:
        page_texts = [
            doc.load_page(page_num).get_text() for page_num in range(len(doc))
        ]

    df.at[index, "N_pages"] = len(page_texts)
    # Text of the first `header_pages` pages only.
    df.at[index, "text_header"] = "".join(page_texts[:header_pages])
    df.at[index, "text_full"] = "".join(page_texts)

# Saving is disabled; re-enable the next line to persist the result.
# df.to_parquet('db/AMF_SelectedPapers2.parquet')

print(df["filename"])
print()
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
df.info()

## entity extraction

In [None]:
# Setup for the entity-extraction cell: make the project package importable
# and load the .env file before calling the LLM helper.
import sys

# NOTE(review): machine-specific absolute path; the PhotonicsAI import below
# depends on it unless the package is installed some other way.
sys.path.append("/Users/vahid/Downloads/PhotonicsAI_Project")
from dotenv import load_dotenv

from PhotonicsAI.Photon import llm_api

# Path is relative to the current working directory.
# Presumably provides the credentials llm_api needs — TODO confirm.
load_dotenv(dotenv_path="../../.env")
import pandas as pd
import yaml

# Run the LLM entity extraction on the full text of every paper and store the
# raw result next to the text it was derived from.
df["PaperEntities1"] = None

# Entries of the extraction result echoed to stdout for a quick visual check.
report_keys = (
    "topic_photonic",
    "single_article",
    "circuit_complete",
    "components_list",
)

for idx in df.index:
    print()
    print(idx, "===============")

    entities = llm_api.papers_entity_extraction(df.at[idx, "text_full"])
    df.at[idx, "PaperEntities1"] = entities

    for key in report_keys:
        print(key, entities[key])

## extract components

In [None]:
# Setup for the component-extraction cell.
# NOTE(review): `yaml` is used further down (yaml.dump) but is not imported
# here; this cell relies on the import in the entity-extraction cell above.
import sys

import pandas as pd
from pydantic import BaseModel

# NOTE(review): machine-specific absolute path, and different from the one
# appended in the entity-extraction cell.
sys.path.append("C:/Users/vansari/Documents/PhotonicAI")
from PhotonicsAI.Photon import llm_api


# Structured answer requested from the LLM for one paper; passed to
# llm_api.callgpt_pydantic together with the text and `sys_prompt1`.
# Documented with comments rather than a docstring because pydantic may
# include a class docstring in the generated schema.
class ComponentsResponse(BaseModel):
    # Whether the text is a single academic article.
    single_article: bool
    # Whether the main topic is integrated photonic circuits.
    topic_photonic: bool
    # Photonic components used on the chip, if any.
    components_list: list[str]


# Classify every short paper with the LLM and write the structured answer back
# into the same parquet file the papers were loaded from.
df = pd.read_parquet("db/AMF_SelectedPapers.parquet")

# Columns filled from the LLM answer; rows that are skipped keep None.
for result_column in ("single_article", "topic_photonic", "components_list"):
    df[result_column] = None

sys_prompt1 = """Is this a single academic article (single_article)?
Is the main topic of this article about integrated photonic circuits (topic_photonic)?
If yes, find the photonic components that are used on the chip.
Return a concise list of these photonic components, if any (components_list).
"""

for idx, row in df.iterrows():
    # Only documents shorter than 20 pages are sent to the model.
    if not row["N_pages"] < 20:
        continue

    print(idx, "=======================================")
    answer = llm_api.callgpt_pydantic(
        row["text_full"],
        sys_prompt1,
        ComponentsResponse,
    )
    print(yaml.dump(answer.dict()))

    # Copy each field of the structured answer into its column.
    for field_name in ("single_article", "topic_photonic", "components_list"):
        df.at[idx, field_name] = getattr(answer, field_name)

# Overwrites the input file with the enriched table.
df.to_parquet("db/AMF_SelectedPapers.parquet")

In [None]:
# Preview the first rows of the enriched table (shown as the cell output
# when this is the last expression of a notebook cell).
df.head()