## load texts into dataframe

In [None]:
import os

import fitz  # PyMuPDF
import pandas as pd

# Folder holding the source PDFs; every file ending in ".pdf" becomes one row.
directory = "db/AMF/"

# Number of leading pages whose text is stored separately in "text_header".
header_pages = 3

# os.listdir() returns entries in arbitrary order, so sort them to make the
# row order of the resulting dataframe reproducible between runs.
filenames = [
    {"filename": filename}
    for filename in sorted(os.listdir(directory))
    if filename.endswith(".pdf")
]

records = []
for entry in filenames:
    filepath = os.path.join(directory, entry["filename"])

    # The context manager closes the document even when text extraction
    # raises, which the previous explicit doc.close() did not guarantee.
    with fitz.open(filepath) as doc:
        page_texts = [
            doc.load_page(page_num).get_text() for page_num in range(len(doc))
        ]

    records.append(
        {
            "filename": entry["filename"],
            # Text of the first `header_pages` pages only.
            "text_header": "".join(page_texts[:header_pages]),
            "N_pages": len(page_texts),
            "text_full": "".join(page_texts),
        }
    )

# Passing `columns` keeps the four columns present even when no PDF was found.
df = pd.DataFrame(
    records, columns=["filename", "text_header", "N_pages", "text_full"]
)

df.to_parquet("db/AMF_papers.parquet")

# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print().
df.info()

## get paper title by LLM

In [None]:
import sys

sys.path.append("C:/Users/vansari/Documents/PhotonicAI")
from tqdm.notebook import tqdm

from PhotonicsAI.Photon import llm_api

# An alternative prompt tried earlier asked for the main topics instead:
# 'Identify the main topics in this document.'
sys_prompt = "Identify and return the title of this document without modifications."

# Every paper starts with an empty title; the loop below fills the column in
# one row at a time from the text of its header pages.
df["title_4omini"] = ""
header_texts = df["text_header"]
for index, header_text in tqdm(header_texts.items(), total=len(df)):
    df.at[index, "title_4omini"] = llm_api.call_openai(
        header_text, sys_prompt=sys_prompt, model="gpt-4o-mini"
    )

In [None]:
# Overwrite the parquet file with the current dataframe, which at this point
# also carries the "title_4omini" column filled in by the previous cell.
df.to_parquet("db/AMF_papers.parquet")

## extract components

In [None]:
import sys

import pandas as pd
import yaml
from pydantic import BaseModel

sys.path.append("C:/Users/vansari/Documents/PhotonicAI")
from PhotonicsAI.Photon import llm_api


# Structured answer requested from the LLM for one paper. The field names
# match the parenthesised names used in `sys_prompt1`.
# NOTE(review): documented with `#` comments rather than a docstring because
# pydantic may surface a class docstring in the generated schema, which could
# change what is sent to the model - confirm before adding one.
class ComponentsResponse(BaseModel):
    # Whether the text is a single academic article (not a dissertation or a
    # collection of papers), as asked in the prompt.
    single_article: bool
    # Whether the main topic is integrated photonic circuits.
    topic_photonic: bool
    # Names of the photonic components found on the chip; may be empty.
    components_list: list[str]


# Only papers with fewer pages than this are sent to the LLM.
# NOTE(review): presumably this keeps dissertations and very long documents
# out of the request - confirm the intended cut-off.
MAX_PAGES = 20

df = pd.read_parquet("db/AMF_papers.parquet")
# None marks papers that were skipped (too long) or not processed yet.
df["single_article"] = None
df["topic_photonic"] = None
df["components_list"] = None

sys_prompt1 = """Is this a single academic article, and not a dissertation or collection of papers (single_article)?
Is the main topic of this article about integrated photonic circuits (topic_photonic)?
If yes, find the photonic components that are used on the chip.
Return a concise list of these photonic components, if any (components_list).
"""

for idx, row in df.iterrows():
    # Written as a negated "<" so the skip condition is exactly the
    # complement of the original `N_pages < 20` test.
    if not (row["N_pages"] < MAX_PAGES):
        continue

    print(idx, "=======================================")
    r = llm_api.callgpt_pydantic(row["text_full"], sys_prompt1, ComponentsResponse)

    # pydantic v2 renamed .dict() to .model_dump(); support either version.
    response_fields = r.model_dump() if hasattr(r, "model_dump") else r.dict()
    print(yaml.dump(response_fields))

    df.at[idx, "single_article"] = r.single_article
    df.at[idx, "topic_photonic"] = r.topic_photonic
    df.at[idx, "components_list"] = r.components_list

# Persist the annotations: the embeddings cell further down re-reads this
# parquet file and expects the "components_list" column to be in it.
df.to_parquet("db/AMF_papers.parquet")

In [None]:
# Preview the first rows of the dataframe, including the columns added by the
# component-extraction cell above.
df.head()

In [None]:
# Flatten the per-paper component lists into one Series (papers without a
# list are dropped) and count how often each component name occurs.
component_occurrences = df["components_list"].dropna().explode()
all_comp_list = component_occurrences.value_counts()
print(len(all_comp_list))

# Turn the counts into a two-column table with the tally under "count".
item_counts_df = all_comp_list.reset_index(name="count")

# Show rows 100-149 of the frequency table.
print(item_counts_df.iloc[100:150])

In [None]:
# Presumably meant to display the contents of cluster 12.
# NOTE(review): `clustered_sentences` is not assigned in any visible cell; the
# clustering cell stores its groups in `corpus_clustered`, so this looks like
# a stale name. Confirm before running - as written it raises NameError unless
# the variable was defined elsewhere.
(clustered_sentences[12])

In [None]:
import sys

sys.path.append("C:/Users/vansari/Documents/PhotonicAI")
import pandas as pd
from langchain_openai import OpenAIEmbeddings

from PhotonicsAI.Photon import llm_api

embeddings_model = OpenAIEmbeddings()


df = pd.read_parquet("db/AMF_papers.parquet")

# Flatten the per-paper component lists, then keep only non-blank strings
# (this drops missing values and empty entries) with whitespace trimmed.
corpus = df["components_list"].dropna().explode().to_list()
corpus = [item.strip() for item in corpus if isinstance(item, str) and item.strip()]

# The same component name appears in many papers. Embed each distinct phrase
# once and map the vectors back, so `corpus_embeddings` still lines up with
# `corpus` item by item but repeated phrases are not sent to the API again.
unique_phrases = list(dict.fromkeys(corpus))
unique_vectors = (
    embeddings_model.embed_documents(unique_phrases) if unique_phrases else []
)
vector_by_phrase = dict(zip(unique_phrases, unique_vectors))
corpus_embeddings = [vector_by_phrase[phrase] for phrase in corpus]

print(len(corpus_embeddings))
# Guarded so an empty corpus does not raise IndexError.
if corpus_embeddings:
    print(corpus_embeddings[0])

In [None]:
from sklearn.cluster import AgglomerativeClustering

# An earlier variant of this cell embedded the corpus locally with
# SentenceTransformer("all-mpnet-base-v2") instead of reusing
# `corpus_embeddings`; the note left next to it reads "203".

# Agglomerative clustering with no fixed cluster count: the number of
# clusters is determined by the distance threshold.
clustering_model = AgglomerativeClustering(distance_threshold=1.5, n_clusters=None)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

# Group the corpus phrases by the cluster label assigned to them.
corpus_clustered = {}
for position, label in enumerate(cluster_assignment):
    corpus_clustered.setdefault(label, []).append(corpus[position])

# To inspect the groups, iterate corpus_clustered.items() and print each one.

print("===============")
print(len(corpus_clustered))

In [None]:
from sklearn.decomposition import PCA

%matplotlib inline

# pyplot is imported here because this cell uses `plt`, which the notebook
# otherwise first imports in the later t-SNE cell; without it a top-to-bottom
# run fails with NameError.
import matplotlib.pyplot as plt

# Reduce dimensionality to 2D using PCA
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(corpus_embeddings)

# Plot the PCA result, colouring each point by its cluster label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    embeddings_2d[:, 0], embeddings_2d[:, 1], c=cluster_assignment, cmap="viridis"
)

# Add a colour bar for the cluster labels and a title
plt.colorbar(scatter)
plt.title("PCA visualization of clusters")
plt.show()

In [None]:
import numpy as np
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt

# Coerce the embeddings into a NumPy array before fitting.
corpus_embeddings = np.array(corpus_embeddings)

# Project the embeddings onto two dimensions with t-SNE; the fixed seed keeps
# the layout the same between runs.
tsne = TSNE(random_state=42, n_components=2)
embeddings_2d = tsne.fit_transform(corpus_embeddings)

# Scatter plot of the 2-D projection, coloured by cluster label.
tsne_x, tsne_y = embeddings_2d[:, 0], embeddings_2d[:, 1]
plt.figure(figsize=(10, 8))
scatter = plt.scatter(tsne_x, tsne_y, cmap="viridis", c=cluster_assignment)
plt.colorbar(scatter)
plt.title("t-SNE visualization of clusters")
plt.show()

In [None]:
import ast

sys_prompt = """This is a list of photonic components.
Some of them are similar or synonymous.
Collapse all synonymous or redundant components into a single phrase.
Return only a valid Python list of strings with no additional text or formatting (like python quotes).
"""


def _parse_component_list(reply):
    """Parse an LLM reply into a Python list of component names.

    Args:
        reply: Raw text returned by the model. Expected to be a Python list
            literal, optionally wrapped in a Markdown code fence.

    Returns:
        The parsed list.

    Raises:
        ValueError: If the reply is not a valid Python literal or the literal
            is not a list.
    """
    cleaned = reply.strip()
    # Despite the prompt, a model may wrap its answer in a ``` fence; remove
    # the fence and a leading language tag line (e.g. "python") if present.
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        first_line, _, remainder = cleaned.partition("\n")
        if not first_line.lstrip().startswith("["):
            cleaned = remainder
    try:
        # En dashes are normalised to hyphens before parsing, as before.
        parsed = ast.literal_eval(cleaned.replace("–", "-"))
    except (SyntaxError, ValueError, TypeError) as err:
        raise ValueError(f"reply is not a Python literal: {err}") from err
    if not isinstance(parsed, list):
        raise ValueError(f"expected a list, got {type(parsed).__name__}")
    return parsed


# Maps each cluster id to the LLM-deduplicated list of its component names.
corpus_clustered_distilled = {}

for i, cluster in corpus_clustered.items():
    print("===========", i)
    r = llm_api.call_openai(str(cluster), sys_prompt=sys_prompt, model="gpt-4o")
    try:
        # Parse once and reuse the result for both storage and reporting.
        distilled = _parse_component_list(r)
    except ValueError as err:
        # One malformed reply should not abort the whole (paid) run: keep the
        # undistilled cluster and report it so it can be retried.
        print(f"cluster {i}: could not parse LLM reply ({err}); keeping original items")
        distilled = list(cluster)
    corpus_clustered_distilled[i] = distilled
    # Cluster size before and after distillation.
    print(len(cluster), len(distilled))
    print()

In [None]:
import copy

# Columns of a DataFrame built from a dict of lists must all have the same
# length, so every cluster is right-padded with None up to the longest one.
# The padded lists are fresh copies; `corpus_clustered_distilled` is untouched.
max_len = max(len(members) for members in corpus_clustered_distilled.values())
dis_ = {
    cluster_id: list(members) + [None] * (max_len - len(members))
    for cluster_id, members in corpus_clustered_distilled.items()
}

# One column per cluster id; note that this rebinds `df`.
df = pd.DataFrame(dis_)
df.to_csv("db/corpus_clustered_distilled.csv", index=False)

In [None]:
import pickle

# Bundle the intermediate results of the component analysis under the same
# names they have in the notebook, then write them to a single pickle file.
data = {
    "corpus": corpus,
    "corpus_embeddings": corpus_embeddings,
    "corpus_clustered": corpus_clustered,
    "corpus_clustered_distilled": corpus_clustered_distilled,
}

with open("db/AMF_components.pkl", "wb") as pkl_file:
    pickle.dump(data, pkl_file)

# The bundle can be read back by opening the same file in "rb" mode and
# calling pickle.load on it.

In [None]:
import pickle

# NOTE: unpickling can execute arbitrary code. Only load files that were
# written by this notebook, never a pickle from an untrusted source.
with open("db/AMF_components.pkl", "rb") as f:
    data = pickle.load(f)

# Restore every stored entry (corpus, corpus_embeddings, corpus_clustered,
# corpus_clustered_distilled) as a top-level notebook variable. globals() is
# used explicitly: assigning through locals() only takes effect at module
# scope and would silently stop working if this code moved into a function.
globals().update(data)

# To inspect the restored phrases, iterate over `corpus` and print each item.