# Script to produce the plots for Fig. 2 (NeurIPS AI4Mat)


In [None]:
import json
import os

from llm_synthesis.utils.style_utils import get_cmap, get_palette, set_style

# Fetch the colormap and palette from llm_synthesis.utils.style_utils and
# call set_style() (presumably applies the project's shared plotting style —
# confirm against style_utils).
cmap = get_cmap()
palette = get_palette()
set_style()

In [None]:
from datasets import load_dataset

# Load the "LeMaterial/LeMat-Synth" dataset via datasets.load_dataset and
# echo the resulting object as the cell output.
ds = load_dataset("LeMaterial/LeMat-Synth")
ds

In [None]:
# Convert the "sample_for_evaluation" entry of the dataset to a pandas
# DataFrame; every later cell that reads `df` works on this frame.
df = ds["sample_for_evaluation"].to_pandas()

# Preview the first rows.
df.head()

In [None]:
# Inspect the distinct values of paper_published_date before truncating them.
df["paper_published_date"].unique()

In [None]:
# Keep only the first four characters of each paper_published_date string,
# overwriting the column in place (presumably the year — confirm the dates
# always start with YYYY).
df["paper_published_date"] = df["paper_published_date"].str[:4]

In [None]:
# Label every row with the origin of its paper, derived from paper_url:
#   contains "arxiv.org"    -> "arxiv"
#   contains "chemrxiv.org" -> "chemrxiv"
#   anything else           -> "omg24"


def _source_of(paper_url):
    """Return the source label for a single paper URL."""
    if "arxiv.org" in paper_url:
        return "arxiv"
    if "chemrxiv.org" in paper_url:
        return "chemrxiv"
    return "omg24"


df["source"] = df["paper_url"].apply(_source_of)

# Number of rows per source.
df.groupby("source").size()

In [None]:
# Preview the four columns that the next cell writes to CSV.
df[["material_category", "synthesis_method", "paper_published_date", "source"]]

In [None]:
# Write the statistics columns to a CSV file in the current working directory.
stats_columns = [
    "material_category",
    "synthesis_method",
    "paper_published_date",
    "source",
]

output_dir = os.getcwd()
file_path = os.path.join(output_dir, "dataset_statistics_with_source.csv")

stats_frame = df[stats_columns]
stats_frame.to_csv(file_path)

In [None]:
from datasets import load_dataset

# Load the "arxiv" split of the "LeMaterial/LeMat-Synth-Papers" dataset.
# NOTE(review): the documented load_dataset parameter for selecting a dataset
# configuration is `name`; confirm that `subset="full"` is accepted by the
# installed `datasets` version and actually selects the "full" configuration.
ds_paper = load_dataset(
    "LeMaterial/LeMat-Synth-Papers", subset="full", split="arxiv"
)

# Convert to pandas; the annotation loop looks papers up in this frame by "id".
df_paper = ds_paper.to_pandas()

# Preview the first rows.
df_paper.head()

In [None]:
# List the column names of the LeMat-Synth evaluation frame.
df.columns

In [None]:
annotation_folder = (
    "/Users/magdalenalederbauer/Code/lematerial-llm-synthesis/annotations"
)

# Walk the annotation folder. Each entry is named after a paper id and is
# expected to contain the LLM extraction (result.json) and, optionally, a
# human annotation (result_human.json). For every paper that has LeMat-Synth
# rows, the annotated values are written into those rows.
#
# NOTE(review): matched_lemat_synth_entry is rebound on every iteration and is
# never written back to df in this cell, so only the rows of the last paper
# processed remain available after the loop.
for subdir in os.listdir(annotation_folder):
    # Folder names use "cond-mat." where the paper ids use "cond-mat/".
    paper_id = subdir.replace("cond-mat.", "cond-mat/")
    synthesis_procedures_of_paper = df_paper[df_paper["id"] == paper_id]
    if synthesis_procedures_of_paper.empty:
        # Stray entries (e.g. ".DS_Store") or unknown ids have no paper row;
        # indexing .values[0] on an empty selection would raise IndexError.
        continue
    url_of_paper = (
        synthesis_procedures_of_paper["pdf_url"]
        .values[0]
        .replace("https://", "")
    )
    # The URL is a literal substring, not a pattern: regex=False keeps
    # characters such as "." and "?" from being read as regex syntax, and
    # na=False treats missing URLs as non-matches. .copy() makes the row
    # edits below act on an independent frame rather than a slice of df.
    matched_lemat_synth_entry = df[
        df["paper_url"].str.contains(url_of_paper, regex=False, na=False)
    ].copy()
    if matched_lemat_synth_entry.empty:
        continue
    if (
        matched_lemat_synth_entry["synthesized_material"].values[0]
        == "No materials synthesized"
    ):
        continue

    result_llm = os.path.join(annotation_folder, subdir, "result.json")
    result_human = os.path.join(annotation_folder, subdir, "result_human.json")

    # Context managers close the files even if JSON parsing fails.
    with open(result_llm, encoding="utf-8") as llm_file:
        llm_ontology = json.load(llm_file)
    try:
        with open(result_human, encoding="utf-8") as human_file:
            human_ontology = json.load(human_file)
    except FileNotFoundError:
        # No human annotation: use one empty dict per LLM entry so the two
        # lists can still be zipped.
        human_ontology = [{} for _ in llm_ontology]

    # zip() stops at the shorter list, so LLM entries beyond the number of
    # human entries are not processed.
    for idx, (item_llm, item_human) in enumerate(
        zip(llm_ontology, human_ontology)
    ):
        mat_name = item_llm["material"]
        synthesis = item_llm["synthesis"]
        evaluation_llm = item_llm["evaluation"]
        evaluation_human = item_human["evaluation"] if item_human else None
        # Overwrite row `idx` of the matched rows with the annotated values.
        # NOTE(review): assigning a dict to a row appears to align on column
        # labels, which may blank out columns that are not listed here and
        # drop keys that are not existing columns — confirm this is intended.
        try:
            matched_lemat_synth_entry.iloc[idx] = {
                "synthesized_material": mat_name,
                "synthesis": synthesis,
                "synthesis_extraction_performance_llm": evaluation_llm,
                "synthesis_extraction_performance_human": evaluation_human,
            }
        except Exception as exc:
            # Best-effort diagnostics (e.g. more annotated entries than
            # matched rows): report what failed and move on to the next paper.
            print(f"Error filling row {idx} of {paper_id}")
            print(repr(exc))
            print(matched_lemat_synth_entry)
            print(mat_name)
            print(synthesis)
            print(evaluation_llm)
            print(evaluation_human)
            break

In [None]:
# Show the keys of the first entry of llm_ontology, as left by the annotation
# loop above (the last result.json it loaded).
llm_ontology[0].keys()

In [None]:
# Display the matched LeMat-Synth rows as left by the annotation loop above.
matched_lemat_synth_entry