# **LeMaterial/LeMat-Synth Dataset Analysis**


This notebook presents a data analysis of the LeMaterial/LeMat-Synth dataset.


## **Available Dataset Splits**

- **arxiv**: ArXiv research papers
- **chemrxiv**: ChemRxiv chemistry papers
- **omg24**: OMG24 conference papers
- **sample_for_evaluation**: Evaluation samples


## **Load libraries**


In [None]:
# Import required libraries
import ast
import json
import re
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
from wordcloud import WordCloud

from llm_synthesis.utils.style_utils import set_style

# set_style() comes from llm_synthesis.utils.style_utils; presumably it
# configures the plotting style used below -- TODO confirm.
set_style()

# Silence every warning for the remainder of the notebook.
warnings.filterwarnings("ignore")

# Lift pandas' display limits so that printed DataFrames show all of their
# rows and columns instead of a truncated view.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

## **Data set exploration**


### **Understand the data**


Create a new column, `source`, holding the cleaned domain name of each paper URL.


Examine the columns, data types, and basic statistics.


In [None]:
from datasets import load_dataset

# Load only the "sample_for_evaluation" split of LeMaterial/LeMat-Synth.
dataset = load_dataset("LeMaterial/LeMat-Synth", split="sample_for_evaluation")

# Convert to a pandas DataFrame; every later cell works on `data`.
data = dataset.to_pandas()
# Last expression of the cell: shows the first rows of the DataFrame.
data.head()

In [None]:
# Extract the host name (the text between "//" and the next "/") from each
# URL. expand=False makes str.extract return a Series instead of a one-column
# DataFrame, which is the right shape for assigning to a single column.
data["source"] = (
    data["paper_url"].astype(str).str.extract(r"//([^/]+)", expand=False)
)

# Cleaning: strip common sub-domain prefixes and top-level-domain suffixes,
# so that e.g. "www.nature.com" becomes "nature".
data["source"] = (
    data["source"]
    .str.replace(r"^www\.", "", regex=True)
    .str.replace(r"^onlinelibrary\.", "", regex=True)
    .str.replace(r"^link\.", "", regex=True)
    .str.replace(r"\.org$|\.com$|\.net$|\.edu$|\.info$", "", regex=True)
)

# Quick visual check of the URL -> source mapping.
print(data[["paper_url", "source"]].head())

In [None]:
# Show the DataFrame dimensions as a (rows, columns) tuple.
print(data.shape)

In [None]:
# Check the data type of each column (info() also reports non-null counts).
data.info()

In [None]:
# Summary statistics; with no arguments describe() covers only the numeric
# columns of a mixed-dtype DataFrame.
data.describe()

### **Handle missing values**


Identify and address any missing values in the DataFrame.


In [None]:
# Share of missing values per column, in percent. isnull().mean() is the
# fraction of missing rows, so the numbers now match the printed label
# (the previous isnull().sum() produced raw counts, not percentages).
missing_percentages = data.isnull().mean() * 100
print("Percentage of missing values per column:")
print(missing_percentages)

# The 'images' column is not used by this analysis; drop it when present.
if "images" in data.columns:
    data = data.drop("images", axis=1)
    print("\n'images' column dropped.")

# Fill remaining missing values with "N/A"
# (the cleaning helpers further down filter this placeholder out again).
data = data.fillna("N/A")
print("\nRemaining missing values filled with 'N/A'.")

# Verify that there are no more missing values
print("\nMissing values after handling:")
print(data.isnull().sum().sum())

In [None]:
# Drop all rows where synthesized_material == "No materials synthesized".
# .copy() makes the result an independent DataFrame, so the columns that
# later cells add to `data` are not written onto a filtered slice (which
# would raise SettingWithCopyWarning if warnings were not silenced).
data = data[data["synthesized_material"] != "No materials synthesized"].copy()

## **Data Visualisation**


Create visualizations to understand the distribution and relationships within the data.


### **Summary statistics of all categorical variables**


In [None]:
# Explore basic summary statistics of the categorical (object-dtype) columns
# only; numeric columns are excluded by include=["object"].
data.describe(include=["object"])

### **Word cloud of actions**


In [None]:
# Collect the "action" of every step in each structured synthesis record.
actions = []
for raw in data["structured_synthesis"]:
    # Records may be stored as JSON strings or as already-parsed objects.
    try:
        record = json.loads(raw) if isinstance(raw, str) else raw
    except json.JSONDecodeError:
        # Not valid JSON (e.g. the "N/A" placeholder): skip the row.
        continue
    if not isinstance(record, dict):
        # Only a mapping can carry a "steps" entry; skip anything else
        # instead of failing on the missing .get() method.
        continue

    steps = record.get("steps")
    # Explicit None test: truth-testing an array-like value could raise.
    if steps is None:
        continue
    try:
        step_iterator = iter(steps)
    except TypeError:
        # "steps" is present but not iterable.
        continue

    for step in step_iterator:
        if not isinstance(step, dict):
            continue
        action = step.get("action")
        # Keep non-empty string actions only.
        if isinstance(action, str) and action:
            actions.append(action)

print(f"Total actions extracted: {len(actions)}")

# Frequency calculation
freq = Counter(actions)

# Preparation and generation of the word cloud
if freq:
    wc = WordCloud(
        width=800,
        height=400,
        background_color="white",
        colormap="tab10",
        relative_scaling=0.5,
        normalize_plurals=False,
    )
    wc.generate_from_frequencies(freq)

    # Display
    plt.figure(figsize=(12, 6))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    # Title repaired: the proportionality sign was mis-encoded and the
    # closing parenthesis was doubled.
    plt.title(
        "Word cloud of summary actions (size ∝ frequency)", fontsize=18, pad=20
    )
    plt.show()
else:
    print("No actions found to generate a word cloud.")

In [None]:
# Bar plot of action frequencies, ordered from most to least frequent via
# seaborn's `order` parameter. `freq` is the Counter of synthesis actions.
plt.figure(figsize=(30, 6))

sorted_categories = sorted(freq, key=freq.get, reverse=True)

sns.barplot(x=list(freq.keys()), y=list(freq.values()), order=sorted_categories)
plt.xticks(rotation=45, ha="right")
# Labels corrected: the plotted values are synthesis actions, not materials.
plt.xlabel("Action")
plt.ylabel("Frequency")
plt.title("Frequency of Synthesis Actions (Using order parameter)")
plt.tight_layout()
plt.show()

In [None]:
# Same chart as the previous cell in a more compact figure; here the data
# itself is pre-sorted instead of relying on seaborn's `order` parameter.
plt.figure(figsize=(12, 6))

# Sort by frequency (descending) - most frequent first
sorted_items = freq.most_common()
categories = [action for action, _ in sorted_items]
values = [count for _, count in sorted_items]

sns.barplot(x=categories, y=values)
plt.xticks(rotation=45, ha="right")  # ha='right' for better alignment
# Labels corrected: the plotted values are synthesis actions, not materials.
plt.xlabel("Action")
plt.ylabel("Frequency")
plt.title("Frequency of Synthesis Actions (Sorted by Frequency)")
plt.tight_layout()  # Prevents label cutoff
plt.show()

### **Distribution of source**


In [None]:
# One bar per distinct value of the 'source' column, height = row count.
fig_source, ax_source = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x="source", ax=ax_source)
ax_source.set_title("Distribution of Data Sources")
plt.show()

In [None]:
# Number of rows per source, most frequent first.
data["source"].value_counts()

### **Top synthesized materials**


In [None]:
# Simple cleaning function
def clean_material(material):
    """Normalise a synthesized-material label for counting.

    Every character that is not an ASCII letter, digit or whitespace is
    removed, surrounding whitespace is stripped, and the result is passed
    through ``str.capitalize()`` (first character upper-case, the rest
    lower-case).

    Args:
        material: Raw value from the ``synthesized_material`` column.

    Returns:
        The normalised label, or ``None`` when the input is not a string,
        when nothing is left after cleaning, or when the label is a generic
        placeholder ("na", "other", "others", "no materials synthesized").
    """
    if not isinstance(material, str):
        return None

    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", material).strip().capitalize()

    # A label made only of punctuation/whitespace collapses to "" and must
    # not be counted as a material.
    if not cleaned:
        return None

    # "N/A" has already lost its slash at this point, so it is matched by
    # the "na" entry.
    excluded = {"na", "other", "others", "no materials synthesized"}
    if cleaned.lower() in excluded:
        return None
    return cleaned


# Normalise each material label; entries rejected by clean_material are None.
data["cleaned_synthesized_material"] = data["synthesized_material"].apply(
    clean_material
)

# Tally the surviving labels (missing entries are removed by dropna()).
valid_materials = data["cleaned_synthesized_material"].dropna()
material_counts = Counter(valid_materials)

# Keep the 15 most frequent materials in a DataFrame for plotting.
top_materials = material_counts.most_common(15)
df_materials = pd.DataFrame(
    top_materials, columns=["Synthesized Material", "Count"]
)

# Horizontal bar chart: one bar per material, length = occurrences.
fig_materials, ax_materials = plt.subplots(figsize=(12, 7))
sns.barplot(
    data=df_materials, x="Count", y="Synthesized Material", ax=ax_materials
)
ax_materials.set_title("Top 15 Most Frequent Synthesized Materials")
ax_materials.set_xlabel("Number of Occurrences")
ax_materials.set_ylabel("Synthesized Material")
plt.tight_layout()
plt.show()

### **Frequency of material categories**


In [None]:
# Simplified function for cleaning a single category
def clean_category(cat):
    """Normalise a single category-like label for counting.

    Every character that is not an ASCII letter, digit or whitespace is
    removed, surrounding whitespace is stripped, and the result is passed
    through ``str.capitalize()``.

    Args:
        cat: Raw label (this notebook applies the function to the
            ``material_category`` and ``synthesis_method`` columns).

    Returns:
        The normalised label, or ``None`` when the input is not a string,
        when nothing is left after cleaning, or when the label is one of the
        uninformative terms "material", "science", "other", "others", "na".
    """
    if not isinstance(cat, str):
        return None

    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", cat).strip().capitalize()

    # Punctuation-only input collapses to ""; do not count it as a category.
    if not cleaned:
        return None

    # "N/A" has already lost its slash here, so "na" covers it.
    irrelevant = {"material", "science", "others", "other", "na"}
    if cleaned.lower() in irrelevant:
        return None
    return cleaned


# Normalise each category label; entries rejected by clean_category are None.
data["cleaned_category"] = data["material_category"].apply(clean_category)

# Tally the surviving labels (missing entries are removed by dropna()).
valid_categories = data["cleaned_category"].dropna()
category_counts = Counter(valid_categories)

# Keep the 15 most frequent categories in a DataFrame for plotting.
top_categories = category_counts.most_common(15)
df_categories = pd.DataFrame(top_categories, columns=["Category", "Count"])

# Horizontal bar chart: one bar per category, length = occurrences.
fig_categories, ax_categories = plt.subplots(figsize=(12, 7))
sns.barplot(data=df_categories, x="Count", y="Category", ax=ax_categories)
ax_categories.set_title("Top 15 Material Categories")
ax_categories.set_xlabel("Number of Occurrences")
ax_categories.set_ylabel("Category")
plt.tight_layout()
plt.show()

In [None]:
# Build a flat list of individual category labels from all rows and show
# their frequencies as a treemap.
def clean_single_category_entry(categories_list):
    """Split one raw ``material_category`` entry into individual labels.

    A string is first parsed with ``ast.literal_eval`` (so a stringified
    Python list is recovered); a string that cannot be parsed is treated as
    a single-item list. Each string item is then split on "," and ";", and
    the stripped, non-empty pieces are returned in their original order.

    Args:
        categories_list: A list of labels or a string.

    Returns:
        A list of label strings; empty when the (parsed) input is not a list.
    """
    if isinstance(categories_list, str):
        try:
            categories_list = ast.literal_eval(categories_list)
        except (ValueError, SyntaxError):
            categories_list = [categories_list]

    cleaned = []
    if isinstance(categories_list, list):
        for category in categories_list:
            if isinstance(category, str):
                # One pass over both separators; same pieces, same order as
                # splitting on "," and then on ";".
                for piece in re.split(r"[,;]", category):
                    cleaned_category = piece.strip()
                    if cleaned_category:
                        cleaned.append(cleaned_category)
    return cleaned


# Placeholder labels to ignore (compared case-insensitively).
excluded_labels = {"n/a", "other", "others", "na"}

# The raw 'material_category' column is always the input: no list-valued
# "cleaned_categories" column is created anywhere in this notebook, so the
# previous per-row lookup of that column never found anything.
all_categories = []
for raw_entry in data["material_category"]:
    all_categories.extend(
        cat
        for cat in clean_single_category_entry(raw_entry)
        if cat.lower() not in excluded_labels
    )

if all_categories:
    # Count the frequency of each category
    category_counts = Counter(all_categories)
    category_df = pd.DataFrame(
        category_counts.items(), columns=["Category", "Count"]
    )

    # Create a treemap
    fig = px.treemap(
        category_df,
        path=["Category"],
        values="Count",
        title="Interactive Treemap of Material Categories",
    )
    fig.show()
else:
    print("No category data available to create a treemap after filtering.")

### **Top synthesis methods**


In [None]:
# Normalise each synthesis-method label with clean_category; rejected
# entries are None.
data["cleaned_synthesis_method"] = data["synthesis_method"].apply(
    clean_category
)

# Tally the surviving labels (missing entries are removed by dropna()).
valid_methods = data["cleaned_synthesis_method"].dropna()
method_counts = Counter(valid_methods)

# Keep the 15 most frequent methods in a DataFrame for plotting.
top_methods = method_counts.most_common(15)
df_methods = pd.DataFrame(top_methods, columns=["Method", "Count"])

# Horizontal bar chart: one bar per method, length = occurrences.
fig_methods, ax_methods = plt.subplots(figsize=(12, 7))
sns.barplot(data=df_methods, x="Count", y="Method", ax=ax_methods)
ax_methods.set_title("Top 15 Synthesis Methods")
ax_methods.set_xlabel("Number of Occurrences")
ax_methods.set_ylabel("Method")
plt.tight_layout()
plt.show()

## **Advanced data exploration**


Perform a deeper dive into the dataset by exploring relationships between columns.


### **Correlation between synthesized materials and material categories**


In [None]:
# Easy cleaning
def clean(text):
    """Normalise a free-text label for grouping.

    Every character that is not an ASCII letter, digit or whitespace is
    removed, surrounding whitespace is stripped, and the result is passed
    through ``str.capitalize()``.

    Args:
        text: Raw label; non-string values are rejected.

    Returns:
        The normalised label, or ``None`` when the input is not a string,
        when nothing is left after cleaning, or when the label is one of the
        uninformative terms "material", "science", "other", "others", "na".
    """
    if not isinstance(text, str):
        return None

    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", text).strip().capitalize()

    # Punctuation-only input collapses to ""; it must not become a group key.
    if not cleaned:
        return None

    # "N/A" has already lost its slash here, so "na" covers it.
    uninformative = {"material", "science", "others", "other", "na"}
    if cleaned.lower() in uninformative:
        return None
    return cleaned


# Normalise both columns with clean(); rejected values become None.
for target_column, raw_column in (
    ("clean_material", "material_category"),
    ("clean_synth", "synthesized_material"),
):
    data[target_column] = data[raw_column].apply(clean)

# Keep only the rows where both cleaned values are present.
pair_columns = ["clean_synth", "clean_material"]
df = data.dropna(subset=pair_columns)

# Number of rows for every (material, category) pair.
grouped = df.groupby(pair_columns).size().reset_index(name="count")

# Restrict the chart to the 15 materials with the most rows overall.
synth_totals = grouped.groupby("clean_synth")["count"].sum()
top_synths = synth_totals.sort_values(ascending=False).head(15).index
filtered = grouped[grouped["clean_synth"].isin(top_synths)]

# Bar chart coloured by category; dodge=False draws a material's bars at
# the same x position instead of side by side.
fig_pairs, ax_pairs = plt.subplots(figsize=(16, 8))
sns.barplot(
    data=filtered,
    x="clean_synth",
    y="count",
    hue="clean_material",
    palette="tab10",
    dodge=False,
    ax=ax_pairs,
)

ax_pairs.set_title("Top 15 Synthesized Materials by Material Category")
ax_pairs.set_xlabel("Synthesized Material")
ax_pairs.set_ylabel("Number of Occurrences")
plt.xticks(rotation=90, ha="center")

# Legend placed outside the axes, to the right of the plot.
ax_pairs.legend(
    title="Material Category", bbox_to_anchor=(1.05, 1), loc="upper left"
)
plt.tight_layout()
plt.show()

### **Correlation between synthesized materials and synthesis methods**


In [None]:
# Normalise the synthesis-method labels; rejected values become None.
# NOTE: data["clean_synth"] must already exist (created by an earlier cell).
data["clean_method"] = data["synthesis_method"].apply(clean)

# Contingency table: rows = materials, columns = methods, cells = row counts.
pivot_table = pd.crosstab(data["clean_synth"], data["clean_method"])

# Keep the 15 materials with the largest row totals.
row_totals = pivot_table.sum(axis=1)
top_synths = row_totals.sort_values(ascending=False).head(15).index
filtered_pivot_table = pivot_table.loc[top_synths]

# Heatmap of the reduced table (cell values are not annotated).
fig_heat, ax_heat = plt.subplots(figsize=(16, 10))
sns.heatmap(filtered_pivot_table, linewidths=0.5, annot=False, ax=ax_heat)
ax_heat.set_title("Heatmap: Top 15 Synthesized Materials vs Synthesis Method")
ax_heat.set_xlabel("Synthesis Method")
ax_heat.set_ylabel("Synthesized Material")
plt.tight_layout()
plt.show()

In [None]:
# Keep only the rows where both cleaned values are present.
pair_columns = ["clean_synth", "clean_method"]
df = data.dropna(subset=pair_columns)

# Number of rows for every (material, method) pair.
grouped = df.groupby(pair_columns).size().reset_index(name="count")

# Restrict the chart to the 15 materials with the most rows overall.
synth_totals = grouped.groupby("clean_synth")["count"].sum()
top_synths = synth_totals.sort_values(ascending=False).head(15).index
filtered = grouped[grouped["clean_synth"].isin(top_synths)]

# Wide format for stacking: one row per material, one column per method;
# pairs that never occur get 0. The rows are then put in top-15 order.
pivot_df = filtered.pivot_table(
    index="clean_synth",
    columns="clean_method",
    values="count",
    fill_value=0,
).reindex(top_synths)

# Stacked bar chart: one bar per material, segments = methods.
ax = pivot_df.plot.bar(
    stacked=True, figsize=(16, 8), width=0.8, colormap="tab20"
)

ax.set_title("Top 15 Synthesized Materials by Synthesis Methods", fontsize=16)
ax.set_xlabel("Synthesized Material", fontsize=12)
ax.set_ylabel("Number of Occurrences", fontsize=12)
plt.xticks(rotation=45, ha="right")
# Legend placed outside the axes, to the right of the plot.
ax.legend(
    title="Synthesis Methods", bbox_to_anchor=(1.02, 1), loc="upper left"
)
plt.tight_layout()
plt.show()

In [None]:
# --- INVERTED STACKED CHART ---
# NOTE: reuses `filtered` (pair counts for the top-15 materials) from the
# previous cell.

# Swap the axes of the pivot: methods become the rows (x-axis) and
# materials the columns (legend); missing pairs are filled with 0.
pivot_df_swapped = filtered.pivot_table(
    index="clean_method", columns="clean_synth", values="count", fill_value=0
)

# Order the methods by their total number of occurrences, largest first.
method_totals = pivot_df_swapped.sum(axis=1)
method_order = method_totals.sort_values(ascending=False).index
pivot_df_swapped = pivot_df_swapped.reindex(method_order)

# Stacked bar chart: one bar per method, segments = materials.
ax = pivot_df_swapped.plot.bar(
    stacked=True, figsize=(16, 8), width=0.8, colormap="tab20"
)

ax.set_title("Composition of Top 15 Materials by Synthesis Method", fontsize=16)
ax.set_xlabel("Synthesis Method", fontsize=12)
ax.set_ylabel("Number of Occurrences", fontsize=12)
plt.xticks(rotation=90, ha="center")
# Legend placed outside the axes, to the right of the plot.
ax.legend(
    title="Synthesized Material", bbox_to_anchor=(1.02, 1), loc="upper left"
)
plt.tight_layout()
plt.show()

### **Synthesis Methods by source**


In [None]:
# Clean synthesis methods
def clean_method(method):
    """Normalise a synthesis-method label for grouping.

    Every character that is not an ASCII letter, digit or whitespace is
    removed, surrounding whitespace is stripped, and the result is passed
    through ``str.capitalize()``.

    Args:
        method: Raw value from the ``synthesis_method`` column.

    Returns:
        The normalised label, or ``None`` when the input is not a string,
        when nothing is left after cleaning, or when the label is a
        placeholder ("na", "other", "others").
    """
    if not isinstance(method, str):
        return None

    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", method).strip().capitalize()

    # Punctuation-only input collapses to ""; it must not become a group key.
    if not cleaned:
        return None

    # "N/A" has already lost its slash here, so "na" covers it.
    if cleaned.lower() in {"na", "other", "others"}:
        return None
    return cleaned


# Normalise the method labels; entries rejected by clean_method are None.
data["clean_method"] = data["synthesis_method"].apply(clean_method)

# Strip sub-domain prefixes and top-level-domain suffixes from the source
# names (values are converted to str first).
source_as_text = data["source"].astype(str)
data["clean_source"] = source_as_text.str.replace(
    r"^www\.|^onlinelibrary\.|\.(org|com|net|edu|info)$", "", regex=True
)

# Keep only the rows where both values are present.
pair_columns = ["clean_method", "clean_source"]
df = data.dropna(subset=pair_columns)

# Number of rows for every (method, source) pair.
grouped = df.groupby(pair_columns).size().reset_index(name="count")

# --- FILTERING AND PIVOTING ---

# The 10 methods with the most rows overall, most frequent first.
method_totals = grouped.groupby("clean_method")["count"].sum()
top_methods_order = method_totals.sort_values(ascending=False).head(10).index

# Pair counts restricted to those methods.
filtered = grouped[grouped["clean_method"].isin(top_methods_order)]

# Wide format (methods as rows, sources as columns, missing pairs = 0),
# with the rows put in top-10 order.
pivot_df = filtered.pivot_table(
    index="clean_method", columns="clean_source", values="count", fill_value=0
).reindex(top_methods_order)

# --- PLOTTING STACKED BAR CHART ---

# One bar per method, segments = sources.
ax = pivot_df.plot.bar(
    stacked=True, figsize=(16, 8), width=0.8, colormap="Set2"
)

ax.set_title("Top 10 Synthesis Methods by Source", fontsize=16)
ax.set_xlabel("Synthesis Method", fontsize=12)
ax.set_ylabel("Number of Publications", fontsize=12)
plt.xticks(rotation=90, ha="center")
# Legend placed outside the axes, to the right of the plot.
ax.legend(title="Source", bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
plt.show()

In [None]:
# Recompute the cleaned columns so that this cell can run on its own.
data["clean_method"] = data["synthesis_method"].apply(clean_method)
data["clean_source"] = (
    data["source"]
    .astype(str)
    .str.replace(
        r"^www\.|^onlinelibrary\.|\.(org|com|net|edu|info)$", "", regex=True
    )
)

# Number of rows for every (method, source) pair.
df = data.dropna(subset=["clean_method", "clean_source"])
grouped = (
    df.groupby(["clean_method", "clean_source"])
    .size()
    .reset_index(name="count")
)

# 1. We still identify the top 10 overall methods to keep the chart focused
top_methods_order = (
    grouped.groupby("clean_method")["count"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .index
)

# 2. Filter data to include only publications using one of these top 10 methods
filtered = grouped[grouped["clean_method"].isin(top_methods_order)]

# 3. Invert the pivot table
# Now, the sources are the rows (index) and the methods are the columns.
pivot_df_swapped = filtered.pivot_table(
    index="clean_source", columns="clean_method", values="count", fill_value=0
)

# 4. Optional but recommended: Sort the sources by total number of publications
source_order = pivot_df_swapped.sum(axis=1).sort_values(ascending=False).index
pivot_df_swapped = pivot_df_swapped.reindex(source_order)


# 5. Plot the inverted data as a stacked bar chart.
# The options are passed as keyword arguments to plot(); previously they
# were stray assignments after a bare plot() call, which drew a default
# line chart instead of the intended stacked bars.
ax = pivot_df_swapped.plot(
    kind="bar", stacked=True, figsize=(16, 8), width=0.8, colormap="tab20"
)

# Styling the plot with updated labels
plt.title("Composition of Top 10 Synthesis Methods by Source", fontsize=16)
plt.xlabel("Source", fontsize=12)
plt.ylabel("Number of Publications", fontsize=12)
plt.xticks(rotation=45, ha="right")
plt.legend(title="Synthesis Method", bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
plt.show()

### **Synthesized Materials by source**


In [None]:
# Clean synthesized-material labels
def clean_synth(synth):
    """Normalise a synthesized-material label for grouping.

    Every character that is not an ASCII letter, digit or whitespace is
    removed, surrounding whitespace is stripped, and the result is passed
    through ``str.capitalize()``.

    Args:
        synth: Raw value from the ``synthesized_material`` column.

    Returns:
        The normalised label, or ``None`` when the input is not a string,
        when nothing is left after cleaning, or when the label is a
        placeholder ("na", "other", "others").
    """
    if not isinstance(synth, str):
        return None

    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", synth).strip().capitalize()

    # Punctuation-only input collapses to ""; it must not become a group key.
    if not cleaned:
        return None

    # "N/A" has already lost its slash here, so "na" covers it.
    if cleaned.lower() in {"na", "other", "others"}:
        return None
    return cleaned


# Normalise the material labels; entries rejected by clean_synth are None.
data["clean_synth"] = data["synthesized_material"].apply(clean_synth)

# Strip sub-domain prefixes and top-level-domain suffixes from the source
# names (values are converted to str first).
source_as_text = data["source"].astype(str)
data["clean_source"] = source_as_text.str.replace(
    r"^www\.|^onlinelibrary\.|\.(org|com|net|edu|info)$", "", regex=True
)

# Keep only the rows where both values are present.
pair_columns = ["clean_synth", "clean_source"]
df = data.dropna(subset=pair_columns)

# Number of rows for every (material, source) pair.
grouped = df.groupby(pair_columns).size().reset_index(name="count")

# --- FILTERING AND PIVOTING ---

# The 10 materials with the most rows overall, most frequent first.
synth_totals = grouped.groupby("clean_synth")["count"].sum()
top_synth_order = synth_totals.sort_values(ascending=False).head(10).index

# Pair counts restricted to those materials.
filtered = grouped[grouped["clean_synth"].isin(top_synth_order)]

# Wide format (materials as rows, sources as columns, missing pairs = 0),
# with the rows put in top-10 order.
pivot_df = filtered.pivot_table(
    index="clean_synth", columns="clean_source", values="count", fill_value=0
).reindex(top_synth_order)

# Stacked bar chart: one bar per material, segments = sources.
ax = pivot_df.plot.bar(
    stacked=True, figsize=(16, 8), width=0.8, colormap="Set2"
)

ax.set_title("Top 10 Synthesized Materials by Source", fontsize=16)
ax.set_xlabel("Synthesized Materials", fontsize=12)
ax.set_ylabel("Number of Publications", fontsize=12)
plt.xticks(rotation=90, ha="center")
# Legend placed outside the axes, to the right of the plot.
ax.legend(title="Source", bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
plt.show()

In [None]:
# Recompute the cleaned columns so that this cell can run on its own.
data["clean_synth"] = data["synthesized_material"].apply(clean_synth)
data["clean_source"] = (
    data["source"]
    .astype(str)
    .str.replace(
        r"^www\.|^onlinelibrary\.|\.(org|com|net|edu|info)$", "", regex=True
    )
)

# Number of rows for every (material, source) pair.
df = data.dropna(subset=["clean_synth", "clean_source"])
grouped = (
    df.groupby(["clean_synth", "clean_source"]).size().reset_index(name="count")
)

# 1. Identify the top 10 synthesized materials overall to keep the chart focused
top_synth_order = (
    grouped.groupby("clean_synth")["count"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .index
)

# 2. Filter data to include only publications on one of these top 10 materials
filtered = grouped[grouped["clean_synth"].isin(top_synth_order)]

# 3. Invert the pivot table
# Now, the sources are the rows (index) and the synthesized materials are the columns.
pivot_df_swapped = filtered.pivot_table(
    index="clean_source", columns="clean_synth", values="count", fill_value=0
)

# 4. Optional but recommended: Sort the sources by total number of publications
source_order = pivot_df_swapped.sum(axis=1).sort_values(ascending=False).index
pivot_df_swapped = pivot_df_swapped.reindex(source_order)


# 5. Plot the inverted data as a stacked bar chart.
# The options are passed as keyword arguments to plot(); previously they
# were stray assignments after a bare plot() call, which drew a default
# line chart instead of the intended stacked bars.
ax = pivot_df_swapped.plot(
    kind="bar", stacked=True, figsize=(16, 8), width=0.8, colormap="tab20"
)

# Styling the plot with labels
plt.title("Composition of Top 10 Synthesized Materials by Source", fontsize=16)
plt.xlabel("Source", fontsize=12)
plt.ylabel("Number of Publications", fontsize=12)
plt.xticks(rotation=45, ha="right")
plt.legend(
    title="Synthesized Materials", bbox_to_anchor=(1.02, 1), loc="upper left"
)
plt.tight_layout()
plt.show()