# Exploring LeMat-Bulk (compatible_pbe)

This notebook shows how to:
- Stream the `LeMaterial/LeMat-Bulk` dataset (config: `compatible_pbe`)
- Peek at the schema/features
- Materialize a small sample to a DataFrame for quick iteration
- Optionally save a local Parquet snapshot for faster repeated exploration

Notes:
- The full dataset is large (~5.34M rows). Use streaming and small samples.
- Adjust `SAMPLE_SIZE` as needed.


In [None]:
from itertools import islice

import pandas as pd
from datasets import load_dataset

# Do not truncate columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

DATASET = "LeMaterial/LeMat-Bulk"
CONFIG = "compatible_pbe"
# Number of rows pulled from the stream into memory. Keep it small: the full
# dataset has ~5.34M rows.
SAMPLE_SIZE = 100

# streaming=True returns an iterable dataset, so rows are fetched lazily as we
# iterate instead of materializing the whole split first.
ds_stream = load_dataset(DATASET, name=CONFIG, split="train", streaming=True)

# Materialize only the first SAMPLE_SIZE rows for quick local iteration.
rows = list(islice(ds_stream, SAMPLE_SIZE))
df = pd.DataFrame(rows)
df.head()

In [None]:
# Show the column names of the sampled DataFrame.
df.columns

In [None]:
from typing import Any, Dict

import numpy as np
from pymatgen.core import Lattice, Structure


def row_to_structure(row) -> "Structure":
    """Build a pymatgen ``Structure`` from one dataset row.

    Args:
        row: Mapping-like record (e.g. a ``dict`` or a pandas ``Series``)
            supporting ``row[key]`` and ``row.get(key)``. Keys read:
            ``lattice_vectors``; ``cartesian_site_positions`` (preferred) or
            ``fractional_site_positions``; ``species_at_sites`` (preferred)
            or ``species``.

    Returns:
        A ``Structure`` with one site per entry in the chosen positions.

    Raises:
        KeyError: If cartesian positions are missing/empty and the row has no
            ``fractional_site_positions`` key, or ``lattice_vectors`` is absent.
        ValueError: If no species information is present, or the number of
            species does not match the number of site positions.
    """

    def _non_empty(value):
        # len() instead of truthiness: a multi-element numpy array has no
        # unambiguous truth value, and None/NaN (missing data) have no len().
        try:
            return len(value) > 0
        except TypeError:
            return False

    # Positions: prefer cartesian, fall back to fractional.
    cartesian = row.get("cartesian_site_positions")
    if _non_empty(cartesian):
        coords, cart = cartesian, True
    else:
        coords, cart = row["fractional_site_positions"], False

    # Species: prefer species_at_sites; fall back when the key is absent or
    # present but holding None.
    species_raw = row.get("species_at_sites")
    if species_raw is None:
        species_raw = row.get("species")
    if species_raw is None:
        raise ValueError("row has neither 'species_at_sites' nor 'species'")

    # Entries may be plain symbols or dicts carrying the symbol under "element".
    species = [
        s["element"] if isinstance(s, dict) and "element" in s else s
        for s in species_raw
    ]

    # Explicit raise rather than assert: asserts are stripped under `python -O`.
    if len(species) != len(coords):
        raise ValueError(
            "species/coords length mismatch: "
            f"{len(species)} species vs {len(coords)} coords"
        )

    # Build the lattice only after the cheap consistency checks have passed.
    lattice = Lattice(row["lattice_vectors"])
    return Structure(
        lattice=lattice,
        species=species,
        coords=np.asarray(coords),
        coords_are_cartesian=cart,
    )

In [None]:
# Convert the first sampled row into a pymatgen Structure.
structure = row_to_structure(df.iloc[0])

In [None]:
# Display the structure (the notebook renders the cell's last expression).
structure

In [None]:
from material_hasher.hasher.bawl import BAWLHasher
from pymatgen.analysis.local_env import EconNN

# Configure the BAWL hasher: bonds come from EconNN (with the kwargs below),
# the hash includes the composition, and symmetry labels come from SPGLib.
# NOTE(review): presumably these mirror the settings used to produce the
# dataset's `entalpic_fingerprint` column — confirm against the dataset card.
fingerprinter = BAWLHasher(
    graphing_algorithm="WL",
    bonding_algorithm=EconNN,
    bonding_kwargs={
        "tol": 0.2,
        "cutoff": 10,
        "use_fictive_radius": True,
    },
    include_composition=True,
    symmetry_labeling="SPGLib",
    shorten_hash=False,  # keep the full, unshortened hash
)

# Hash the structure built from the first sampled row.
fp = fingerprinter.get_material_hash(structure)

In [None]:
# Check the recomputed hash against the fingerprint stored in the dataset.
# Positional .iloc[0] matches the row used to build `structure` (df.iloc[0])
# even if df's index is no longer the default 0..N-1.
fp == df["entalpic_fingerprint"].iloc[0]