# JARVIS Baseline Model w/Featurizer

see all datasets here: https://pages.nist.gov/jarvis/databases/

In [1]:
# Imports
import io
import sys
import toml
import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from jarvis.db.figshare import data


import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

import importlib

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier


## Load TOML Configuration

In [2]:
# Configuration
CONFIG_PATH = "config.toml"
config = toml.load(CONFIG_PATH)

# Pretty print configuration
print("Project Configuration:")
pprint.pprint(config)

# Set up system path
SYS_PATH = config.get('system', {}).get('path', './')
sys.path.append(SYS_PATH) # .../code/jarvis/


Project Configuration:
{'data': {'dataset_name': 'dft_3d', 'store_dir': '/shared/data/jarvis'},
 'features': {'bag_of_elements': True,
              'derived': ['eps_mean', 'eps_std'],
              'use_columns': ['ehull',
                              'formation_energy_peratom',
                              'avg_elec_mass',
                              'avg_hole_mass',
                              'effective_masses_297K',
                              'epsx',
                              'epsy',
                              'epsz',
                              'natoms']},
 'filters': {'bandgap_column': 'optb88vdw_bandgap',
             'max_eps': 10.0,
             'min_eps': 1.0,
             'semiconductor_max': 4.0,
             'semiconductor_min': 0.5,
             'toxic_elements': ['Pb', 'Cd', 'Hg', 'As', 'Se'],
             'transparent_min': 3.0},
 'known': {'transparent_formulas': ['In2O3',
                                    'ZnO',
                                   

## Load Custom Libraries and Data, Start Logger

In [3]:
# Custom Imports and Configurations
from jarvis_utils import load_or_fetch_dataset
from logger_utils import setup_logger, flush_logger
from filter_utils import apply_filters
from featurizer import Featurizer

# Setup logger
logger = setup_logger(config)

logger.info("Project configuration loaded.")
logger.info(f"Dataset: {config['data']['dataset_name']}")
logger.info(f"Store directory: {config['data']['store_dir']}")

# Load dataset
df = load_or_fetch_dataset(config["data"]["dataset_name"], data, config["data"]["store_dir"])
logger.info(f"Dataset shape: {df.shape}")




2025-12-01 11:56:03,353 - jarvis_project - INFO - Project configuration loaded.
2025-12-01 11:56:03,355 - jarvis_project - INFO - Dataset: dft_3d
2025-12-01 11:56:03,355 - jarvis_project - INFO - Store directory: /shared/data/jarvis


Loading existing pickle file: /shared/data/jarvis/jarvis_dft_3d.pkl


2025-12-01 11:56:04,589 - jarvis_project - INFO - Dataset shape: (75993, 64)


Dataset shape: (75993, 64)


In [4]:
features = df.columns.tolist()
logger.info(f"Features: {features}")

2025-12-01 11:56:04,599 - jarvis_project - INFO - Features: ['jid', 'spg_number', 'spg_symbol', 'formula', 'formation_energy_peratom', 'func', 'optb88vdw_bandgap', 'atoms', 'slme', 'magmom_oszicar', 'spillage', 'elastic_tensor', 'effective_masses_300K', 'kpoint_length_unit', 'maxdiff_mesh', 'maxdiff_bz', 'encut', 'optb88vdw_total_energy', 'epsx', 'epsy', 'epsz', 'mepsx', 'mepsy', 'mepsz', 'modes', 'magmom_outcar', 'max_efg', 'avg_elec_mass', 'avg_hole_mass', 'icsd', 'dfpt_piezo_max_eij', 'dfpt_piezo_max_dij', 'dfpt_piezo_max_dielectric', 'dfpt_piezo_max_dielectric_electronic', 'dfpt_piezo_max_dielectric_ionic', 'max_ir_mode', 'min_ir_mode', 'n-Seebeck', 'p-Seebeck', 'n-powerfact', 'p-powerfact', 'ncond', 'pcond', 'nkappa', 'pkappa', 'ehull', 'Tc_supercon', 'dimensionality', 'efg', 'xml_data_link', 'typ', 'exfoliation_energy', 'spg', 'crys', 'density', 'poisson', 'raw_files', 'nat', 'bulk_modulus_kv', 'shear_modulus_gv', 'mbj_bandgap', 'hse_gap', 'reference', 'search']


## Explore numberic and categorical features, add descriptions for each.

In [5]:

# --- Feature audit utility ---
def audit_features(df: pd.DataFrame, descriptions: dict = None, n_examples: int = 1) -> pd.DataFrame:
    """
    Inspect dataframe columns, separate numeric vs categorical, and show example values + descriptions.
    
    Parameters
    ----------
    df : pd.DataFrame
        Original dataset.
    descriptions : dict
        Optional mapping {column_name: description}.
    n_examples : int
        Number of example values to show per column.
    
    Returns
    -------
    pd.DataFrame
        Summary table of features.
    """
    records = []
    for col in df.columns:
        dtype = df[col].dtype
        # get example values (first non-null)
        examples = df[col].dropna().head(n_examples).tolist()
        example_str = ", ".join(map(str, examples)) if examples else "NaN"
        desc = descriptions.get(col, "No description available") if descriptions else "No description available"
        records.append({
            "Feature": col,
            "Type": str(dtype),
            "Example": example_str,
            "Description": desc
        })
    return pd.DataFrame(records)

# Load descriptions
config = toml.load("features.toml")
feature_descriptions = config["features"]

audit_df = audit_features(df, descriptions=feature_descriptions, n_examples=2)

# Apply to audit table
audit_df["Description"] = audit_df["Feature"].apply(
    lambda f: feature_descriptions.get(f, "No description available")
)

pd.set_option("display.max_rows", None)   # show all rows
pd.set_option("display.max_columns", None)  # show all columns
pd.set_option("display.width", None)     # don't wrap columns
pd.set_option("display.max_colwidth", None)  # show full cell contents

display(audit_df)


Unnamed: 0,Feature,Type,Example,Description
0,jid,object,"JVASP-90856, JVASP-86097",Unique JARVIS material identifier
1,spg_number,object,"129, 221","Space group number (crystallographic symmetry, integer)"
2,spg_symbol,object,"P4/nmm, Pm-3m","Space group symbol (e.g., P4/nmm, Cmmm)"
3,formula,object,"TiCuSiAs, DyB6",Chemical formula of the material
4,formation_energy_peratom,float64,"-0.42762, -0.41596",Formation energy per atom (eV/atom)
5,func,object,"OptB88vdW, OptB88vdW","Exchange-correlation functional used in calculation (e.g., OptB88vdW)"
6,optb88vdw_bandgap,float64,"0.0, 0.0",Bandgap computed with optB88vdW functional (eV)
7,atoms,object,"{'lattice_mat': [[3.566933224304235, 0.0, -0.0], [0.0, 3.566933224304235, -0.0], [-0.0, -0.0, 9.397075454186664]], 'coords': [[2.6751975000000003, 2.6751975000000003, 7.376101754328542], [0.8917325, 0.8917325, 2.0209782456714573], [0.8917325, 2.6751975000000003, 4.69854], [2.6751975000000003, 0.8917325, 4.69854], [0.8917325, 2.6751975000000003, 0.0], [2.6751975000000003, 0.8917325, 0.0], [2.6751975000000003, 2.6751975000000003, 2.8894795605846353], [0.8917325, 0.8917325, 6.507600439415366]], 'elements': ['Ti', 'Ti', 'Cu', 'Cu', 'Si', 'Si', 'As', 'As'], 'abc': [3.56693, 3.56693, 9.39708], 'angles': [90.0, 90.0, 90.0], 'cartesian': True, 'props': ['', '', '', '', '', '', '', '']}, {'lattice_mat': [[4.089078911208881, 0.0, 0.0], [-0.0, 4.089078911208881, -0.0], [0.0, -0.0, 4.089078911208881]], 'coords': [[0.0, 0.0, 0.0], [0.8121488741123271, 2.04454, 2.04454], [2.04454, 3.276931125887674, 2.04454], [2.04454, 0.8121488741123271, 2.04454], [3.276931125887674, 2.04454, 2.04454], [2.04454, 2.04454, 0.8121488741123271], [2.04454, 2.04454, 3.276931125887674]], 'elements': ['Dy', 'B', 'B', 'B', 'B', 'B', 'B'], 'abc': [4.08908, 4.08908, 4.08908], 'angles': [90.0, 90.0, 90.0], 'cartesian': True, 'props': ['', '', '', '', '', '', '']}","Full atomic structure: lattice, coordinates, elements, angles"
8,slme,object,"na, na",Spectroscopic limited maximum efficiency (solar cell metric)
9,magmom_oszicar,object,"0.0, 0.0",Magnetic moment from OSZICAR file (µB)


### Featurization Plan


| Feature                     | Plan              | Notes                                                                 |
|-----------------------------|-------------------|-----------------------------------------------------------------------|
| jid                         | Drop              | Identifier only                                                       |
| spg_number                  | Numeric           | Convert to int; crystallographic symmetry                             |
| spg_symbol                  | Categorical       | One‑hot/frequency encode                                              |
| formula                     | Flatten           | Bag‑of‑elements indicators, composition vectors                       |
| formation_energy_peratom    | Numeric           | Keep as float                                                         |
| func                        | Categorical       | Encode functional used                                                |
| optb88vdw_bandgap           | Numeric           | Keep as float                                                         |
| atoms                       | Network structure | Use atomic graph featurizer (lattice, coords, elements)               |
| slme                        | Numeric           | Convert to float if available                                         |
| magmom_oszicar              | Numeric           | Magnetic moment scalar                                                |
| spillage                    | Numeric           | Topological indicator                                                 |
| elastic_tensor              | Flatten           | Extract Voigt averages, shear/bulk moduli                             |
| effective_masses_300K       | Flatten           | Extract electron/hole masses                                          |
| kpoint_length_unit          | Numeric           | Convert to int                                                        |
| maxdiff_mesh                | Drop              | Convergence diagnostic                                                |
| maxdiff_bz                  | Drop              | Convergence diagnostic                                                |
| encut                       | Numeric           | Plane‑wave cutoff energy                                              |
| optb88vdw_total_energy      | Numeric           | Keep as float                                                         |
| epsx, epsy, epsz            | Numeric + Combine | Compute mean/std; keep individual                                     |
| mepsx, mepsy, mepsz         | Numeric           | If available                                                          |
| modes                       | Flatten           | Extract mean/std/min/max of phonon modes                              |
| magmom_outcar               | Numeric           | Magnetic moment scalar                                                |
| max_efg                     | Numeric           | Electric field gradient                                               |
| avg_elec_mass, avg_hole_mass| Numeric           | Effective masses                                                      |
| icsd                        | Drop              | Identifier                                                            |
| dfpt_piezo_max_eij, dij     | Numeric           | Piezoelectric coefficients                                            |
| dfpt_piezo_max_dielectric   | Numeric           | Dielectric constant                                                   |
| dfpt_piezo_max_dielectric_electronic, ionic | Numeric | Contributions                                                         |
| max_ir_mode, min_ir_mode    | Numeric           | Phonon frequencies                                                    |
| n-Seebeck, p-Seebeck        | Numeric           | Seebeck coefficients                                                  |
| n-powerfact, p-powerfact    | Numeric           | Power factors                                                         |
| ncond, pcond                | Numeric           | Conductivities                                                        |
| nkappa, pkappa              | Numeric           | Thermal conductivities                                                |
| ehull                       | Numeric           | Stability metric                                                      |
| Tc_supercon                 | Numeric           | Superconducting Tc                                                    |
| dimensionality              | Categorical       | Encode (bulk, layered, etc.)                                          |
| efg                         | Flatten           | Tensor → scalar summaries                                             |
| xml_data_link               | Drop              | Identifier                                                            |
| typ                         | Categorical       | Encode (bulk, monolayer, etc.)                                        |
| exfoliation_energy          | Numeric           | Convert to float                                                      |
| spg                         | Drop              | Duplicate of spg_number                                               |
| crys                        | Categorical       | Encode crystal system                                                 |
| density                     | Numeric           | Keep as float                                                         |
| poisson                     | Numeric           | Convert to float                                                      |
| raw_files                   | Drop              | Structural artifact                                                   |
| nat                         | Numeric           | Number of atoms                                                       |
| bulk_modulus_kv, shear_modulus_gv | Numeric     | Mechanical properties                                                 |
| mbj_bandgap, hse_gap        | Numeric           | Bandgaps from other functionals                                       |
| reference                   | Drop              | External ID                                                           |
| search                      | Drop              | Search string artifact                                                |


#### Summary
 - Drop: jid, icsd, xml_data_link, reference, search, raw_files, convergence diagnostics (maxdiff_mesh, maxdiff_bz), duplicate spg.
 - Numeric (keep/convert): formation_energy_peratom, bandgaps, energies, dielectric constants, mechanical moduli, density, ehull, Tc_supercon, Seebeck, conductivities, etc.
- Categorical (encode): spg_symbol, func, dimensionality, typ, crys.
- Flatten/aggregate: atoms (-> graph), elastic_tensor, effective_masses, phonon modes, efg.
- Network structure: atoms (lattice, coords, elements) -> graph neural network input.
- Combine: dielectric constants (epsx/epsy/epsz -> mean/std), phonon modes (mean/std/min/max).

## Baseline Model using Featurizer

In [6]:
# Load TOML plan and apply
featurizer = Featurizer("features_plan.toml")


## Create Candidate Feature

In [7]:
def add_candidate_column(df: pd.DataFrame, config: dict) -> pd.DataFrame:
    df = df.copy()

    bandgap_col   = config["filters"]["bandgap_column"]
    sem_min       = config["filters"]["semiconductor_min"]
    sem_max       = config["filters"]["semiconductor_max"]
    trans_min     = config["filters"]["transparent_min"]
    toxic_elements = config["filters"]["toxic_elements"]

    df[bandgap_col] = pd.to_numeric(df[bandgap_col], errors="coerce")
    in_semiconductor_range = df[bandgap_col].between(sem_min, sem_max)
    is_transparent = df[bandgap_col] > trans_min

    if "ehull" in df.columns:
        df["ehull"] = pd.to_numeric(df["ehull"], errors="coerce")
        is_stable = df["ehull"] < 0.1
    else:
        is_stable = True

    if "formula" in df.columns:
        tokens = df["formula"].fillna("").astype(str).str.findall(r"[A-Z][a-z]?")
        has_toxic = tokens.apply(lambda t: any(el in t for el in toxic_elements))
    else:
        has_toxic = False

    df["is_candidate"] = (
        in_semiconductor_range &
        is_transparent &
        is_stable &
        (has_toxic == False)
    ).astype(int)

    return df


In [8]:
config = {
    "filters": {
        "bandgap_column": "optb88vdw_bandgap",
        "semiconductor_min": 0.5,
        "semiconductor_max": 5.0,
        "transparent_min": 2.5,
        "toxic_elements": ["Pb", "Cd", "As", "Hg"]
    }
}

df_feat = add_candidate_column(df, config)
df_feat = featurizer.apply_plan(df_feat)


  df.replace({"na": np.nan, "NA": np.nan, "NaN": np.nan, "": np.nan})


In [9]:

# print(df_feat[["optb88vdw_bandgap", "ehull", "is_candidate"]].head())
# df.head(1)
df_feat.is_candidate.value_counts()


is_candidate
0    71804
1     4189
Name: count, dtype: int64

In [None]:
X = df_feat.drop(columns=["is_candidate"], errors="ignore")
y = df_feat["is_candidate"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

numeric = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler())
])
categorical = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preproc = ColumnTransformer(
    transformers=[
        ("num", numeric, numeric_cols),
        ("cat", categorical, cat_cols)
    ],
    remainder="drop"
)

clf = RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=42)
pipe = Pipeline(steps=[("preproc", preproc), ("clf", clf)])

#pipe.fit(X_train, y_train)
#val_score = pipe.score(X_val, y_val)
#print(f"Validation accuracy: {val_score:.3f}")


Validation accuracy: 0.992


In [None]:
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

train_sizes, train_scores, val_scores = learning_curve(
    pipe, X_train, y_train,
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring="accuracy",
    n_jobs=-1
)

train_mean = train_scores.mean(axis=1)
val_mean = val_scores.mean(axis=1)

plt.plot(train_sizes, train_mean, label="Training accuracy")
plt.plot(train_sizes, val_mean, label="Validation accuracy")
plt.xlabel("Training set size")
plt.ylabel("Accuracy")
plt.title("Learning curve (RandomForest baseline)")
plt.legend()
plt.show()

print(f"Validation accuracy: {val_score:.3f}")


KeyboardInterrupt: 