# JARVIS Baseline Model

see all datasets here: https://pages.nist.gov/jarvis/databases/

In [1]:
# Imports
import io
import sys
import toml
import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from jarvis.db.figshare import data


import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

import importlib

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder

## Load TOML Configuration

In [2]:
# Configuration
CONFIG_PATH = "config.toml"
config = toml.load(CONFIG_PATH)

# Pretty print configuration
print("Project Configuration:")
pprint.pprint(config)

# Set up system path
SYS_PATH = config.get('system', {}).get('path', './')
sys.path.append(SYS_PATH) # .../code/jarvis/


Project Configuration:
{'data': {'dataset_name': 'dft_3d', 'store_dir': '/shared/data/jarvis'},
 'features': {'bag_of_elements': True,
              'derived': ['eps_mean', 'eps_std'],
              'use_columns': ['ehull',
                              'formation_energy_peratom',
                              'avg_elec_mass',
                              'avg_hole_mass',
                              'effective_masses_297K',
                              'epsx',
                              'epsy',
                              'epsz',
                              'natoms']},
 'filters': {'bandgap_column': 'optb88vdw_bandgap',
             'max_eps': 10.0,
             'min_eps': 1.0,
             'semiconductor_max': 4.0,
             'semiconductor_min': 0.5,
             'toxic_elements': ['Pb', 'Cd', 'Hg', 'As', 'Se'],
             'transparent_min': 3.0},
 'known': {'transparent_formulas': ['In2O3',
                                    'ZnO',
                                   

In [3]:
# Custom Imports and Configurations
from jarvis_utils import load_or_fetch_dataset
from logger_utils import setup_logger, flush_logger
from filter_utils import apply_filters

# Setup logger
logger = setup_logger(config)

logger.info("Project configuration loaded.")
logger.info(f"Dataset: {config['data']['dataset_name']}")
logger.info(f"Store directory: {config['data']['store_dir']}")

# Load dataset
df = load_or_fetch_dataset(config["data"]["dataset_name"], data, config["data"]["store_dir"])
logger.info(f"Dataset shape: {df.shape}")


2025-11-19 20:02:27,235 - jarvis_project - INFO - Project configuration loaded.
2025-11-19 20:02:27,236 - jarvis_project - INFO - Dataset: dft_3d
2025-11-19 20:02:27,236 - jarvis_project - INFO - Store directory: /shared/data/jarvis


Loading existing pickle file: /shared/data/jarvis/jarvis_dft_3d.pkl


2025-11-19 20:02:28,882 - jarvis_project - INFO - Dataset shape: (75993, 64)


Dataset shape: (75993, 64)


In [4]:
features = df.columns.tolist()
logger.info(f"Features: {features}")

2025-11-19 20:02:28,891 - jarvis_project - INFO - Features: ['jid', 'spg_number', 'spg_symbol', 'formula', 'formation_energy_peratom', 'func', 'optb88vdw_bandgap', 'atoms', 'slme', 'magmom_oszicar', 'spillage', 'elastic_tensor', 'effective_masses_300K', 'kpoint_length_unit', 'maxdiff_mesh', 'maxdiff_bz', 'encut', 'optb88vdw_total_energy', 'epsx', 'epsy', 'epsz', 'mepsx', 'mepsy', 'mepsz', 'modes', 'magmom_outcar', 'max_efg', 'avg_elec_mass', 'avg_hole_mass', 'icsd', 'dfpt_piezo_max_eij', 'dfpt_piezo_max_dij', 'dfpt_piezo_max_dielectric', 'dfpt_piezo_max_dielectric_electronic', 'dfpt_piezo_max_dielectric_ionic', 'max_ir_mode', 'min_ir_mode', 'n-Seebeck', 'p-Seebeck', 'n-powerfact', 'p-powerfact', 'ncond', 'pcond', 'nkappa', 'pkappa', 'ehull', 'Tc_supercon', 'dimensionality', 'efg', 'xml_data_link', 'typ', 'exfoliation_energy', 'spg', 'crys', 'density', 'poisson', 'raw_files', 'nat', 'bulk_modulus_kv', 'shear_modulus_gv', 'mbj_bandgap', 'hse_gap', 'reference', 'search']


## General Dataframe Exploration

see also: https://colab.research.google.com/github/knc6/jarvis-tools-notebooks/blob/master/jarvis-tools-notebooks/Analyzing_data_in_the_JARVIS_DFT_dataset.ipynb


In [5]:
# Keys in the json file
print (df.columns, len(df.columns))

Index(['jid', 'spg_number', 'spg_symbol', 'formula',
       'formation_energy_peratom', 'func', 'optb88vdw_bandgap', 'atoms',
       'slme', 'magmom_oszicar', 'spillage', 'elastic_tensor',
       'effective_masses_300K', 'kpoint_length_unit', 'maxdiff_mesh',
       'maxdiff_bz', 'encut', 'optb88vdw_total_energy', 'epsx', 'epsy', 'epsz',
       'mepsx', 'mepsy', 'mepsz', 'modes', 'magmom_outcar', 'max_efg',
       'avg_elec_mass', 'avg_hole_mass', 'icsd', 'dfpt_piezo_max_eij',
       'dfpt_piezo_max_dij', 'dfpt_piezo_max_dielectric',
       'dfpt_piezo_max_dielectric_electronic',
       'dfpt_piezo_max_dielectric_ionic', 'max_ir_mode', 'min_ir_mode',
       'n-Seebeck', 'p-Seebeck', 'n-powerfact', 'p-powerfact', 'ncond',
       'pcond', 'nkappa', 'pkappa', 'ehull', 'Tc_supercon', 'dimensionality',
       'efg', 'xml_data_link', 'typ', 'exfoliation_energy', 'spg', 'crys',
       'density', 'poisson', 'raw_files', 'nat', 'bulk_modulus_kv',
       'shear_modulus_gv', 'mbj_bandgap', 'hse_ga

In [6]:
df.head()

Unnamed: 0,jid,spg_number,spg_symbol,formula,formation_energy_peratom,func,optb88vdw_bandgap,atoms,slme,magmom_oszicar,...,density,poisson,raw_files,nat,bulk_modulus_kv,shear_modulus_gv,mbj_bandgap,hse_gap,reference,search
0,JVASP-90856,129,P4/nmm,TiCuSiAs,-0.42762,OptB88vdW,0.0,"{'lattice_mat': [[3.566933224304235, 0.0, -0.0...",na,0.0,...,5.956,na,[],8,na,na,na,na,mp-1080455,-As-Cu-Si-Ti
1,JVASP-86097,221,Pm-3m,DyB6,-0.41596,OptB88vdW,0.0,"{'lattice_mat': [[4.089078911208881, 0.0, 0.0]...",na,0.0,...,5.522,na,"[OPT-LOPTICS,JVASP-86097.zip,https://ndownload...",7,na,na,na,na,mp-568319,-B-Dy
2,JVASP-64906,119,I-4m2,Be2OsRu,0.04847,OptB88vdW,0.0,"{'lattice_mat': [[-1.833590720595598, 1.833590...",na,0.0,...,10.96,na,"[OPT-LOPTICS,JVASP-64906.zip,https://ndownload...",4,na,na,na,na,auid-3eaf68dd483bf4f4,-Be-Os-Ru
3,JVASP-98225,14,P2_1/c,KBi,-0.4414,OptB88vdW,0.472,"{'lattice_mat': [[7.2963518353359165, 0.0, 0.0...",na,0.0,...,5.145,na,[],32,na,na,na,na,mp-31104,-Bi-K
4,JVASP-10,164,P-3m1,VSe2,-0.71026,OptB88vdW,0.0,"{'lattice_mat': [[1.6777483798834445, -2.90594...",na,0.0,...,5.718,0.23,"[FD-ELAST,JVASP-10.zip,https://ndownloader.fig...",3,48.79,33.05,0.0,na,mp-694,-Se-V


In [7]:
# Count number of entries for each property
for i in df.columns.values:
  val=df[i].replace('na',pd.NA).dropna().values
  print(i,len(val))

jid 75993
spg_number 75993
spg_symbol 75993
formula 75993
formation_energy_peratom 75993
func 75993
optb88vdw_bandgap 75993
atoms 75993
slme 9770
magmom_oszicar 71320
spillage 11377
elastic_tensor 25513
effective_masses_300K 75993
kpoint_length_unit 75671
maxdiff_mesh 5861
maxdiff_bz 5861
encut 75670
optb88vdw_total_energy 75993
epsx 52168
epsy 52168
epsz 52168
mepsx 18293
mepsy 18293
mepsz 18293
modes 13910
magmom_outcar 74261
max_efg 11871
avg_elec_mass 17645
avg_hole_mass 17645
icsd 75993
dfpt_piezo_max_eij 4799
dfpt_piezo_max_dij 3347
dfpt_piezo_max_dielectric 4706
dfpt_piezo_max_dielectric_electronic 4809
dfpt_piezo_max_dielectric_ionic 4809
max_ir_mode 4805
min_ir_mode 4809
n-Seebeck 23218
p-Seebeck 23218
n-powerfact 23218
p-powerfact 23218
ncond 23218
pcond 23218
nkappa 23218
pkappa 23218
ehull 75993
Tc_supercon 1058
dimensionality 75560
efg 75993
xml_data_link 75993
typ 75993
exfoliation_energy 813
spg 75993
crys 75993
density 75993
poisson 23597
raw_files 75993
nat 75993
bulk_

In [8]:
# Define known transparent semiconductor formulas
transparent_formulas = [
    "In2O3", "ZnO", "SnO2", "Ga2O3", "TiO2", "SrTiO3", "BaSnO3", "SrVO3",
    "Al2O3", "SiO2", "MgO", "GaN", "SiC"
]

# Search DataFrame for matches
df_transparent_known = df[df["formula"].isin(transparent_formulas)].copy()

logger.info(f"Known transparent semiconductors found: {df_transparent_known.shape[0]}")
#print(df_transparent_known[["jid", "formula", "optb88vdw_bandgap"]])
logger.info("\n%s", df_transparent_known[["jid", "formula", "optb88vdw_bandgap"]].to_string());
flush_logger(logger)

2025-11-19 20:02:29,624 - jarvis_project - INFO - Known transparent semiconductors found: 218
2025-11-19 20:02:29,631 - jarvis_project - INFO - 
                jid formula  optb88vdw_bandgap
247     JVASP-22644     SiC              2.273
539     JVASP-97862    SiO2              5.621
596     JVASP-49837   Al2O3              4.737
935     JVASP-97444    SiO2              5.205
1175       JVASP-32   Al2O3              6.430
1683    JVASP-42549    TiO2              2.665
2078    JVASP-98776    SiO2              5.765
2168    JVASP-98531    SiO2              5.692
2272    JVASP-98532    SiO2              5.497
2476    JVASP-90247    TiO2              0.951
2906    JVASP-97419    SnO2              0.964
3031    JVASP-97332    SiO2              5.644
3212    JVASP-97422    SiO2              5.630
3367    JVASP-97720    SiO2              0.000
3564    JVASP-85600   Al2O3              1.213
4124    JVASP-86325  BaSnO3              0.578
4175    JVASP-88050    SiO2              5.603
4366    J

## Baseline Model

In [9]:


# --- Utilities ---

def coerce_numeric(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df


def drop_empty_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop columns that are entirely NaN if numeric. Keep object columns (categorical)
    and any column with at least one non-missing numeric value.
    """
    keep = []
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            if df[col].notna().any():
                keep.append(col)
        else:
            # keep non-numeric only if categorical (object)
            if df[col].dtype == "object":
                keep.append(col)
    return df[keep]

def flatten_dict_column(df, col, prefix):
    """
    Expand a dict-like column into separate numeric columns.
    """
    expanded = df[col].apply(lambda d: d if isinstance(d, dict) else {})
    expanded_df = pd.json_normalize(expanded)
    expanded_df = expanded_df.add_prefix(f"{prefix}_")
    return pd.concat([df.drop(columns=[col]), expanded_df], axis=1)

def safe_list_stats(lst):
    """Compute mean/std/min/max for a list, ignoring non-numeric entries."""
    if not isinstance(lst, (list, tuple)) or len(lst) == 0:
        return (np.nan, np.nan, np.nan, np.nan)
    nums = []
    for x in lst:
        if isinstance(x, (int, float, np.number)):
            nums.append(float(x))
        else:
            try:
                nums.append(float(x))
            except (ValueError, TypeError):
                continue
    if len(nums) == 0:
        return (np.nan, np.nan, np.nan, np.nan)
    return (np.mean(nums), np.std(nums), np.min(nums), np.max(nums))

def flatten_list_column(df, col, prefix):
    """
    Flatten a list-like column into numeric summary stats using safe_list_stats.
    """
    stats = df[col].apply(safe_list_stats)
    df[f"{prefix}_mean"] = stats.apply(lambda t: t[0])
    df[f"{prefix}_std"]  = stats.apply(lambda t: t[1])
    df[f"{prefix}_min"]  = stats.apply(lambda t: t[2])
    df[f"{prefix}_max"]  = stats.apply(lambda t: t[3])
    return df.drop(columns=[col])

def sanitize_features(df: pd.DataFrame, drop_cols=None) -> pd.DataFrame:
    """
    Two-pass sanitizer:
    1) Flatten dict-like columns across the entire frame.
    2) Flatten list-like columns across the entire frame.
    Drop structural columns listed in drop_cols first.
    """
    if drop_cols:
        df = df.drop(columns=drop_cols, errors="ignore")

    # Pass 1: flatten dicts everywhere
    for col in list(df.columns):
        if df[col].apply(lambda v: isinstance(v, dict)).any():
            df = flatten_dict_column(df, col, col)

    # Pass 2: flatten lists everywhere (after dict flattening)
    for col in list(df.columns):
        if df[col].apply(lambda v: isinstance(v, (list, tuple))).any():
            df = flatten_list_column(df, col, col)

    return df

def drop_residual_non_scalars(df: pd.DataFrame) -> pd.DataFrame:
    """
    Final sweep: drop any columns that still contain lists or dicts.
    """
    to_drop = []
    for col in df.columns:
        if df[col].apply(lambda v: isinstance(v, (list, tuple, dict))).any():
            to_drop.append(col)
    return df.drop(columns=to_drop, errors="ignore")

# --- Feature builder and modeling ---

def build_features(df: pd.DataFrame, config: dict) -> pd.DataFrame:
    df = df.copy()

    # Transparency label from config threshold
    bandgap_col = config["filters"]["bandgap_column"]
    trans_min   = config["filters"]["transparent_min"]
    df["is_transparent"] = pd.to_numeric(df[bandgap_col], errors="coerce") > trans_min

    # Dielectric features
    for c in ["epsx", "epsy", "epsz"]:
        if c not in df.columns:
            df[c] = np.nan
        else:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    df["eps_mean"] = df[["epsx","epsy","epsz"]].mean(axis=1, skipna=True)
    df["eps_std"]  = df[["epsx","epsy","epsz"]].std(axis=1, skipna=True)

    # Flatten known dict-like
    if "effective_masses_297K" in df.columns:
        df["electron_mass"] = df["effective_masses_297K"].apply(
            lambda d: d.get("electron_mass") if isinstance(d, dict) else np.nan
        )
        df["hole_mass"] = df["effective_masses_297K"].apply(
            lambda d: d.get("hole_mass") if isinstance(d, dict) else np.nan
        )
        df = df.drop(columns=["effective_masses_297K"])

    # Flatten known list-like
    for col in ["effective_masses_300K_p", "effective_masses_300K_n"]:
        if col in df.columns:
            df = flatten_list_column(df, col, col)

    # Possible dict wrappers for averages
    for col in ["avg_elec_mass", "avg_hole_mass"]:
        if col in df.columns:
            df[col] = df[col].apply(lambda d: d.get("value") if isinstance(d, dict) else d)
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Bag-of-elements
    if config["features"].get("bag_of_elements", True):
        element_set = ["O","N","F","Al","Ga","In","Zn","Sn","Mg","Ti","Si","C"]
        for el in element_set:
            df[f"has_{el}"] = df["formula"].str.contains(rf"\b{el}\b", regex=True, na=False).astype(int)

    # Sanitize broadly, drop structural artifacts
    df = sanitize_features(df, drop_cols=["atoms", "elastic_tensor", "raw_files"])

    # Final sweep to remove any residual non-scalars
    df = drop_residual_non_scalars(df)

    # Drop empty numeric features
    df = drop_empty_features(df)

    return df

def split_train_val_known(df: pd.DataFrame, config: dict):
    known_list = set(config["known"]["transparent_formulas"])
    in_known = df["formula"].isin(known_list)
    df_known = df[in_known].copy()
    df_main  = df[~in_known].copy()

    X = df_main.drop(columns=config["ml"]["exclude_columns"])
    y = df_main[config["ml"]["target_column"]].astype(int)

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    return (X_train, X_val, y_train, y_val), df_known


In [10]:
def build_baseline_pipeline(X_train: pd.DataFrame, config: dict):
    # Explicitly separate numeric vs categorical
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

    numeric = Pipeline(steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler())
    ])
    categorical = Pipeline(steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("encode", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])

    preproc = ColumnTransformer(
        transformers=[
            ("num", numeric, numeric_cols),
            ("cat", categorical, cat_cols)
        ],
        remainder="drop"
    )

    model_choice = config["ml"].get("model", "logreg")
    if model_choice == "logreg":
        clf = LogisticRegression(max_iter=500, class_weight="balanced")
    else:
        from sklearn.ensemble import RandomForestClassifier
        clf = RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=42)

    pipeline = Pipeline(steps=[("preproc", preproc), ("clf", clf)])
    return pipeline

In [11]:

# --- Usage & testing ---

df_feat = build_features(df, config)

# Coerce known numeric-like columns to numeric
numeric_like = ["spg_number", "encut", "poisson", "bulk_modulus_kv", "shear_modulus_gv", "exfoliation_energy"]
df_feat = coerce_numeric(df_feat, numeric_like)

# Diagnostics: ensure no lists/dicts remain
exclude = config["ml"]["exclude_columns"]
target = config["ml"]["target_column"]
cols_to_check = [c for c in df_feat.columns if c not in exclude and c != target]

residual_lists = [c for c in cols_to_check if df_feat[c].apply(lambda v: isinstance(v, (list, tuple))).any()]
residual_dicts = [c for c in cols_to_check if df_feat[c].apply(lambda v: isinstance(v, dict)).any()]
logger.info(f"Residual list columns: {residual_lists}")
logger.info(f"Residual dict columns: {residual_dicts}")
assert len(residual_lists) == 0 and len(residual_dicts) == 0, "Non-scalar columns remain after sanitization."

# Split train/val and known holdout
(X_train, X_val, y_train, y_val), df_known = split_train_val_known(df_feat, config)

# Force categorical columns to string type
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()
for c in cat_cols:
    X_train[c] = X_train[c].astype(str)
    X_val[c] = X_val[c].astype(str)
    if c in df_known.columns:
        df_known[c] = df_known[c].astype(str)

# Drop High‑Cardinality Categorical Columns from all sets
high_cardinality = ["xml_data_link", "reference", "search", "icsd"]
for dataset_name, dataset in [("train", X_train), ("val", X_val), ("known", df_known)]:
    drop_cols = [c for c in high_cardinality if c in dataset.columns]
    if drop_cols:
        logger.info(f"Dropping high-cardinality identifier columns from {dataset_name}: {drop_cols}")
        dataset.drop(columns=drop_cols, inplace=True, errors="ignore")

# Diagnostics: show numeric vs categorical split
numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols     = X_train.select_dtypes(include=["object", "category"]).columns.tolist()
logger.info(f"Numeric feature columns: {numeric_cols}")
logger.info(f"Categorical feature columns: {cat_cols}")

# High‑Cardinality Categorical Features → frequency encoding
for col in cat_cols:
    if col in X_train.columns and X_train[col].nunique() > 50:
        logger.info(f"Applying frequency encoding to high-cardinality column: {col}")
        freqs = X_train[col].value_counts(normalize=True)
        X_train[col] = X_train[col].map(freqs)
        X_val[col]   = X_val[col].map(freqs)
        if col in df_known.columns:
            df_known[col] = df_known[col].map(freqs)

# Build pipeline with categorical encoding
pipe = build_baseline_pipeline(X_train, config)


2025-11-19 20:02:38,928 - jarvis_project - INFO - Residual list columns: []
2025-11-19 20:02:38,930 - jarvis_project - INFO - Residual dict columns: []
2025-11-19 20:02:39,718 - jarvis_project - INFO - Dropping high-cardinality identifier columns from train: ['xml_data_link', 'reference', 'search', 'icsd']
2025-11-19 20:02:39,807 - jarvis_project - INFO - Dropping high-cardinality identifier columns from val: ['xml_data_link', 'reference', 'search', 'icsd']
2025-11-19 20:02:39,833 - jarvis_project - INFO - Dropping high-cardinality identifier columns from known: ['xml_data_link', 'reference', 'search', 'icsd']
2025-11-19 20:02:39,987 - jarvis_project - INFO - Numeric feature columns: ['spg_number', 'formation_energy_peratom', 'encut', 'optb88vdw_total_energy', 'epsx', 'epsy', 'epsz', 'avg_elec_mass', 'avg_hole_mass', 'ehull', 'exfoliation_energy', 'density', 'poisson', 'nat', 'bulk_modulus_kv', 'shear_modulus_gv', 'eps_mean', 'eps_std', 'has_O', 'has_N', 'has_F', 'has_Al', 'has_Ga', 'h

In [12]:
for col in X_train.columns:
    if X_train[col].apply(lambda v: isinstance(v, list)).any():
        print("Lists found in:", col)

In [13]:
# Fit and evaluate
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_val)
print("\nValidation performance:")
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

# External evaluation on known transparent formulas
X_known = df_known.drop(columns=config["ml"]["exclude_columns"])
y_known = df_known["is_transparent"].astype(int)
y_known_pred = pipe.predict(X_known)

print("\nExternal evaluation (known formulas):")
print(classification_report(y_known, y_known_pred))
print(confusion_matrix(y_known, y_known_pred))


Validation performance:
              precision    recall  f1-score   support

           0       1.00      0.88      0.94     13995
           1       0.40      0.95      0.56      1160

    accuracy                           0.89     15155
   macro avg       0.70      0.92      0.75     15155
weighted avg       0.95      0.89      0.91     15155

[[12360  1635]
 [   60  1100]]

External evaluation (known formulas):
              precision    recall  f1-score   support

           0       0.98      0.46      0.62       118
           1       0.61      0.99      0.75       100

    accuracy                           0.70       218
   macro avg       0.79      0.72      0.69       218
weighted avg       0.81      0.70      0.68       218

[[54 64]
 [ 1 99]]
