**Artificial Intelligence-Aided Analysis of Hydrogen-Based Monoclinic Structures and Modeling of Structure-Property Relationships**

**4# Composition-Based Feature Engineering**

Göktuğ USTA & Sedef KORKMAZ | Izmir Democracy University - Electrical and Electronics Engineering Department - 2025

goktugustaa@gmail.com 

sedefkorkmaz67@hotmail.com

Data retrieved from The Materials Project API (https://next-gen.materialsproject.org/).

In [1]:
from pymatgen.core import Composition, Element
import numpy as np

In [2]:
def derive_atomic_fingerprints(formula: str) -> "dict[str, float] | None":
    """Build composition-weighted elemental descriptors for a formula.

    Every elemental property is multiplied by the element's atomic fraction
    in the composition and the products are summed, so each returned value
    is an atomic-fraction-weighted mean over the formula.

    Args:
        formula: Chemical formula handed to pymatgen's ``Composition``
            (for example ``"NH3"``).

    Returns:
        A dict with the keys ``avg_atomic_mass``, ``avg_atomic_radius``,
        ``avg_electronegativity`` and ``avg_valence_electrons``, or ``None``
        when the formula cannot be processed. Failures are reported with
        ``print`` instead of being raised.
    """
    try:
        comp = Composition(formula)
        total_atoms = comp.num_atoms
        if total_atoms == 0:
            # An atomless composition would skip the loop below and yield an
            # all-zero fingerprint that callers would treat as a valid one.
            raise ValueError("composition contains no atoms")

        avg_mass = 0.0
        avg_radius = 0.0
        avg_electronegativity = 0.0
        avg_group = 0.0

        for symbol, amount in comp.get_el_amt_dict().items():
            el = Element(symbol)
            weight = amount / total_atoms

            avg_mass += float(el.atomic_mass) * weight
            # A missing (falsy) radius contributes 0 instead of failing.
            avg_radius += float(el.atomic_radius or 0) * weight
            # NOTE(review): pymatgen documents X as NaN for elements without
            # a Pauling electronegativity; that NaN would propagate into the
            # average - confirm this is acceptable downstream.
            avg_electronegativity += el.X * weight
            # NOTE(review): ``group`` is the periodic-table group number, used
            # here as a stand-in for the valence electron count. The key name
            # is kept because fitted models depend on it.
            avg_group += float(el.group) * weight

        return {
            "avg_atomic_mass": avg_mass,
            "avg_atomic_radius": avg_radius,
            "avg_electronegativity": avg_electronegativity,
            "avg_valence_electrons": avg_group,
        }
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and let the
        # caller decide what to do with the missing fingerprint.
        print(f"Error {formula}:{e}")
        return None

# Confirmation message printed once this cell has finished executing.
print("Ready.")


Ready.


In [3]:
import pandas as pd

# Load the dataset from a CSV file given by a relative path.
data = pd.read_csv("monoclinic_hydrogen_data.csv")
# NOTE(review): read_csv already returns a DataFrame, so this re-wrap looks
# redundant - confirm nothing relies on `data` and `df` being separate objects.
df = pd.DataFrame(data)
# Preview the first rows (shown as the cell output in a notebook).
df.head()

Unnamed: 0,material_id,formula,elements,volume,density,band_gap,magnetization,lattice_a,lattice_b,lattice_c
0,mp-995200,HC3,"[Element C, Element H]",73.158575,1.681455,0.0,1.2e-05,2.463933,3.663947,8.107367
1,mp-1217971,Ta2H,"[Element H, Element Ta]",38.193485,15.777973,0.0,0.0,2.919064,2.919064,4.883885
2,mp-642644,V2H,"[Element H, Element V]",28.577852,5.978561,0.0,5e-06,2.656866,2.656866,4.428967
3,mp-995184,HC2,"[Element C, Element H]",58.118465,1.430258,0.0,1.4e-05,6.33287,6.33287,3.813987
4,mp-995197,HC,"[Element C, Element H]",148.834584,1.161986,3.5572,0.001015,6.10339,6.10339,4.861911


In [6]:
# One fingerprint per formula; entries are None where featurization failed.
features_list = [derive_atomic_fingerprints(formula) for formula in df["formula"]]

# Rows without a fingerprint must be removed from BOTH the features and the
# target, otherwise the DataFrame construction breaks or X and y fall out of
# step with each other.
is_valid = np.array([feat is not None for feat in features_list], dtype=bool)
skipped = int((~is_valid).sum())
if skipped:
    print(f"Skipped {skipped} formula(s) that could not be featurized.")

# Keep the original row labels so features and targets stay aligned.
df_features = pd.DataFrame(
    [feat for feat in features_list if feat is not None],
    index=df.index[is_valid],
)

X_new = df_features
y_new = df.loc[is_valid, "band_gap"]

print("Train Test Ready.")
print(X_new.head())

Train Test Ready.
   avg_atomic_mass  avg_atomic_radius  avg_electronegativity  \
0         9.260010           0.587500               2.462500   
1       120.967900           1.050000               1.733333   
2        34.296980           0.983333               1.820000   
3         8.343113           0.550000               2.433333   
4         6.509320           0.475000               2.375000   

   avg_valence_electrons  
0              10.750000  
1               3.666667  
2               3.666667  
3               9.666667  
4               7.500000  


In [7]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [8]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X_new,
    y_new,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then reuse it for the test rows.
scaler_new = StandardScaler()
X_train_scaled_new = scaler_new.fit_transform(X_train_new)
X_test_scaled_new = scaler_new.transform(X_test_new)

# Gradient-boosted regressor trained on the scaled fingerprints.
xgb_bandgap_estimator = XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
)
xgb_bandgap_estimator.fit(X_train_scaled_new, y_train_new)

# Score the held-out rows with the coefficient of determination.
y_predict_new = xgb_bandgap_estimator.predict(X_test_scaled_new)
holdout_r2 = r2_score(y_test_new, y_predict_new)
print(f"New model: {holdout_r2:.4f}")

New model: 0.2405


In [9]:
def material_informatics_engine():
    """Interactively predict and classify the band gap of one formula.

    Prompts for a chemical formula on standard input, derives its
    fingerprint with ``derive_atomic_fingerprints``, scales it with the
    already fitted ``scaler_new`` and predicts the band gap with
    ``xgb_bandgap_estimator``. The prediction is printed together with a
    label: below 0.05 is reported as a metal, below 2.0 as a semiconductor
    and anything else as an insulator.

    Returns:
        None. All results are written to standard output.
    """
    print("Agentic Panel")
    formula_input = input("Enter a formula: ").strip()
    feats = derive_atomic_fingerprints(formula_input)

    # Guard clause: no fingerprint means the formula could not be processed.
    if not feats:
        print("Invalid formula. Please try again")
        return

    # A single-row frame keeps the column names the scaler was fitted with.
    input_vector = pd.DataFrame([feats])
    input_scaled = scaler_new.transform(input_vector)
    predicted_bg = xgb_bandgap_estimator.predict(input_scaled)[0]

    print("-" * 40)
    # Single-quoted keys inside the f-string: reusing the enclosing double
    # quotes is a SyntaxError on interpreters older than Python 3.12.
    print(
        f"Formula Input: {formula_input}\n"
        f"Atomic Mass: {feats['avg_atomic_mass']}\n"
        f" Radius: {feats['avg_atomic_radius']}"
    )
    print(f"Predicted Band Gap: {predicted_bg}")

    if predicted_bg < 0.05:
        print("Metal (Conductor)")
    elif predicted_bg < 2.0:
        print("Semi Conductor")
    else:
        print("Insulator")

        

In [11]:
# Run the interactive predictor; it waits for a formula typed on standard input.
material_informatics_engine()

Agentic Panel
----------------------------------------
Formula Input: NH3
Atomic Mass: 4.25763
 Radius: 0.35
Predicted Band Gap: 5.2018232345581055
Insulator
