# Prepare the data for running the inferences in NOMAD

In [4]:
import pandas as pd
from tf_chpvk_pv.config import PROCESSED_DATA_DIR

# Load the stable compositions. The unnamed first CSV column is relabelled
# 'composition', and two constant columns are appended: 4 formula units per
# cell and an empty space-group string.
df = (
    pd.read_csv(PROCESSED_DATA_DIR / 'stable_compositions.csv')
    .rename(columns={'Unnamed: 0': 'composition'})
    .assign(num_formula_units_per_cell=4, space_group='')
)

# Only these three columns are written to the CrystaLLM input file.
crystallm_columns = ['composition', 'num_formula_units_per_cell', 'space_group']
final_df = df[crystallm_columns]

final_df.to_csv(PROCESSED_DATA_DIR / 'stable_compositions_for_CrystaLLM.csv', index=False)

[32m2026-02-12 11:53:04.696[0m | [1mINFO    [0m | [36mtf_chpvk_pv.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/dagar/TF-ChPVK-PV[0m


The file `stable_compositions_for_CrystaLLM.csv` can then be uploaded to [NOMAD](https://nomad-lab.eu/prod/v1/oasis/gui/user/uploads/upload/id/rAGhkvDaTgyQPb_k3NcFbg).

# Analyze the inferences made in NOMAD

In [5]:
import json
import os
from tf_chpvk_pv.config import CRYSTALLM_DATA_DIR

# Directory holding the JSON files to analyse; point this at your own copy.
directory_path = CRYSTALLM_DATA_DIR / 'json_files'

# Parsed archives, keyed by "<filename>_<filename>".
json_data = {}

for filename in os.listdir(directory_path):
    # Anything that is not a JSON file is ignored.
    if not filename.endswith('.json'):
        continue

    filepath = os.path.join(directory_path, filename)
    with open(filepath, 'r') as handle:
        try:
            payload = json.load(handle)
        except json.JSONDecodeError:
            # Malformed files are reported and skipped.
            print(f"Error decoding JSON from file: {filename}")
        else:
            json_data[f"{filename}_{filename}"] = payload

In [6]:
import pandas as pd
from tf_chpvk_pv.config import CRYSTALLM_DATA_DIR

import math

# Conversions applied to the quantities of the 'cell' section before they are
# stored. Target units are given per entry; the archive values are assumed to
# be in SI units (m, rad, kg/m^3) -- TODO confirm against the NOMAD metainfo.
CELL_UNIT_CONVERSIONS = {
    'a': lambda value: value * 10**10,                 # Angstrom
    'b': lambda value: value * 10**10,                 # Angstrom
    'c': lambda value: value * 10**10,                 # Angstrom
    'volume': lambda value: value * 10**30,            # Angstrom^3
    'atomic_density': lambda value: value * 10**-30,   # 1 / Angstrom^3
    'alpha': lambda value: value * 180 / math.pi,      # degrees
    'beta': lambda value: value * 180 / math.pi,       # degrees
    'gamma': lambda value: value * 180 / math.pi,      # degrees
    'mass_density': lambda value: value / 1000,        # g/cm^3
}

# One row per loaded archive, in the iteration order of json_data. Rows are
# pre-allocated, so an archive that cannot be parsed keeps an empty (NaN) row.
results = pd.DataFrame(index=range(len(json_data)),
                       columns=['material', 'atoms',
                                'a', 'b', 'c', 'alpha', 'beta', 'gamma',
                                'volume', 'atomic_density', 'mass_density'])

for row, (key, entry) in enumerate(json_data.items()):
    try:
        # Only the first topology entry of each archive is used.
        topology = entry['archive']['results']['material']['topology'][0]
        chemical_formula_iupac = topology['chemical_formula_iupac']
    except (KeyError, IndexError, TypeError):
        # The archive lacks the expected structure: report it and move on.
        # .get() keeps the report itself from failing when 'archive' is the
        # missing key.
        print(key)
        print(entry.get('archive') if isinstance(entry, dict) else entry)
        continue

    # The atoms section is a nested mapping; it is stored as its string
    # representation so that it fits in a single table cell.
    atoms_string = str(topology['atoms'])
    results.loc[row, 'material'] = chemical_formula_iupac
    results.loc[row, 'atoms'] = atoms_string

    for name, value in topology['cell'].items():
        convert = CELL_UNIT_CONVERSIONS.get(name)
        # Quantities without a registered conversion are copied unchanged.
        results.loc[row, name] = convert(value) if convert is not None else value


results.to_csv(CRYSTALLM_DATA_DIR / 'results CrystaLLM.csv')

In [7]:
# Display the first rows of the results table as a quick visual check.
results.head()

Unnamed: 0,material,atoms,a,b,c,alpha,beta,gamma,volume,atomic_density,mass_density
0,TbZnS3,{'m_def': 'nomad.datamodel.metainfo.system.Ato...,7.1784,8.7559,6.3013,90.0,90.0,90.0,396.05783,0.050498,5.375006
1,SrUSe3,{'m_def': 'nomad.datamodel.metainfo.system.Ato...,8.0312,10.4242,7.0155,90.0,90.0,90.0,587.329487,0.034052,6.361685
2,UCeSe3,{'m_def': 'nomad.datamodel.metainfo.system.Ato...,8.3754,10.4925,6.2499,90.0,90.0,90.0,549.23424,0.036414,7.437795
3,AlSnSe3,{'m_def': 'nomad.datamodel.metainfo.system.Ato...,6.9309,10.3087,7.2282,90.0,90.0,90.0,516.444545,0.038726,4.920373
4,TbEuS3,{'m_def': 'nomad.datamodel.metainfo.system.Ato...,7.6155,9.2152,6.4995,90.0,90.0,90.0,456.124222,0.043848,5.92803


# Validation with ICSD data

In [None]:
import os
from pymatgen.core import Structure
import pandas as pd

# NOTE(review): absolute, machine-specific path. It presumably corresponds to
# CRYSTALLM_DATA_DIR / 'ICSD_files' -- confirm and derive it from the config
# so the notebook runs on other machines. Kept as a plain string because a
# later cell concatenates it with "/".
ICSD_files = '/home/dagar/TF-ChPVK-PV/data/crystaLLM/ICSD_files'

# Column layout of the reference table. Passing it explicitly keeps the
# DataFrame well-formed (and the 'filename' drop below valid) even when no
# CIF file could be read.
ICSD_COLUMNS = ['filename', 'formula', 'formula_pretty',
                'a', 'b', 'c', 'alpha', 'beta', 'gamma',
                'volume', 'space_group', 'num_sites']

icsd_data = []
# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the table and of the exported CSV reproducible.
for filename in sorted(os.listdir(ICSD_files)):
    if not filename.endswith('.cif'):
        continue
    filepath = os.path.join(ICSD_files, filename)
    try:
        struct = Structure.from_file(filepath)
        icsd_data.append({
            'filename': filename,
            'formula': struct.composition.reduced_formula,
            'formula_pretty': struct.composition.alphabetical_formula,
            'a': struct.lattice.a,
            'b': struct.lattice.b,
            'c': struct.lattice.c,
            'alpha': struct.lattice.alpha,
            'beta': struct.lattice.beta,
            'gamma': struct.lattice.gamma,
            'volume': struct.volume,
            'space_group': struct.get_space_group_info()[0],
            'num_sites': len(struct)
        })
    except Exception as e:
        # Best effort: a file that cannot be read is reported and skipped.
        print(f"Error reading {filename}: {e}")

icsd_df = pd.DataFrame(icsd_data, columns=ICSD_COLUMNS)
# 'filename' stays in icsd_df; it is only left out of the exported CSV.
icsd_df.drop(columns=['filename']).to_csv(CRYSTALLM_DATA_DIR / 'icsd_ref_data.csv', index=False)

In [22]:
from pymatgen.core import Structure
from pymatgen.analysis.structure_matcher import StructureMatcher


# Directory with the CIF files generated by CrystaLLM ("<formula>_1.cif").
cif_files_dict = CRYSTALLM_DATA_DIR / 'cif_files'

# Matcher settings, more permissive than the pymatgen defaults.
matcher = StructureMatcher(
    ltol=0.3,              # fractional length tolerance (30%)
    stol=0.5,              # site tolerance, as a fraction of (V / n_sites)^(1/3)
    angle_tol=10,          # angle tolerance in degrees
    primitive_cell=True,   # reduce both to primitive cell before comparing
    scale=True,            # allow volume scaling
    attempt_supercell=True # allow matching different cell sizes
)

# NOTE(review): the recorded output reports Match=False for CeScS3 although
# lattice, space group and site count agree. One possible cause is that the
# ICSD CIFs carry oxidation states while the CrystaLLM files do not, which the
# default species comparator treats as different species -- confirm, and
# consider comparator=ElementComparator() if so.
# NOTE(review): the per-axis deltas below compare a/b/c as stored in each
# file; structures given in different axis settings (see PrScS3 in the
# recorded output) show large deltas for that reason alone.

# Iterate the two columns together instead of using the enumerate position as
# a .loc label, which is only correct for a default RangeIndex.
for formula, ref_filename in zip(icsd_df['formula'], icsd_df['filename']):
    cif_file = cif_files_dict / f"{formula}_1.cif"
    cif_file_ref = os.path.join(ICSD_files, ref_filename)

    # A formula without a generated structure is reported and skipped rather
    # than aborting the remaining comparisons.
    if not os.path.exists(cif_file):
        print(f"Skipping {formula}: CrystaLLM file not found: {cif_file}")
        print()
        continue

    struct1 = Structure.from_file(cif_file)      # CrystaLLM prediction
    struct2 = Structure.from_file(cif_file_ref)  # ICSD reference

    # Compare structures
    match = matcher.fit(struct1, struct2)  # True if the structures match
    # (rms, max distance) tuple, or None when there is no match
    rms = matcher.get_rms_dist(struct1, struct2)

    # Get space groups
    sg1 = struct1.get_space_group_info()[0]
    sg2 = struct2.get_space_group_info()[0]

    # Percent differences relative to the ICSD reference
    delta_a = abs(struct1.lattice.a - struct2.lattice.a) / struct2.lattice.a * 100
    delta_b = abs(struct1.lattice.b - struct2.lattice.b) / struct2.lattice.b * 100
    delta_c = abs(struct1.lattice.c - struct2.lattice.c) / struct2.lattice.c * 100
    delta_v = abs(struct1.volume - struct2.volume) / struct2.volume * 100

    # Print lattice parameters
    print(f"Comparing {formula}:")
    print(f"  CrystaLLM: a={struct1.lattice.a:.4f}, b={struct1.lattice.b:.4f}, c={struct1.lattice.c:.4f}")
    print(f"            alpha={struct1.lattice.alpha:.2f}, beta={struct1.lattice.beta:.2f}, gamma={struct1.lattice.gamma:.2f}")
    print(f"            Space group: {sg1}, Num sites: {len(struct1)}")
    print(f"  ICSD:      a={struct2.lattice.a:.4f}, b={struct2.lattice.b:.4f}, c={struct2.lattice.c:.4f}")
    print(f"            alpha={struct2.lattice.alpha:.2f}, beta={struct2.lattice.beta:.2f}, gamma={struct2.lattice.gamma:.2f}")
    print(f"            Space group: {sg2}, Num sites: {len(struct2)}")
    print(f"  Δa={delta_a:.1f}%, Δb={delta_b:.1f}%, Δc={delta_c:.1f}%, ΔV={delta_v:.1f}%")
    print(f"  Match={match}, RMS Distance={rms}")
    print()

Comparing PrScS3:
  CrystaLLM: a=7.2313, b=9.6576, c=6.5144
            alpha=90.00, beta=90.00, gamma=90.00
            Space group: Pnma, Num sites: 20
  ICSD:      a=7.1100, b=6.4900, c=9.5300
            alpha=90.00, beta=90.00, gamma=90.00
            Space group: Pna2_1, Num sites: 20
  Δa=1.7%, Δb=48.8%, Δc=31.6%, ΔV=3.5%
  Match=False, RMS Distance=None

Comparing CeScS3:
  CrystaLLM: a=7.2673, b=9.5785, c=6.5061
            alpha=90.00, beta=90.00, gamma=90.00
            Space group: Pnma, Num sites: 20
  ICSD:      a=7.1618, b=9.5621, c=6.5065
            alpha=90.00, beta=90.00, gamma=90.00
            Space group: Pnma, Num sites: 20
  Δa=1.5%, Δb=0.2%, Δc=0.0%, ΔV=1.6%
  Match=False, RMS Distance=None

Comparing GdScS3:
  CrystaLLM: a=6.9914, b=6.3600, c=9.5392
            alpha=90.00, beta=90.00, gamma=90.00
            Space group: Pna2_1, Num sites: 20
  ICSD:      a=7.0361, b=9.4574, c=6.3833
            alpha=90.00, beta=90.00, gamma=90.00
            Space group: Pn