Relevant literature

- https://www.researchsquare.com/article/rs-3553470/v1
- https://www.nature.com/articles/s41467-020-18661-9#MOESM1

Data sets

- https://www.sciencebase.gov/catalog/item/6798fd34d34ea8c18376e8ee
- https://datacatalog.worldbank.org/search/dataset/0037651
- https://en.wikipedia.org/wiki/Abundance_of_elements_in_Earth%27s_crust

In [1]:
# Dataset from https://www.sciencebase.gov/catalog/item/6798fd34d34ea8c18376e8ee

from tf_chpvk_pv.config import SUSTAINABILITY_DATA_DIR
import pandas as pd
import numpy as np
import os


# Reporting year; selects which production column of the table is used.
year = 2023

# Production column for each supported year.
# NOTE(review): the 2024 name contains a space; presumably it mirrors the CSV
# header exactly — confirm before switching `year` to 2024.
year_name = {2023: 'PROD_2023',
             2024: 'PROD_EST_ 2024'}

columns = ['COUNTRY', 'COMMODITY', year_name[year]]

df = pd.read_csv(SUSTAINABILITY_DATA_DIR / 'MCS2025_World_Data.csv')

# Drop every row whose COUNTRY contains 'World' so that aggregate rows are not
# counted as producers. Rows with a missing COUNTRY are kept (na=False).
# NOTE(review): other aggregate-like rows (e.g. an "other countries" entry), if
# present in the CSV, are still treated as a single producer — confirm.
df_no_world = df[~df['COUNTRY'].str.contains('World', na=False)].copy()

# Total production per commodity. Only the numeric production column is
# aggregated; summing the whole frame would also concatenate COUNTRY strings.
df_max_prod = df_no_world.groupby('COMMODITY')[year_name[year]].sum()

# Squared production share of each row within its commodity.
df_no_world['share_squared'] = (df_no_world[year_name[year]] / df_no_world['COMMODITY'].map(df_max_prod)) ** 2

# HHI per commodity = sum of the squared shares.
hhi = df_no_world.groupby('COMMODITY')['share_squared'].sum().reset_index()
hhi = hhi.rename(columns={'share_squared': 'HHI'})

[32m2025-12-21 16:17:33.734[0m | [1mINFO    [0m | [36mtf_chpvk_pv.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/dagar/TF-ChPVK-PV[0m


In [2]:
import json

mapping_dir = SUSTAINABILITY_DATA_DIR / 'mapping'
filename_mapping_element = mapping_dir / 'commodity_to_element.json'
filename_mapping_country = mapping_dir / 'country_codes.json'

# Commodity name -> element entry; attached to the HHI table as 'Element'.
# Commodities absent from the mapping get NaN.
with open(filename_mapping_element, 'r') as element_file:
    commodity_to_element = json.load(element_file)

hhi['Element'] = hhi['COMMODITY'].map(commodity_to_element)

# Country name -> country code; attached to the per-country production rows.
# Countries absent from the mapping get NaN.
with open(filename_mapping_country, 'r') as country_file:
    country_codes = json.load(country_file)

df_no_world['Country_Code'] = df_no_world['COUNTRY'].map(country_codes)


In [3]:
import re

# ESG dataset (year 2023) from https://datacatalog.worldbank.org/search/dataset/0037651

esg_csv_path = SUSTAINABILITY_DATA_DIR / 'ESG_World_Data_2023.csv'
df_esg = pd.read_csv(esg_csv_path)

# Rename columns in df_esg to the text between square brackets, if present
def extract_bracket(col):
    """Return the text inside the first ``[...]`` pair of *col*.

    If *col* contains no bracketed text it is returned unchanged.
    """
    bracketed = re.search(r'\[(.*?)\]', col)
    if bracketed is None:
        return col
    return bracketed.group(1)

# Replace each header by its bracketed part; headers without brackets are kept.
df_esg.columns = list(map(extract_bracket, df_esg.columns))

from scipy.stats import zscore

# Indicator columns for which a higher raw value is treated as lower risk.
# The normalization loop that follows negates the z-scores of these columns so
# that, after inversion, a higher value means higher risk.
positive_indicators = [
    'NV.AGR.TOTL.ZS',
    'ER.PTD.TOTL.ZS',
    'IT.NET.USER.ZS',
    'SL.TLF.ACTI.ZS',
    'SP.DYN.LE00.IN',
    'SG.GEN.PARL.ZS',
    'SE.PRM.ENRR',
    'CC.EST',
    'GE.EST',
    'PV.EST',
    'RQ.EST',
    'RL.EST',
    'VA.EST'
]  # higher value -> lower risk

# Indicator columns for which a higher raw value is treated as higher risk;
# their z-scores keep their sign.
# NOTE(review): this list is not referenced anywhere else in the visible code.
negative_indicators = [
    'EG.ELC.COAL.ZS',
    'EG.IMP.CONS.ZS',
    'EG.USE.PCAP.KG.OE',
    'EG.USE.COMM.FO.ZS',
    'SL.UEM.TOTL.ZS'
]  # higher value -> higher risk

def normalize_min_max(series):
    """Rescale *series* linearly so its minimum maps to 0 and its maximum to 1.

    A series whose minimum equals its maximum yields NaN (zero range).
    """
    lowest, highest = series.min(), series.max()
    value_range = highest - lowest
    return (series - lowest) / value_range

# Columns that identify a row rather than carry an indicator value.
non_indicator_columns = ('Country Name', 'Country Code', 'Time Code')

# The first column is skipped entirely; every remaining non-identifier column
# is coerced to numbers (unparseable entries become NaN) and z-scored with NaNs
# left out of the mean/std.
for col in df_esg.columns[1:]:
    is_indicator = col not in non_indicator_columns
    if is_indicator:
        numeric_values = pd.to_numeric(df_esg[col], errors='coerce')
        df_esg[col] = numeric_values

        standardized = zscore(df_esg[col].values, nan_policy='omit')
        df_esg[col] = pd.Series(standardized, index=df_esg.index)

    # Flip the sign of "positive" indicators so that higher always means higher risk.
    if col in positive_indicators:
        df_esg[col] = -df_esg[col]

# Pillar-specific scores: row-wise mean of the (sign-adjusted) z-scores of the
# indicators assigned to each pillar. `mean(axis=1)` skips NaNs, so a country is
# scored on whichever of its pillar indicators are available.
df_esg['E_score'] = df_esg[['NV.AGR.TOTL.ZS','ER.PTD.TOTL.ZS',
                            'EG.ELC.COAL.ZS','EG.IMP.CONS.ZS',
                            'EG.USE.PCAP.KG.OE','EG.USE.COMM.FO.ZS']].mean(axis=1)

df_esg['S_score'] = df_esg[['IT.NET.USER.ZS','SL.TLF.ACTI.ZS','SP.DYN.LE00.IN',
                            'SG.GEN.PARL.ZS','SE.PRM.ENRR','SL.UEM.TOTL.ZS']].mean(axis=1)

df_esg['G_score'] = df_esg[['CC.EST','GE.EST','PV.EST','RQ.EST','RL.EST','VA.EST']].mean(axis=1)

# Rescale each pillar to [0, 1]. The helper is called on the whole column
# directly: routing it through `Series.transform` with a lambda only works
# because pandas falls back to a whole-Series call after the element-wise
# attempt fails.
for col in ['E_score', 'S_score', 'G_score']:
    df_esg[col] = normalize_min_max(df_esg[col])

# Overall score = unweighted mean of the three rescaled pillars.
df_esg['ESG_score'] = df_esg[['E_score', 'S_score', 'G_score']].mean(axis=1)

# Lookup table: country code -> ESG score.
country_ESG_scores = df_esg.set_index('Country Code')['ESG_score'].to_dict()

In [4]:
# Attach the ESG score of each producing country via its country code; rows
# whose code is missing from `country_ESG_scores` get NaN.
df_no_world['ESG_score'] = df_no_world['Country_Code'].map(country_ESG_scores)

In [5]:
# SR per commodity: sum over producers of squared production share times the
# producer's ESG score. Rows with a missing share or ESG score are skipped by
# the sum (i.e. they contribute 0).
esg_weighted_share = df_no_world['share_squared'] * df_no_world['ESG_score']
sr_by_commodity = esg_weighted_share.groupby(df_no_world['COMMODITY']).sum()

# Align on the commodity name instead of on row position, so the result stays
# correct even if `hhi` has been re-ordered or filtered since it was built.
hhi['SR'] = hhi['COMMODITY'].map(sr_by_commodity)

In [6]:
# Earth abundance data from https://en.wikipedia.org/wiki/Abundances_of_the_elements_(data_page)

abundance_csv_path = SUSTAINABILITY_DATA_DIR / 'earth_abundance_data.csv'
df_earth = pd.read_csv(abundance_csv_path)

# Force the abundance column to numbers; entries that cannot be parsed become NaN.
abundance_numeric = pd.to_numeric(df_earth['Abundance (ppm)'], errors='coerce')
df_earth['Abundance (ppm)'] = abundance_numeric

def normalize_min_max(series):
    """Min-max scale *series* to the [0, 1] interval.

    Returns NaN everywhere when all values are equal (the range is zero).
    """
    floor = series.min()
    shifted = series - floor
    return shifted / (series.max() - floor)

# Abundance of every element relative to silicon ('Si' must be present in the
# table, otherwise the lookup below raises IndexError).
silicon_abundance = df_earth.loc[df_earth['Symbol'] == 'Si', 'Abundance (ppm)'].values[0]
relative_to_silicon = df_earth['Abundance (ppm)'] / silicon_abundance
df_earth['Silicio_normalized'] = relative_to_silicon

# Abundance risk: 0 for the most abundant element, 1 for the least abundant.
scaled_abundance = normalize_min_max(df_earth['Silicio_normalized'])
df_earth['normalized_abundance_risk'] = 1 - scaled_abundance

# Lookup table: element symbol -> abundance risk.
risk_by_symbol = df_earth.set_index('Symbol')['normalized_abundance_risk']
element_abundance = risk_by_symbol.to_dict()

In [7]:
def contains_element(val, target):
    """Tell whether the mapping entry *val* covers the element symbol *target*.

    *val* may be a list of symbols (membership test) or a single symbol string
    (exact match). Any other type, e.g. NaN or None, never matches.
    """
    if isinstance(val, list):
        return target in val
    return isinstance(val, str) and target == val


def find_hhi_score(formula, hhi=hhi, element_abundance=element_abundance):
    """Score a chemical formula on HHI, supply risk (SR) and abundance risk (AR).

    Parameters
    ----------
    formula : str
        Chemical formula. Element symbols are extracted with the pattern
        ``[A-Z][a-z]?``; stoichiometric amounts are ignored and a symbol that
        appears several times is counted once.
    hhi : pandas.DataFrame
        Per-commodity table with the columns ``'Element'`` (a symbol string or
        a list of symbols), ``'HHI'`` and ``'SR'``. Defaults to the module-level
        table that exists when this function is defined.
    element_abundance : dict
        Element symbol -> normalized abundance risk. Symbols missing from the
        mapping contribute 0 to the AR mean.

    Returns
    -------
    list
        ``[hhi_value, sr_value, ar_value, warn]``.

        * ``hhi_value`` / ``sr_value``: sum of the ``'HHI'`` / ``'SR'`` of every
          commodity row matching an element of the formula (a row matching
          several elements is counted once per element).
        * ``ar_value``: mean abundance risk over the elements of the formula.
        * ``warn``: ``''`` when every element was found in ``hhi``, otherwise a
          warning message.

        When no element of the formula is found at all, the three scores are
        ``None`` and ``warn`` says so; the list still has four entries so that
        callers can always unpack it into four columns.
    """
    # Unique symbols in order of appearance, so a repeated symbol cannot skew
    # the "all elements found" check or the AR mean.
    elements = list(dict.fromkeys(re.findall(r'[A-Z][a-z]?', formula)))

    # weights[row] = number of formula elements covered by that commodity row.
    weights = 0
    found = []
    for element in elements:
        mask = hhi['Element'].apply(lambda x, el=element: contains_element(x, el)).astype(bool)
        if mask.any():
            found.append(element)
        weights = weights + mask.astype(int)

    if not found:
        return [None, None, None,
                f"Warning: No elements found in HHI data for the compound {formula}"]

    warn = ''
    # Compare per element (not per matched row): one element may match several
    # commodity rows, which must not hide another element that is missing.
    if len(found) < len(elements):
        matched_entries = hhi.loc[weights != 0, 'Element'].values
        print(f"Warning: Not all elements found in HHI data for the compound {formula}, the elements found are:")
        print(matched_entries)

        warn = f"Warning: Not all elements found in HHI data for the compound {formula}, the elements found are: {matched_entries}"

    hhi_value = (hhi['HHI'] * weights).sum()
    sr_value = (hhi['SR'] * weights).sum()

    # Abundance risk is the mean of the normalized abundance risks of the elements.
    ar_value = sum(element_abundance.get(element, 0) for element in elements) / len(elements)

    return [hhi_value, sr_value, ar_value, warn]

In [8]:
# Try on CrystaLLM results
from tf_chpvk_pv.config import CRYSTALLM_DATA_DIR, PROCESSED_DATA_DIR

crystallm_csv_path = CRYSTALLM_DATA_DIR / 'results CrystaLLM.csv'
df_results = pd.read_csv(crystallm_csv_path)

# One score row per material formula, spread over four new columns.
score_columns = ['HHI', 'SR', 'AR', 'Warning']
df_results[score_columns] = df_results['material'].apply(
    lambda formula: pd.Series(find_hhi_score(formula))
)

scored_output = df_results[['material'] + score_columns]
scored_output.to_csv(PROCESSED_DATA_DIR / 'results_CrystaLLM_with_HHI.csv', index=False)

['Se' 'Sr']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Y'])
 'Se']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Y'])
 'Se']
['Ba' 'S']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Y'])
 'S']
['Ba' 'Se']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Y'])
 'S']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Y'])
 'S']
['In' 'S']
['Bi' 'S']
['S']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Y'])
 'S']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Y'])
 'Se']
['In' 'Se']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Y'])
 'S']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', '

In [9]:
# Try on SISSO results
from tf_chpvk_pv.config import PROCESSED_DATA_DIR

stable_csv_path = PROCESSED_DATA_DIR / 'stable_compositions.csv'
df_results = pd.read_csv(stable_csv_path)

# The formulas are stored in the unnamed first CSV column; expose them as 'material'.
df_results = df_results.rename(columns={'Unnamed: 0': 'material'})

# One score row per material formula, spread over four new columns.
score_columns = ['HHI', 'SR', 'AR', 'Warning']
df_results[score_columns] = df_results['material'].apply(
    lambda formula: pd.Series(find_hhi_score(formula))
)

scored_output = df_results[['material'] + score_columns]
scored_output.to_csv(PROCESSED_DATA_DIR / 'results_SISSO_with_HHI.csv', index=False)

['Ba' 'S']
['Ba' 'Se']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Y'])
 'S']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Y'])
 'S']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Y'])
 'S']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Y'])
 'Se']
['Sr' 'S']
['Se' 'Sr']
['Cd' 'Se']
['Cu' 'Se']
['Cu' 'Se']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Y'])
 'S']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Y'])
 'S']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Y'])
 'S']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Y'])
 'S']
[list(['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'E