In [21]:
import pandas as pd
import ast

# 1. Load the CSV (with mixed-type handling)
file_path = '../../all_materials_elastic.csv'

try:
    # low_memory=False parses the file in one pass so column dtypes are not
    # inferred chunk by chunk; the na_values entries are read as missing.
    df = pd.read_csv(
        file_path,
        low_memory=False,
        na_values=['NA', 'N/A', '--', '-', ''],
        encoding='utf-8'
    )
    print("✅ File loaded successfully!")
except Exception as e:
    print(f"❌ Error loading file: {e}")
    # Re-raise rather than call exit(): the run should stop at this cell with
    # the original traceback instead of ending the interpreter session.
    raise

# 2. Helper function: Safely parse a tuple from string
def try_parse_tuple(x):
    """Evaluate a cell that contains a Python literal; otherwise return it as is.

    Parameters
    ----------
    x : object
        Raw cell value. Only ``str`` values are passed to ``ast.literal_eval``.

    Returns
    -------
    object
        The evaluated literal (tuple, list, number, ...) when ``x`` is a string
        holding a valid Python literal; ``x`` itself in every other case.
    """
    # Values that are not text (NaN, numbers already typed by pandas) cannot
    # be parsed, so hand them back without going through the parser.
    if not isinstance(x, str):
        return x
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval may raise any of these on malformed input, e.g.
        # TypeError for "{[1]}" (a set with an unhashable member).
        return x

# 3. Step: run every cell of every column through try_parse_tuple;
# cells that are not Python literals come back unchanged.
for column_label in df.columns:
    parsed_cells = df[column_label].apply(try_parse_tuple)
    df[column_label] = parsed_cells

# 4. Step: Create new column names from the first element of the tuple
# Only the first non-null cell of each column is inspected; if it is a
# (name, value) pair, the column takes `name`, otherwise it keeps its label.
new_columns = {}
for col in df.columns:
    non_null_cells = df[col].dropna()
    # A column with no non-null cells has nothing to inspect; .iloc[0] on an
    # empty Series raises IndexError, so fall back to keeping the label.
    first_value = non_null_cells.iloc[0] if not non_null_cells.empty else None
    if isinstance(first_value, tuple) and len(first_value) == 2:
        new_col_name = first_value[0]  # First part of the tuple
    else:
        new_col_name = col
    new_columns[col] = new_col_name

df = df.rename(columns=new_columns)

# 5. Step: keep only the value part of every (name, value) cell
def _second_of_pair(cell):
    """Return the second item of a 2-tuple cell; pass any other cell through."""
    if isinstance(cell, tuple) and len(cell) == 2:
        return cell[1]
    return cell


for column_label in df.columns:
    df[column_label] = df[column_label].apply(_second_of_pair)

# Write the intermediate result, without the index column
output_path = 'cleaned_elastic_materials.csv'
df.to_csv(output_path, index=False)
print(f"\n💾 Cleaned data saved to '{output_path}'")

# Preview the first two rows
df.head(2)

✅ File loaded successfully!

💾 Cleaned data saved to 'cleaned_elastic_materials.csv'


Unnamed: 0,0,nsites,2,nelements,4,5,formula_pretty,formula_anonymous,chemsys,volume,...,26,27,young_modulus,universal_anisotropy,homogeneous_poisson,debye_temperature,32,fitting_method,34,fields_not_requested
0,"('builder_meta', EmmetMeta(emmet_version='0.84...",8,"('elements', [Element Si])",1,"('composition', Composition('Si8'))","('composition_reduced', Composition('Si1'))",Si,A,Si,168.508435,...,"('sound_velocity', SoundVelocity(transverse=35...","('thermal_conductivity', ThermalConductivity(c...",,2.66,0.318,429.566703,"('fitting_data', FittingData(deformations=[((1...",finite_difference,"('state', <Status.SUCCESS: 'successful'>)",[]
1,"('builder_meta', EmmetMeta(emmet_version='0.84...",8,"('elements', [Element Si])",1,"('composition', Composition('Si8'))","('composition_reduced', Composition('Si1'))",Si,A,Si,169.205562,...,"('sound_velocity', SoundVelocity(transverse=30...","('thermal_conductivity', ThermalConductivity(c...",,11.527,0.318,365.582737,"('fitting_data', FittingData(deformations=[((1...",finite_difference,"('state', <Status.FAILED: 'failed'>)",[]


In [22]:
# drop unnecessary columns
# 0-based positions of the columns that are carried forward.
columns_to_keep = [3, 4, 9, 10, 11, 14, 24, 25, 26, 27, 28, 31]

if max(columns_to_keep) >= len(df.columns):
    # Fail loudly: the following cells address columns by position, so
    # continuing with an unsliced frame would silently process wrong columns.
    raise IndexError("❌ One of the column indices exceeds the number of columns in the DataFrame.")

df = df.iloc[:, columns_to_keep]

In [23]:
# Keep only rows whose first column is at least 2, i.e. drop rows with
# fewer than two elements (rows where the comparison is False are removed).
has_two_or_more = df.iloc[:, 0] >= 2
df = df[has_two_or_more]

# Write the filtered frame to CSV, without the index column
output_path = 'cleaned_elastic_materials.csv'
df.to_csv(output_path, index=False)
print(f"\n💾 Cleaned data saved to '{output_path}'")

# Preview the first two rows
df.head(2)


💾 Cleaned data saved to 'cleaned_elastic_materials.csv'


Unnamed: 0,nelements,4,volume,density,density_atomic,14,24,25,26,27,young_modulus,debye_temperature
311,2,"('composition', Composition('B56 C14'))",528.652326,2.429832,7.552176,"('material_id', MPID(mp-530074))","('bulk_modulus', BulkModulus(voigt=224.763, re...","('shear_modulus', ShearModulus(voigt=192.086, ...","('sound_velocity', SoundVelocity(transverse=87...","('thermal_conductivity', ThermalConductivity(c...",,1463.831123
312,2,"('composition', Composition('Si4 O8'))",149.115572,2.676376,12.426298,"('material_id', MPID(mp-556788))","('bulk_modulus', BulkModulus(voigt=52.103, reu...","('shear_modulus', ShearModulus(voigt=45.771, r...","('sound_velocity', SoundVelocity(transverse=37...","('thermal_conductivity', ThermalConductivity(c...",,532.376522


In [24]:
import re

def extract_elements(comp_str):
    """List the distinct element symbols found in a ``Composition('...')`` string.

    Parameters
    ----------
    comp_str : object
        Cell text containing ``Composition('<formula>')``, e.g.
        ``"('composition', Composition('B56 C14'))"``. Missing values and
        anything that is not a string yield an empty list.

    Returns
    -------
    list of str
        Unique symbols in order of first appearance (``['B', 'C']`` for the
        example above); ``[]`` when no ``Composition('...')`` part is found.
    """
    # Covers NaN/None as well as any other non-text cell, which re.search
    # would reject with a TypeError.
    if not isinstance(comp_str, str):
        return []

    # Match the formula part inside Composition('...')
    match = re.search(r"Composition\('([^']+)'\)", comp_str)
    if match is None:
        return []

    # One uppercase letter plus an optional lowercase letter is one symbol
    # (e.g. "H6 F4" -> ["H", "F"]).
    symbols = re.findall(r'[A-Z][a-z]?', match.group(1))
    # dict.fromkeys removes duplicates while keeping a stable order, so the
    # result is reproducible from run to run (set ordering is not).
    return list(dict.fromkeys(symbols))

# The second column (position 1) holds the composition strings; store the
# symbols extracted from each one in a new 'elements' column.
composition_cells = df.iloc[:, 1]
df['elements'] = composition_cells.apply(extract_elements)

# Preview the first two rows
df.head(2)

Unnamed: 0,nelements,4,volume,density,density_atomic,14,24,25,26,27,young_modulus,debye_temperature,elements
311,2,"('composition', Composition('B56 C14'))",528.652326,2.429832,7.552176,"('material_id', MPID(mp-530074))","('bulk_modulus', BulkModulus(voigt=224.763, re...","('shear_modulus', ShearModulus(voigt=192.086, ...","('sound_velocity', SoundVelocity(transverse=87...","('thermal_conductivity', ThermalConductivity(c...",,1463.831123,"[C, B]"
312,2,"('composition', Composition('Si4 O8'))",149.115572,2.676376,12.426298,"('material_id', MPID(mp-556788))","('bulk_modulus', BulkModulus(voigt=52.103, reu...","('shear_modulus', ShearModulus(voigt=45.771, r...","('sound_velocity', SoundVelocity(transverse=37...","('thermal_conductivity', ThermalConductivity(c...",,532.376522,"[Si, O]"


In [26]:
# Elements this notebook counts on the "metal" side of a compound
# (simplified classification; note that Si is part of this list).
CERAMIC_ELEMENTS = [
    "Si", "Al", "Mg", "Zr", "Ti", "Ca", "Y", "Hf", "Fe",
    "Na", "K", "Ba", "Sr", "Li", "Be", "Mn", "V", "Cr",
    "Nb", "Mo", "W", "Re", "Sc", "La", "Ce", "Th", "U",
]

# Basic classification (simplified)
NON_METALS = {"O", "N", "C", "B"}

# Every element a composition may contain: the four non-metals first,
# followed by the list above in the same order.
ALL_CERAMIC_ELEMENTS = ["O", "N", "C", "B"] + CERAMIC_ELEMENTS

# Set form of the metal list with any non-metal symbols removed.
METALS = set(CERAMIC_ELEMENTS).difference(NON_METALS)

# Check if composition has at least one metal and one non-metal
def has_metal_and_nonmetal(elements):
    """Return True when `elements` has a member of METALS and a member of NON_METALS."""
    found_metal = any(symbol in METALS for symbol in elements)
    if not found_metal:
        return False
    return any(symbol in NON_METALS for symbol in elements)

def all_elements_valid(elements):
    """Return True when every entry of `elements` is in ALL_CERAMIC_ELEMENTS.

    An empty `elements` is accepted (nothing violates the condition).
    """
    for symbol in elements:
        if symbol not in ALL_CERAMIC_ELEMENTS:
            return False
    return True

# The 13th column (position 12) is the 'elements' list column.
# First keep rows made up solely of allowed elements ...
only_allowed = df.iloc[:, 12].apply(all_elements_valid)
df = df[only_allowed]
# ... then keep rows that combine a metal with a non-metal.
metal_plus_nonmetal = df.iloc[:, 12].apply(has_metal_and_nonmetal)
df = df[metal_plus_nonmetal]

# Write the filtered frame to CSV, without the index column
df.to_csv("filtered_cleaned_elastic_materials.csv", index=False)
print("✅ Saved as 'filtered_cleaned_elastic_materials.csv' with only ceramic compositions.")

✅ Saved as 'filtered_cleaned_elastic_materials.csv' with only ceramic compositions.


In [27]:
# Extract the formula from the composition string
def extract_formula(comp_str):
    """Return the formula text inside ``Composition('...')``.

    Parameters
    ----------
    comp_str : object
        Cell text such as ``"('composition', Composition('Si4 O8'))"``.

    Returns
    -------
    str or None
        The quoted formula (``'Si4 O8'`` for the example), or ``None`` when
        the cell is not a string or holds no ``Composition('...')`` part.
    """
    # re.search raises TypeError on NaN/None and other non-text cells.
    if not isinstance(comp_str, str):
        return None
    match = re.search(r"Composition\('([^']+)'\)", comp_str)
    return match.group(1) if match else None

# Overwrite the second column (position 1) with the bare formula text
formula_cells = df.iloc[:, 1].apply(extract_formula)
df.iloc[:, 1] = formula_cells

# Preview the first two rows
df.head(2)

Unnamed: 0,nelements,4,volume,density,density_atomic,14,24,25,26,27,young_modulus,debye_temperature,elements
312,2,Si4 O8,149.115572,2.676376,12.426298,"('material_id', MPID(mp-556788))","('bulk_modulus', BulkModulus(voigt=52.103, reu...","('shear_modulus', ShearModulus(voigt=45.771, r...","('sound_velocity', SoundVelocity(transverse=37...","('thermal_conductivity', ThermalConductivity(c...",,532.376522,"[Si, O]"
313,2,Al4 C3,79.802926,2.995483,11.400418,"('material_id', MPID(mp-632442))","('bulk_modulus', BulkModulus(voigt=141.214, re...","('shear_modulus', ShearModulus(voigt=96.492, r...","('sound_velocity', SoundVelocity(transverse=53...","('thermal_conductivity', ThermalConductivity(c...",,787.148256,"[C, Al]"


In [28]:
# Extract material ID from the string
def extract_material_id(id_str):
    """Return the identifier inside ``MPID(...)``.

    Parameters
    ----------
    id_str : object
        Cell text such as ``"('material_id', MPID(mp-556788))"``.

    Returns
    -------
    str or None
        The text between the parentheses (``'mp-556788'`` for the example),
        or ``None`` when the cell is not a string or holds no ``MPID(...)``.
    """
    # re.search raises TypeError on NaN/None and other non-text cells.
    if not isinstance(id_str, str):
        return None
    match = re.search(r"MPID\(([^)]+)\)", id_str)
    return match.group(1) if match else None

# Overwrite the sixth column (position 5) with the bare material id
material_ids = df.iloc[:, 5].apply(extract_material_id)
df.iloc[:, 5] = material_ids

# Write the frame to CSV, without the index column
df.to_csv("filtered_cleaned_elastic_materials.csv", index=False)
print("✅ Saved as 'filtered_cleaned_elastic_materials.csv'.")

# Preview the first two rows
df.head(2)

✅ Saved as 'filtered_cleaned_elastic_materials.csv'.


Unnamed: 0,nelements,4,volume,density,density_atomic,14,24,25,26,27,young_modulus,debye_temperature,elements
312,2,Si4 O8,149.115572,2.676376,12.426298,mp-556788,"('bulk_modulus', BulkModulus(voigt=52.103, reu...","('shear_modulus', ShearModulus(voigt=45.771, r...","('sound_velocity', SoundVelocity(transverse=37...","('thermal_conductivity', ThermalConductivity(c...",,532.376522,"[Si, O]"
313,2,Al4 C3,79.802926,2.995483,11.400418,mp-632442,"('bulk_modulus', BulkModulus(voigt=141.214, re...","('shear_modulus', ShearModulus(voigt=96.492, r...","('sound_velocity', SoundVelocity(transverse=53...","('thermal_conductivity', ThermalConductivity(c...",,787.148256,"[C, Al]"


In [29]:
# drop unnecessary columns and save
# Positions 0, 10 and 12 are the 'nelements', 'young_modulus' and 'elements'
# columns at this stage. Rebind the result instead of using inplace=True so
# the statement reads as "new frame from old frame".
columns_to_drop = df.columns[[0, 10, 12]]
df = df.drop(columns=columns_to_drop)

# Save to CSV
df.to_csv("filtered_cleaned_elastic_materials.csv", index=False)
print("✅ Saved as 'filtered_cleaned_elastic_materials.csv' with only ceramic compositions.")

# show data structure
df.head(2)

✅ Saved as 'filtered_cleaned_elastic_materials.csv' with only ceramic compositions.


Unnamed: 0,4,volume,density,density_atomic,14,24,25,26,27,debye_temperature
312,Si4 O8,149.115572,2.676376,12.426298,mp-556788,"('bulk_modulus', BulkModulus(voigt=52.103, reu...","('shear_modulus', ShearModulus(voigt=45.771, r...","('sound_velocity', SoundVelocity(transverse=37...","('thermal_conductivity', ThermalConductivity(c...",532.376522
313,Al4 C3,79.802926,2.995483,11.400418,mp-632442,"('bulk_modulus', BulkModulus(voigt=141.214, re...","('shear_modulus', ShearModulus(voigt=96.492, r...","('sound_velocity', SoundVelocity(transverse=53...","('thermal_conductivity', ThermalConductivity(c...",787.148256


In [30]:
# Columns at positions 5-8 still hold repr strings such as
# "('bulk_modulus', BulkModulus(voigt=..., ...))". Replace each with one
# numeric field taken from that text.

# Signed decimal with optional exponent. A bare [\d.]+ would turn negative
# values into NaN (no match) and cut "1e-05" down to 1.0.
_NUMBER_PATTERN = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"

# (column position, field to read from the repr text)
_FIELD_BY_POSITION = [
    (5, "vrh"),           # BulkModulus
    (6, "vrh"),           # ShearModulus
    (7, "snyder_total"),  # SoundVelocity
    (8, "clarke"),        # ThermalConductivity
]

for _position, _field in _FIELD_BY_POSITION:
    _label = df.columns[_position]
    df[_label] = (
        df[_label]
        .astype(str)
        # expand=False returns a Series for the single capture group;
        # cells without the field become NaN.
        .str.extract(rf"{_field}=({_NUMBER_PATTERN})", expand=False)
        .astype(float)
    )

# Save to CSV
df.to_csv("filtered_cleaned_elastic_materials.csv", index=False)
print("✅ Saved as 'filtered_cleaned_elastic_materials.csv' with only ceramic compositions.")

# show data structure
df.head(2)

✅ Saved as 'filtered_cleaned_elastic_materials.csv' with only ceramic compositions.


Unnamed: 0,4,volume,density,density_atomic,14,24,25,26,27,debye_temperature
312,Si4 O8,149.115572,2.676376,12.426298,mp-556788,42.63,38.011,32.029102,1.283116,532.376522
313,Al4 C3,79.802926,2.995483,11.400418,mp-632442,137.887,86.229,133.36185,2.004553,787.148256


In [31]:
# update column names
# New labels for the first ten columns, in positional order.
final_column_names = [
    "Formula",
    "Volume",
    "Density",
    "Atomic Density",
    "IDs",
    "Bulk Modulus",
    "Shear Modulus",
    "Sound Velocity",
    "Thermal Conductivity",
    "Debye Temperature",
]

# Build a new label list and assign it, rather than writing into
# df.columns.values: that mutates the Index's backing array without pandas
# knowing, so label lookups can keep resolving against the old names.
updated_labels = df.columns.tolist()
updated_labels[:len(final_column_names)] = final_column_names
df.columns = updated_labels

# Save to CSV
df.to_csv("filtered_cleaned_elastic_materials.csv", index=False)
print("✅ Saved as 'filtered_cleaned_elastic_materials.csv'.")

# show data structure
df.head(2)

✅ Saved as 'filtered_cleaned_elastic_materials.csv'.


Unnamed: 0,Formula,Volume,Density,Atomic Density,IDs,Bulk Modulus,Shear Modulus,Sound Velocity,Thermal Conductivity,Debye Temperature
312,Si4 O8,149.115572,2.676376,12.426298,mp-556788,42.63,38.011,32.029102,1.283116,532.376522
313,Al4 C3,79.802926,2.995483,11.400418,mp-632442,137.887,86.229,133.36185,2.004553,787.148256
