In [14]:
import pandas as pd
import ast

# 1. Load the CSV (with mixed-type handling)
file_path = '../../all_materials_synthesis.csv'

try:
    df = pd.read_csv(
        file_path,
        low_memory=False,  # parse the file in one pass so column types are inferred consistently
        na_values=['NA', 'N/A', '--', '-', ''],  # extra markers to read as missing values
        encoding='utf-8'
    )
except Exception as e:
    # Print the friendly message, then re-raise instead of calling exit():
    # inside a notebook exit() asks the kernel to shut down rather than
    # cleanly aborting the run, and it hides the traceback needed to
    # diagnose the failure.
    print(f"❌ Error loading file: {e}")
    raise
else:
    print("✅ File loaded successfully!")

# 2. Helper function: Safely parse a tuple from string
def try_parse_tuple(x):
    """Parse ``x`` as a Python literal when possible, otherwise return it unchanged.

    Args:
        x: A cell value. Only strings are parsed; any other value (NaN,
            numbers, None, ...) is returned as-is.

    Returns:
        The object produced by ``ast.literal_eval`` (tuple, list, number, ...)
        when ``x`` is a string holding a valid literal; otherwise ``x`` itself.
    """
    if not isinstance(x, str):
        # Non-string cells cannot hold a serialized literal.
        return x
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval documents all of these for malformed input (for
        # example, TypeError for an unhashable dict key); a cell that cannot
        # be parsed is simply kept as its original text.
        return x

# 3. Step: Parse each cell into a tuple if possible
for col in df.columns:
    df[col] = df[col].apply(try_parse_tuple)

# 4. Step: Create new column names from the first element of the tuple
new_columns = {}
for col in df.columns:
    non_null_values = df[col].dropna()
    # A column with no non-missing values has no first entry to inspect;
    # indexing it with .iloc[0] would raise IndexError, so keep its name.
    first_value = non_null_values.iloc[0] if not non_null_values.empty else None
    if isinstance(first_value, tuple) and len(first_value) == 2:
        new_col_name = first_value[0]  # First part of the tuple
    else:
        new_col_name = col
    new_columns[col] = new_col_name

df = df.rename(columns=new_columns)

# 5. Step: Replace each cell with just the second value
for col in df.columns:
    df[col] = df[col].apply(lambda x: x[1] if isinstance(x, tuple) and len(x) == 2 else x)

output_path = 'cleaned_synthesis_materials.csv'
df.to_csv(output_path, index=False)
print(f"\n💾 Cleaned data saved to '{output_path}'")

# check data structure
df.head(2)

✅ File loaded successfully!

💾 Cleaned data saved to 'cleaned_synthesis_materials.csv'


Unnamed: 0,doi,paragraph_string,2,reaction_string,4,5,targets_formula,precursors_formula,targets_formula_s,precursors_formula_s,10,11,search_score,highlights,fields_not_requested
0,10.1149/2.0691610jes,The solid-state synthesis was used to prepare ...,"('synthesis_type', <SynthesisTypeEnum.solid_st...",0.98 BaCO3 + 0.01 La2O3 + 1 SnO2 == 1 Ba0.98La...,"('reaction', ReactionFormula(left_side=[Formul...","('target', ExtractedMaterial(material_string='...",[Ba0.98 La0.02 Sn1 O3],,[Ba0.98La0.02Sn1O3],"[La2O3, SnO2, BaCO3]","('precursors', [ExtractedMaterial(material_str...","('operations', [Operation(type=<OperationTypeE...",,,"[precursors_formula, search_score, highlights]"
1,10.1039/C5CP01095K,"Synthesis of SrAl12O19, Sr4Al14O25, SrAl2O4, S...","('synthesis_type', <SynthesisTypeEnum.solid_st...",1 Al2O3 + 1 SrCO3 == 1 SrAl2O4 + 1 CO2,"('reaction', ReactionFormula(left_side=[Formul...","('target', ExtractedMaterial(material_string='...",[Sr1 Al2 O4],,[SrAl2O4],"[Al2O3, SrCO3]","('precursors', [ExtractedMaterial(material_str...","('operations', [Operation(type=<OperationTypeE...",,,"[precursors_formula, search_score, highlights]"


In [15]:
# Remove unneeded columns. They are chosen by position, so the labels at
# those positions are looked up first and then dropped.
unneeded_positions = [1, 2, 3, 4, 5, 7, 9, 10, 12, 13, 14]
unneeded_labels = df.columns[unneeded_positions]
df = df.drop(columns=unneeded_labels)

# Preview the first two rows
df.head(2)

Unnamed: 0,doi,targets_formula,targets_formula_s,11
0,10.1149/2.0691610jes,[Ba0.98 La0.02 Sn1 O3],[Ba0.98La0.02Sn1O3],"('operations', [Operation(type=<OperationTypeE..."
1,10.1039/C5CP01095K,[Sr1 Al2 O4],[SrAl2O4],"('operations', [Operation(type=<OperationTypeE..."


In [16]:
import re

# Access the 4th column (positional index 3); extract_heating_operations is
# applied to every value of this column further down in the cell.
operations_column = df.iloc[:, 3]

# Matches one serialized Operation(...) entry and captures, in order: the
# operation type name, its token, and the raw text inside the
# heating_temperature, heating_time and heating_atmosphere brackets.
_HEATING_OP_PATTERN = re.compile(
    r"Operation\(type=<[^:]+: '([^']+)'>, token='([^']+)', conditions=Conditions\(.*?"
    r"heating_temperature=\[(.*?)\], heating_time=\[(.*?)\], heating_atmosphere=\[(.*?)\].*?\)\)"
)

# Zero-width split point in front of every serialized operation.
_OPERATION_START = re.compile(r"(?=Operation\(type=<)")

# Integer or decimal number; the optional sign applies to both forms.
_NUMBER_PATTERN = re.compile(r"[-+]?(?:\d*\.\d+|\d+)")


def extract_heating_operations(op_string):
    """Summarize every heating operation found in a serialized operations string.

    Args:
        op_string: Text containing a bracketed list of ``Operation(...)``
            entries. Any non-string value (None, NaN, ...) or empty string
            yields an empty result.

    Returns:
        A list with one summary string per operation whose type is
        ``'HeatingOperation'``, formatted as
        ``"<type>:<token>,T=[...],t=[...],atm=[...]"`` where ``T`` and ``t``
        hold the numbers found in the temperature and time fields (as
        strings) and ``atm`` the single-quoted values of the atmosphere field.
        Returns ``[]`` when no bracketed list is present.
    """
    if not isinstance(op_string, str) or not op_string:
        return []

    start_idx = op_string.find('[')
    end_idx = op_string.rfind(']')
    if start_idx == -1 or end_idx == -1:
        return []
    value_str = op_string[start_idx:end_idx + 1]

    heating_ops = []
    # Match each operation inside its own chunk. Searching the whole string at
    # once lets the lazy ".*?" run past an operation that has no bracketed
    # heating fields and pair its type with the NEXT operation's values,
    # which silently loses that following heating operation.
    for chunk in _OPERATION_START.split(value_str):
        match = _HEATING_OP_PATTERN.match(chunk)
        if match is None:
            continue

        op_type, token, temp_str, time_str, atmosphere_str = match.groups()
        if op_type != 'HeatingOperation':
            continue  # skip non-heating operations

        temps = _NUMBER_PATTERN.findall(temp_str)
        times = _NUMBER_PATTERN.findall(time_str)
        atmospheres = re.findall(r"'(.*?)'", atmosphere_str)
        heating_ops.append(f"{op_type}:{token},T={temps},t={times},atm={atmospheres}")

    return heating_ops

# Apply extraction
df['parsed_heating_operations'] = operations_column.apply(extract_heating_operations)

# Expand into columns
# The widest row decides how many heating_operation_N columns are created.
# With no rows at all, max() yields NaN and range() would raise TypeError,
# so an empty frame is treated as having zero operations.
ops_per_row = df['parsed_heating_operations'].apply(len)
max_heating_ops = int(ops_per_row.max()) if len(ops_per_row) else 0

for i in range(max_heating_ops):
    # The lambda is consumed by apply() within the same iteration, so it
    # always sees the current value of i.
    df[f'heating_operation_{i+1}'] = df['parsed_heating_operations'].apply(
        lambda ops: ops[i] if i < len(ops) else None
    )

# Drop intermediate column
df.drop(columns=['parsed_heating_operations'], inplace=True)

# Save
df.to_csv('heating_operations_only.csv', index=False)
print("🔥 Saved with only heating operations split into columns.")

# check data structure
df.head(2)

🔥 Saved with only heating operations split into columns.


Unnamed: 0,doi,targets_formula,targets_formula_s,11,heating_operation_1,heating_operation_2,heating_operation_3,heating_operation_4,heating_operation_5,heating_operation_6,heating_operation_7,heating_operation_8,heating_operation_9,heating_operation_10
0,10.1149/2.0691610jes,[Ba0.98 La0.02 Sn1 O3],[Ba0.98La0.02Sn1O3],"('operations', [Operation(type=<OperationTypeE...","HeatingOperation:treated,T=['1300.0', '1300.0'...",,,,,,,,,
1,10.1039/C5CP01095K,[Sr1 Al2 O4],[SrAl2O4],"('operations', [Operation(type=<OperationTypeE...","HeatingOperation:precalcined,T=['1000.0', '100...",,,,,,,,,


In [17]:
# Drop the columns found at positions 2, 3 and 5 through 13
surplus_positions = [2, 3, *range(5, 14)]
df = df.drop(columns=df.columns[surplus_positions])

# Write the trimmed frame to disk
heating_csv = 'heating_operations_only.csv'
df.to_csv(heating_csv, index=False)
print("Saved with only heating operations without unnecessary columns.")

# Preview the first two rows
df.head(2)

Saved with only heating operations without unnecessary columns.


Unnamed: 0,doi,targets_formula,heating_operation_1
0,10.1149/2.0691610jes,[Ba0.98 La0.02 Sn1 O3],"HeatingOperation:treated,T=['1300.0', '1300.0'..."
1,10.1039/C5CP01095K,[Sr1 Al2 O4],"HeatingOperation:precalcined,T=['1000.0', '100..."


In [18]:
# Select the third column (positional index 2); extract_first_values is
# applied to every value of this column further down in the cell.
operations_column = df.iloc[:, 2]

# Pull the leading entry out of each of the T=[...], t=[...] and atm=[...] fields
def extract_first_values(op_string):
    """Return the first quoted value of the T, t and atm fields of a summary string.

    Args:
        op_string: A summary string containing ``T=[...]``, ``t=[...]`` and
            ``atm=[...]`` sections, or a missing value.

    Returns:
        A 3-tuple ``(T, t, atm)``. Each entry is the first single-quoted value
        inside the corresponding brackets, or ``None`` when the field is
        absent or holds no quoted value. A missing ``op_string`` gives
        ``(None, None, None)``.
    """
    if pd.isna(op_string):
        return None, None, None

    field_patterns = (r"T=\[(.*?)\]", r"t=\[(.*?)\]", r"atm=\[(.*?)\]")

    leading_values = []
    for field_pattern in field_patterns:
        found = re.search(field_pattern, op_string)
        if found is None:
            leading_values.append(None)
            continue
        # Entries are single-quoted inside the brackets; keep only the first.
        quoted = re.findall(r"'(.*?)'", found.group(1))
        leading_values.append(quoted[0] if quoted else None)

    return tuple(leading_values)

# Apply the extraction
# Each row's (T, t, atm) tuple is wrapped in a Series so the result expands
# into three columns, which are then stored under the names below.
# NOTE(review): the third value is taken from the atm=[...] field by
# extract_first_values, so the 'Pressure' column holds whatever that field
# contains — confirm that this column name is intended.
df[['Temperature', 'Time', 'Pressure']] = operations_column.apply(
    lambda x: pd.Series(extract_first_values(x))
)

# Save or preview
df.to_csv('heating_extracted_first_values.csv', index=False)
print("✅ Extracted first values of T, t, atm.")

# check data structure
df.head(2)

✅ Extracted first values of T, t, atm.


Unnamed: 0,doi,targets_formula,heating_operation_1,Temperature,Time,Pressure
0,10.1149/2.0691610jes,[Ba0.98 La0.02 Sn1 O3],"HeatingOperation:treated,T=['1300.0', '1300.0'...",1300.0,6.0,
1,10.1039/C5CP01095K,[Sr1 Al2 O4],"HeatingOperation:precalcined,T=['1000.0', '100...",1000.0,,air


In [19]:
# Keep only rows whose value at column position 3 is neither missing nor blank
column_at_3 = df.iloc[:, 3]
is_present = column_at_3.notna()
is_nonblank = column_at_3.astype(str).str.strip() != ''
df = df[is_present & is_nonblank]

# Remove the synthesis column (position 2)
df.drop(df.columns[2], axis=1, inplace=True)

# Preview the first two rows
df.head(2)

Unnamed: 0,doi,targets_formula,Temperature,Time,Pressure
0,10.1149/2.0691610jes,[Ba0.98 La0.02 Sn1 O3],1300.0,6.0,
1,10.1039/C5CP01095K,[Sr1 Al2 O4],1000.0,,air


In [20]:
# Give the column at position 1 the name 'Formula'
formula_source_label = df.columns[1]
df.rename(columns={formula_source_label: 'Formula'}, inplace=True)

# Write the result to the clean data file
clean_csv = 'clean_synthesis_data.csv'
df.to_csv(clean_csv, index=False)
print("✅ Saved the clean sythesis data!")


# Preview the first two rows
df.head(2)

✅ Saved the clean sythesis data!


Unnamed: 0,doi,Formula,Temperature,Time,Pressure
0,10.1149/2.0691610jes,[Ba0.98 La0.02 Sn1 O3],1300.0,6.0,
1,10.1039/C5CP01095K,[Sr1 Al2 O4],1000.0,,air


In [21]:
# Clean up formula: a one-element list is replaced by its only element;
# every other value is left as it is.
def unwrap_single_item(cell):
    """Return the sole element of a one-item list, otherwise the value unchanged."""
    if isinstance(cell, list) and len(cell) == 1:
        return cell[0]
    return cell


df.iloc[:, 1] = df.iloc[:, 1].apply(unwrap_single_item)


# Preview the first two rows
df.head(2)

Unnamed: 0,doi,Formula,Temperature,Time,Pressure
0,10.1149/2.0691610jes,Ba0.98 La0.02 Sn1 O3,1300.0,6.0,
1,10.1039/C5CP01095K,Sr1 Al2 O4,1000.0,,air


In [22]:
def extract_elements(value):
    """Return the element-like symbols found in a formula.

    Args:
        value: A formula string, or a list whose first item is a formula
            string. Only the first item of a list is inspected.

    Returns:
        A list of every match of one uppercase letter optionally followed by
        one lowercase letter, in order of appearance (repeats are kept).
        Returns ``[]`` for None, an empty list, or any value whose formula is
        not a string.
    """
    if isinstance(value, list):
        # Only the first entry is used; an empty list has nothing to inspect.
        value = value[0] if value else None

    if not isinstance(value, str):
        # Covers None/NaN as well as a list whose first item is not text,
        # which would otherwise make re.findall raise TypeError.
        return []

    return re.findall(r'[A-Z][a-z]?', value)

# Apply safely to the second column (positional index 1) and store the
# resulting symbol lists in a new 'Elements' column.
df['Elements'] = df.iloc[:, 1].apply(extract_elements)

# check data structure
df.head(2)

Unnamed: 0,doi,Formula,Temperature,Time,Pressure,Elements
0,10.1149/2.0691610jes,Ba0.98 La0.02 Sn1 O3,1300.0,6.0,,"[Ba, La, Sn, O]"
1,10.1039/C5CP01095K,Sr1 Al2 O4,1000.0,,air,"[Sr, Al, O]"


In [23]:
# Symbols counted as the metal side of a composition
CERAMIC_ELEMENTS = [
    "Si", "Al", "Mg", "Zr", "Ti", "Ca", "Y", "Hf",
    "Fe", "Na", "K", "Ba", "Sr", "Li", "Be", "Mn", "V", "Cr",
    "Nb", "Mo", "W", "Re", "Sc", "La", "Ce", "Th", "U",
]

# Every accepted symbol: the four non-metals followed by the list above
ALL_CERAMIC_ELEMENTS = ["O", "N", "C", "B"] + CERAMIC_ELEMENTS

# Basic classification (simplified)
NON_METALS = {"O", "N", "C", "B"}
METALS = set(CERAMIC_ELEMENTS).difference(NON_METALS)

# Check if composition has at least one metal and one non-metal
def has_metal_and_nonmetal(elements):
    """Return True when ``elements`` contains a symbol from METALS and one from NON_METALS."""
    # Without a metal the answer is already known; skip the second scan.
    if not any(symbol in METALS for symbol in elements):
        return False
    return any(symbol in NON_METALS for symbol in elements)

def all_elements_valid(elements):
    """Return True when every symbol in ``elements`` appears in ALL_CERAMIC_ELEMENTS.

    An empty ``elements`` gives True, because no symbol fails the check.
    """
    for symbol in elements:
        if symbol not in ALL_CERAMIC_ELEMENTS:
            return False
    return True

# Apply to column 6 (positional index 5): run the two row filters one after
# the other, each on the rows that survived the previous one.
for keep_row in (all_elements_valid, has_metal_and_nonmetal):
    df = df[df.iloc[:, 5].apply(keep_row)]

# Write the filtered frame to the clean data file
clean_csv = 'clean_synthesis_data.csv'
df.to_csv(clean_csv, index=False)
print("✅ Saved the clean sythesis data!")

# Preview the first two rows
df.head(2)

✅ Saved the clean sythesis data!


Unnamed: 0,doi,Formula,Temperature,Time,Pressure,Elements
1,10.1039/C5CP01095K,Sr1 Al2 O4,1000.0,,air,"[Sr, Al, O]"
6,10.1149/2.051201jes,Na2 Ti3 O7,1000.0,24.0,,"[Na, Ti, O]"


In [24]:
# Ensure column 3 (index 2) is numeric; values that cannot be converted become NaN
temperature_label = df.columns[2]
df[temperature_label] = pd.to_numeric(df[temperature_label], errors='coerce')

# Remove outliers: keep rows at or below 7000.0. NaN fails the comparison,
# so rows without a numeric value are removed here as well.
within_limit = df[temperature_label] <= 7000.0
df = df[within_limit]

# Drop column 6 (index 5)
df.drop(df.columns[5], axis=1, inplace=True)

# Write the result to the clean data file
clean_csv = 'clean_synthesis_data.csv'
df.to_csv(clean_csv, index=False)
print("✅ Saved the clean sythesis data!")

# Preview the first two rows
df.head(2)

✅ Saved the clean sythesis data!


Unnamed: 0,doi,Formula,Temperature,Time,Pressure
1,10.1039/C5CP01095K,Sr1 Al2 O4,1000.0,,air
6,10.1149/2.051201jes,Na2 Ti3 O7,1000.0,24.0,


In [25]:
# Show up to the first 291 rows of the cleaned frame
df.head(291)

Unnamed: 0,doi,Formula,Temperature,Time,Pressure
1,10.1039/C5CP01095K,Sr1 Al2 O4,1000.0,,air
6,10.1149/2.051201jes,Na2 Ti3 O7,1000.0,24.0,
7,10.1149/1.3479763,Ba1 Si2 N2 O2,1200.0,6.0,N2(95%)-H2(5
11,10.1149/1.3527983,Li4 Ti5 O12,235.0,16.0,
12,10.1149/2.0681512jes,Ce0.6 Mn0.3 Fe0.1 O2,1273.0,3.0,
...,...,...,...,...,...
1277,10.1016/j.jlumin.2010.06.012,"[Mg1.4 Mn0.6 B2 O5, Mg2 B2 O5]",900.0,6.0,
1288,10.1016/j.jeurceramsoc.2007.01.018,Ca1 Mg0.75 Al0.5 Si1.75 O6,900.0,1.0,
1292,10.1016/j.jssc.2005.04.022,Ba1 Zr1 O3,1773.0,1.0,
1294,10.1016/j.matlet.2006.05.038,Mg1 Ti2 O4,1300.0,1.0,
