In [30]:
import pandas as pd

# Read the cleaned thermodynamic, elastic and synthesis tables from disk
thermo_df = pd.read_csv('../Cleaners/Thermo_Cleaner/filtered_cleaned_thermo_materials.csv')
elastic_df = pd.read_csv('../Cleaners/Elastic_Cleaner/filtered_cleaned_elastic_materials.csv')
synthesis_df = pd.read_csv('../Cleaners/Synthesis_Cleaner/clean_synthesis_data.csv')

# Columns (other than the 'IDs' join key) that exist in both the thermo and elastic tables
shared_columns = [name for name in thermo_df.columns if name in elastic_df.columns and name != 'IDs']

# Inner join on 'IDs'; overlapping columns come back suffixed '_file1' (thermo) / '_file2' (elastic)
no_synth_merged_df = pd.merge(thermo_df, elastic_df, on='IDs', how='inner', suffixes=('_file1', '_file2'))

# Keep only the thermo ('_file1') copy of every overlapping column and restore its original name
no_synth_merged_df = (
    no_synth_merged_df
    .drop(columns=[f'{name}_file2' for name in shared_columns])
    .rename(columns={f'{name}_file1': name for name in shared_columns})
)

# Persist the thermo + elastic merge
no_synth_merged_df.to_csv('merged_no_synthesis_dataset.csv', index=False)

print("Cleaned Merged DataFrame:")
no_synth_merged_df.head()


Cleaned Merged DataFrame:


Unnamed: 0,Formula,Volume,Density,IDs,Uncorrected EPA,EPA,Formation EPA,Equilibrium Reaction EPA,Atomic Density,Bulk Modulus,Shear Modulus,Sound Velocity,Thermal Conductivity,Debye Temperature
0,Ti6 O10,177.344754,4.187248,mp-1147,-9.003853,-9.433228,-3.379943,-0.002179,11.067622,170.755,76.573,81.907435,1.669621,638.064803
1,Nb6 C5,136.649587,7.503639,mp-2760,-10.249248,-10.249248,-0.544597,-0.163298,12.421358,235.464,160.992,183.642876,1.621313,658.903137
2,Sr4 N4,175.124472,3.854518,mp-29973,-5.536622,-5.717122,-0.704128,-0.003838,21.874482,39.109,25.771,22.546062,0.622534,304.774562
3,Ti4 O8,146.706282,3.61594,mp-554278,-8.974109,-9.432109,-3.501638,-0.497244,12.21829,180.571,60.573,81.270062,1.521814,594.584246
4,Si2 O4,94.261854,2.116918,mp-546794,-7.916264,-8.374264,-3.267184,-3.267184,15.598843,35.06,33.443,39.054682,1.149637,517.729991


In [31]:
# Columns of synthesis_df that already exist in the merged frame (the 'Formula' join key is kept)
duplicate_columns = [
    name
    for name in synthesis_df.columns
    if name != 'Formula' and name in no_synth_merged_df.columns
]

# Remove those overlapping columns from the synthesis table so the merge adds no suffixed copies
synthesis_df_cleaned = synthesis_df.drop(columns=duplicate_columns)

# Inner join on 'Formula': one output row per matching (material, synthesis record) pair
final_merged_df = no_synth_merged_df.merge(synthesis_df_cleaned, how='inner', on='Formula')

# Persist the three-way merge
final_merged_df.to_csv('merged_synthesis_dataset.csv', index=False)

print("Final merged DataFrame without duplicated columns:")
final_merged_df.head()


Final merged DataFrame without duplicated columns:


Unnamed: 0,Formula,Volume,Density,IDs,Uncorrected EPA,EPA,Formation EPA,Equilibrium Reaction EPA,Atomic Density,Bulk Modulus,Shear Modulus,Sound Velocity,Thermal Conductivity,Debye Temperature,doi,Temperature,Time,Pressure
0,Th1 Ti2 O6,118.5591,5.935302,mp-5274,-9.209376,-9.667376,-3.79042,-6.4e-05,13.159042,150.848,62.689,67.504615,1.136242,458.560772,10.1016/S0022-3115(03)00186-7,1350.0,100.0,air
1,Th1 Ti2 O6,118.5591,5.935302,mp-5274,-9.209376,-9.667376,-3.79042,-6.4e-05,13.159042,150.848,62.689,67.504615,1.136242,458.560772,10.1016/s0022-3115(03)00186-7,1350.0,100.0,air
2,Sc2 Si2 O7,130.079601,3.294518,mp-5594,-8.456747,-8.893929,-3.607447,-0.021378,11.807528,160.151,72.463,97.745883,1.752712,684.691917,10.1016/j.jeurceramsoc.2018.03.010,1600.0,4.0,air
3,Sc2 Si2 O7,130.079601,3.294518,mp-5594,-8.456747,-8.893929,-3.607447,-0.021378,11.807528,160.151,72.463,97.745883,1.752712,684.691917,10.1016/j.optmat.2011.12.007,1400.0,5.0,
4,Sr3 Ti2 O7,159.282547,4.905953,mp-3349,-7.717111,-8.117861,-3.493262,-0.004,13.254011,139.441,83.293,88.560831,1.396738,575.129777,10.1016/j.ijhydene.2005.10.005,1100.0,30.0,


In [32]:
import re

# Element symbols tracked as per-formula coefficient columns.
# dict.fromkeys removes any repeated symbol while keeping first-seen order.
CERAMIC_ELEMENTS = list(dict.fromkeys([
    "O", "N", "C", "B", "Si", "Al", "Mg", "Zr", "Ti", "Ca", "Y", "Hf",
    "Fe", "Na", "K", "Ba", "Sr", "Li", "Be", "Mn", "V", "Cr",
    "Nb", "Mo", "W", "Re", "Sc", "La", "Ce", "Th", "U"
]))

# Create a zero-filled column for every tracked element the frame does not have yet
missing_element_columns = [
    symbol for symbol in CERAMIC_ELEMENTS if symbol not in no_synth_merged_df.columns
]
for symbol in missing_element_columns:
    no_synth_merged_df[symbol] = 0

# Function to parse formulas like "Sr1 Al2 O4"
def parse_formula(formula):
    """Parse a formula string such as "Sr1 Al2 O4" into {element symbol: count}.

    An element symbol is one uppercase letter optionally followed by one
    lowercase letter. The count directly after a symbol may be an integer or a
    decimal number; a symbol with no count is given 1.0. If a symbol appears
    more than once, its counts are added together.

    Parameters
    ----------
    formula : object
        Formula text. Non-string values are converted with ``str``; ``None``
        and float NaN are treated as "no formula".

    Returns
    -------
    dict
        Mapping of element symbol to float count; empty when nothing matches.
    """
    # Missing values must not be stringified: str(None) == "None" would be read as nitrogen.
    if formula is None or (isinstance(formula, float) and formula != formula):
        return {}

    composition = {}
    # The count group requires at least one digit, so a stray "." is never handed to float().
    for symbol, count in re.findall(r'([A-Z][a-z]?)(\d+(?:\.\d*)?|\.\d+)?', str(formula)):
        amount = float(count) if count else 1.0
        # Accumulate so a repeated symbol contributes its total rather than its last count.
        composition[symbol] = composition.get(symbol, 0.0) + amount
    return composition

# Fill each tracked element column with the coefficient parsed from that row's 'Formula'
for row_label, formula_text in no_synth_merged_df['Formula'].items():
    coefficients = parse_formula(formula_text)
    for symbol in coefficients:
        # Symbols outside CERAMIC_ELEMENTS have no column and are skipped
        if symbol not in CERAMIC_ELEMENTS:
            continue
        no_synth_merged_df.at[row_label, symbol] = coefficients[symbol]

# Write the frame with the coefficient columns to disk and preview it
no_synth_merged_df.to_csv("updated_with_coefficients.csv", index=False)
no_synth_merged_df.head(2)

Unnamed: 0,Formula,Volume,Density,IDs,Uncorrected EPA,EPA,Formation EPA,Equilibrium Reaction EPA,Atomic Density,Bulk Modulus,...,Cr,Nb,Mo,W,Re,Sc,La,Ce,Th,U
0,Ti6 O10,177.344754,4.187248,mp-1147,-9.003853,-9.433228,-3.379943,-0.002179,11.067622,170.755,...,0,0,0,0,0,0,0,0,0,0
1,Nb6 C5,136.649587,7.503639,mp-2760,-10.249248,-10.249248,-0.544597,-0.163298,12.421358,235.464,...,0,6,0,0,0,0,0,0,0,0


In [33]:
def melting_temp_fine(G):
    """Estimate a melting temperature from the quadratic fit 553 + 5.91*G + 0.00055*G**2.

    ``G`` may be a scalar or anything supporting elementwise arithmetic (the
    notebook passes the "Shear Modulus" column). Units are not stated in the
    notebook -- presumably G in GPa and the result in K; TODO confirm.
    """
    intercept = 553
    linear_coeff = 5.91
    quadratic_coeff = 0.00055
    return intercept + linear_coeff * G + quadratic_coeff * G**2

# Add the estimated melting temperature 'Tm', computed from the 'Shear Modulus' column
no_synth_merged_df["Tm"] = no_synth_merged_df["Shear Modulus"].pipe(melting_temp_fine)

# Show formula next to its estimated melting temperature
print(no_synth_merged_df[["Formula", "Tm"]])

# Write the updated frame to disk and preview it
no_synth_merged_df.to_csv("updated_with_coefficients.csv", index=False)
no_synth_merged_df.head(2)

              Formula           Tm
0             Ti6 O10  1008.771313
1              Nb6 C5  1518.717853
2              Sr4 N4   705.671889
3              Ti4 O8   913.004429
4              Si2 O4   751.263269
...               ...          ...
1322   Ce2 Re2 Si2 C1   908.121889
1323  Ca4 Ti4 Si4 O20          NaN
1324    Ca1 Mg1 C2 O6   798.164060
1325   Y8 Si16 C4 N24  1165.613459
1326     K2 Ca1 C2 O6   682.028174

[1327 rows x 2 columns]


Unnamed: 0,Formula,Volume,Density,IDs,Uncorrected EPA,EPA,Formation EPA,Equilibrium Reaction EPA,Atomic Density,Bulk Modulus,...,Nb,Mo,W,Re,Sc,La,Ce,Th,U,Tm
0,Ti6 O10,177.344754,4.187248,mp-1147,-9.003853,-9.433228,-3.379943,-0.002179,11.067622,170.755,...,0,0,0,0,0,0,0,0,0,1008.771313
1,Nb6 C5,136.649587,7.503639,mp-2760,-10.249248,-10.249248,-0.544597,-0.163298,12.421358,235.464,...,6,0,0,0,0,0,0,0,0,1518.717853


In [34]:
# Drop the 'IDs' identifier column and save.
# The column is removed by name rather than by position (it was columns[3]):
# a positional drop removes a different column every time the cell is re-run
# or whenever the upstream column order changes. errors='ignore' makes the
# cell a no-op when 'IDs' has already been removed.
no_synth_merged_df = no_synth_merged_df.drop(columns=['IDs'], errors='ignore')

# Save the result
no_synth_merged_df.to_csv("updated_with_coefficients.csv", index=False)
no_synth_merged_df.head(2)

Unnamed: 0,Formula,Volume,Density,Uncorrected EPA,EPA,Formation EPA,Equilibrium Reaction EPA,Atomic Density,Bulk Modulus,Shear Modulus,...,Nb,Mo,W,Re,Sc,La,Ce,Th,U,Tm
0,Ti6 O10,177.344754,4.187248,-9.003853,-9.433228,-3.379943,-0.002179,11.067622,170.755,76.573,...,0,0,0,0,0,0,0,0,0,1008.771313
1,Nb6 C5,136.649587,7.503639,-10.249248,-10.249248,-0.544597,-0.163298,12.421358,235.464,160.992,...,6,0,0,0,0,0,0,0,0,1518.717853


In [35]:
# Keep only rows with at most one missing value (i.e. drop rows missing two or more)
min_non_null = len(no_synth_merged_df.columns) - 1
no_synth_merged_df = no_synth_merged_df.dropna(thresh=min_non_null)

# Write the filtered frame to disk and preview it
no_synth_merged_df.to_csv("updated_with_coefficients.csv", index=False)
no_synth_merged_df.head(2)

Unnamed: 0,Formula,Volume,Density,Uncorrected EPA,EPA,Formation EPA,Equilibrium Reaction EPA,Atomic Density,Bulk Modulus,Shear Modulus,...,Nb,Mo,W,Re,Sc,La,Ce,Th,U,Tm
0,Ti6 O10,177.344754,4.187248,-9.003853,-9.433228,-3.379943,-0.002179,11.067622,170.755,76.573,...,0,0,0,0,0,0,0,0,0,1008.771313
1,Nb6 C5,136.649587,7.503639,-10.249248,-10.249248,-0.544597,-0.163298,12.421358,235.464,160.992,...,6,0,0,0,0,0,0,0,0,1518.717853


In [36]:
# Keep rows whose Debye temperature is at most 2000; larger values are treated as outliers.
# Rows with a missing Debye temperature fail the comparison and are removed as well.
debye_in_range = no_synth_merged_df['Debye Temperature'].le(2000)
no_synth_merged_df = no_synth_merged_df.loc[debye_in_range]

# Write the filtered frame to disk and preview it
no_synth_merged_df.to_csv("updated_with_coefficients.csv", index=False)
no_synth_merged_df.head(2)

Unnamed: 0,Formula,Volume,Density,Uncorrected EPA,EPA,Formation EPA,Equilibrium Reaction EPA,Atomic Density,Bulk Modulus,Shear Modulus,...,Nb,Mo,W,Re,Sc,La,Ce,Th,U,Tm
0,Ti6 O10,177.344754,4.187248,-9.003853,-9.433228,-3.379943,-0.002179,11.067622,170.755,76.573,...,0,0,0,0,0,0,0,0,0,1008.771313
1,Nb6 C5,136.649587,7.503639,-10.249248,-10.249248,-0.544597,-0.163298,12.421358,235.464,160.992,...,6,0,0,0,0,0,0,0,0,1518.717853


In [37]:
def extract_elements(formula):
    """Return the element symbols found in ``formula``, in order of appearance.

    A symbol is one uppercase letter optionally followed by a single lowercase
    letter (e.g. Nb, P, Y, S). Repeated symbols are returned once per
    occurrence; counts and whitespace are ignored.
    """
    symbol_pattern = r'[A-Z][a-z]?'
    return [hit.group(0) for hit in re.finditer(symbol_pattern, formula)]

# Build the 'Elements' column: the list of element symbols found in each row's formula
formula_column = no_synth_merged_df['Formula']
no_synth_merged_df['Elements'] = formula_column.map(extract_elements)

# Write the updated frame to disk and preview it
no_synth_merged_df.to_csv("updated_with_coefficients.csv", index=False)
no_synth_merged_df.head(2)

Unnamed: 0,Formula,Volume,Density,Uncorrected EPA,EPA,Formation EPA,Equilibrium Reaction EPA,Atomic Density,Bulk Modulus,Shear Modulus,...,Mo,W,Re,Sc,La,Ce,Th,U,Tm,Elements
0,Ti6 O10,177.344754,4.187248,-9.003853,-9.433228,-3.379943,-0.002179,11.067622,170.755,76.573,...,0,0,0,0,0,0,0,0,1008.771313,"[Ti, O]"
1,Nb6 C5,136.649587,7.503639,-10.249248,-10.249248,-0.544597,-0.163298,12.421358,235.464,160.992,...,0,0,0,0,0,0,0,0,1518.717853,"[Nb, C]"


In [38]:
# Hold out every 6th row (positions 5, 11, 17, ...) to create a testing dataset
holdout_labels = no_synth_merged_df.index[5::6]
test_df = no_synth_merged_df.iloc[5::6]

# Remove the held-out rows (by index label) from the remaining training frame
no_synth_merged_df.drop(index=holdout_labels, inplace=True)

# Write both splits to disk
no_synth_merged_df.to_csv("updated_with_coefficients.csv", index=False)
test_df.to_csv("test_data.csv", index=False)

print("All Done")

All Done
