In [14]:
import pandas as pd
import numpy as np
import json
from utils.classes import *
from utils.exclude import *

filename = r'data/solid-state_dataset_20200713.json'
filedata = open(filename, mode='r').read()
jsonParse = json.loads(filedata)

# import glob
# # Define the path to the folder containing the CSV files
# folder_path = 'sinter_dataset/*.csv'

# # Use glob to match all csv file paths in the folder
# csv_files = glob.glob(folder_path)

# # Initialize an empty list to store dataframes
# dfs = []

# # Loop through the list of csv file paths
# for file in csv_files:
#     # Read the current csv file into a dataframe
#     df = pd.read_csv(file)
#     # Append the dataframe to the list of dataframes
#     dfs.append(df)

# # Concatenate all dataframes in the list into a single dataframe
# merged_df = pd.concat(dfs, ignore_index=True)
# merged_df = merged_df.groupby('formula').agg(lambda x: list(set(x))).reset_index()
# merged_df = merged_df.drop(columns=['Unnamed: 0'])
# merged_df.set_index('formula', inplace=True)
# # Display the merged dataframe
# print(merged_df)

reactions = [from_dict(reaction, ReactionEntry) for reaction in jsonParse['reactions']]

In [None]:
'''
Things the original code filter had that I don't think should exist:
1.  -0.667 (NH4)3C3H5O(COO)3 + 2 FeC2O4·2H2O + 2 NH3 + 1 SiO2 == 1 Fe2SiO4 + 1.333 H2O + 2 O2
    This is outlier code: -0.667 as formula part multiplicity?
2.  0.333 Co3O4 + 0.5-0.5*x Nd2O3 + 0.083+0.25*x O2 + x SrCO3 == 1 Nd1-xSrxCoO3 + x CO2; 1 CuO + 1-x La2O3 + 0.5*x O2 + 2*x SrCO3 == 1 (La1-xSrx)2CuO4 + 2*x CO2 and many
    This isn't allowed, right? But this is as part of good list
'''

In [7]:
import json

# Define the list of bad elements/formulas
bad_list = ['*', '-', 'x', '+', '/', 'ac', '(2N)', '(3N)', '(4N)', '(5N)', '(6N)', '7LiOH', '2Ni(OH)2']

# Load solid-state dataset
with open('data/solid-state_dataset_20200713.json') as f:
    ss_data = json.load(f)

# Extract reactions from dataset
reactions = ss_data["reactions"]

# Minimum number of precursors
min_pre = 2

# Initialize filtered reactions list
filtered_reactions = []

# Filter reactions based on specified criteria
for rxn in reactions:
    if rxn["doi"] in BAD_DOI:
        continue
    if len(rxn["precursors"]) < min_pre:
        continue
    if any(rxn["targets_string"][x] in BAD_TARGETS for x in range(len(rxn["targets_string"]))):
        continue
    if any(rxn["precursors"][x]["material_formula"] in BAD_PRECURSORS for x in range(len(rxn["precursors"]))):
        continue
    
    found_bad = False
    for bad in bad_list:
        if any(bad in rxn["targets_string"][x] for x in range(len(rxn["targets_string"]))) or \
           any(bad in rxn["precursors"][x]["material_formula"] for x in range(len(rxn["precursors"]))):
            # print(list(rxn["targets_string"][x] for x in range(len(rxn["targets_string"]))))
            found_bad = True
    
    if found_bad:
        continue
    else:
        filtered_reactions.append(rxn)

# Convert filtered reactions to ReactionEntry objects
reactions_m2 = [from_dict(reaction, ReactionEntry) for reaction in filtered_reactions]


In [15]:
rxn_string = '1 CuO + 1-x La2O3 + 0.5*x O2 + 2*x SrCO3 == 1 (La1-xSrx)2CuO4 + 2*x CO2'
x1 = [x for x in reactions if x.reaction_string == rxn_string][0]

In [16]:
import re
[not bool(re.match(r'^-?\d+(\.\d+)?$', s.amount)) for s in x1.reaction.left_side]

[True, True, False, True]

In [17]:
x1.precursors

[Material(material_string='SrCO3', material_formula='SrCO3', material_name='', phase=None, is_acronym=False, composition=[Composition(formula='SrCO3', amount='1', elements={'Sr': '1', 'C': '1', 'O': '3'})], amount_vars={}, element_vars={}, additives=[], oxygen_deficiency=None),
 Material(material_string='La2O3', material_formula='La2O3', material_name='', phase=None, is_acronym=False, composition=[Composition(formula='La2O3', amount='1', elements={'La': '2', 'O': '3'})], amount_vars={}, element_vars={}, additives=[], oxygen_deficiency=None),
 Material(material_string='CuO', material_formula='CuO', material_name='', phase=None, is_acronym=False, composition=[Composition(formula='CuO', amount='1', elements={'Cu': '1', 'O': '1'})], amount_vars={}, element_vars={}, additives=[], oxygen_deficiency=None)]

In [18]:
x1.precursors

[Material(material_string='SrCO3', material_formula='SrCO3', material_name='', phase=None, is_acronym=False, composition=[Composition(formula='SrCO3', amount='1', elements={'Sr': '1', 'C': '1', 'O': '3'})], amount_vars={}, element_vars={}, additives=[], oxygen_deficiency=None),
 Material(material_string='La2O3', material_formula='La2O3', material_name='', phase=None, is_acronym=False, composition=[Composition(formula='La2O3', amount='1', elements={'La': '2', 'O': '3'})], amount_vars={}, element_vars={}, additives=[], oxygen_deficiency=None),
 Material(material_string='CuO', material_formula='CuO', material_name='', phase=None, is_acronym=False, composition=[Composition(formula='CuO', amount='1', elements={'Cu': '1', 'O': '1'})], amount_vars={}, element_vars={}, additives=[], oxygen_deficiency=None)]

In [19]:
def RemoveBadEntries(reactions: list,
                        min_precursors = 2,
                        remove_bad_doi = True,
                        remove_bad_target = True,
                        remove_bad_precursor = True, 
                        remove_duplicates_via_doi = True,
                        remove_invalid_coefficients_multiplicities = True,
                        use_bad_list = True,
                        remove_negative_coefficients = True, 
                        verbose_output = True):
    filtered_reactions = []
    bad_list = ['*', '-', 'x', '+', '/', 'ac', '(2N)', '(3N)', '(4N)', '(5N)', '(6N)', '7LiOH', '2Ni(OH)2']
    isDigitRegex = re.compile(r'^-?\d+(\.\d+)?$')
    isNegativeRegex = re.compile(r'^(0*[1-9]\d*|0*\d*\.\d*[1-9])$')
    RegexSelected = isNegativeRegex
    if not remove_negative_coefficients:
        RegexSelected = isDigitRegex
    for reaction in reactions:
        rxn: ReactionEntry = reaction
        if (verbose_output): print(rxn.reaction_string, end='')
        if remove_bad_doi and rxn.doi in BAD_DOI: 
            if (verbose_output): print(": REJECTED DUE TO BAD DOI")
            continue
        if len(rxn.precursors) < min_precursors: 
            if (verbose_output): print(": REJECTED DUE TO LOW PRECURSOR COUNT")
            continue
        if remove_bad_target and any(target in BAD_TARGETS for target in (rxn.targets_string)):  
            if (verbose_output): print(": REJECTED DUE TO BAD TARGET")
            continue
        if remove_bad_precursor and  any(precursor.material_formula in BAD_PRECURSORS for precursor in rxn.precursors):  
            if (verbose_output): print(": REJECTED DUE TO BAD PRECURSOR")
            continue
        # if any([not bool(isDigitRegex.match(s.amount)) for s in rxn.reaction.left_side]):
        #     if (verbose_output): print(": REJECTED DUE TO UNKNOWN COEFFICIENT IN LHS")
        #     continue
        # if any([not bool(isDigitRegex.match(s.amount)) for s in rxn.reaction.right_side]):
        #     if (verbose_output): print(": REJECTED DUE TO UNKNOWN COEFFICIENT IN RHS")
        #     continue 
        if remove_invalid_coefficients_multiplicities and any([not bool(RegexSelected.match(s.amount)) for s in rxn.reaction.left_side]):
            if (verbose_output): print(": REJECTED DUE TO INVALID COEFFICIENT IN LHS")
            continue
        if remove_invalid_coefficients_multiplicities and any([not bool(RegexSelected.match(s.amount)) for s in rxn.reaction.right_side]):
            if (verbose_output): print(": REJECTED DUE TO INVALID COEFFICIENT IN RHS")
            continue

        found_bad = False
        if use_bad_list: 
            for bad in bad_list:
                if(any(bad in target_string for target_string in rxn.targets_string)) \
                or any(bad in precursor.material_formula for precursor in rxn.precursors):
                    found_bad = True
        
        if found_bad:  
            if (verbose_output): print(": REJECTED CHARACTER FROM BAD LIST")
            continue
        else:
            if (verbose_output): print(": SELECTED") 
            filtered_reactions.append(rxn)
    print("Filtered", len(filtered_reactions), "reactions out of total", len(reactions))
    return filtered_reactions

In [20]:
def NormalizePrecursors(reactions: list):
    PrecursorMaterialReplacements = {}
    for key, value in PREC_REPLACEMENTS.items():
        PrecursorKey    = [ material for reaction in reactions for material in reaction.precursors if material.material_formula == key]
        PrecursorValue  = [ material for reaction in reactions for material in reaction.precursors if material.material_formula == value]
        filtered_reactions = [reaction for reaction in reactions for material in reaction.precursors if key in material.material_formula]
        number_replacements = len(filtered_reactions)
        if(len(PrecursorKey) > 0):
            for rxn in filtered_reactions:
                for prec in rxn.precursors:
                    if prec.material_formula == PrecursorKey[0].material_formula: 
                        #print("replace here")
                        prec = PrecursorValue[0]
                    if PrecursorKey[0].material_formula in rxn.reaction_string:
                        rxn.reaction_string.replace(PrecursorKey[0].material_formula, PrecursorValue[0].material_formula)
                    # TODO: You still have to replace Formula parts in rxn.reaction. Find a proposal that works.
            PrecursorMaterialReplacements[PrecursorKey[0]] = PrecursorValue[0]
            print("Processed:", key, '=', value, ": replaced", number_replacements, " places")
        else: print("Skipped:", key)
    return reactions

def RemoveDuplicates(reactions: list):
    return reactions

In [143]:
[rxn.reaction for rxn in NormalizePrecursors(reactions.copy()) if 'Ce(NO3)4' in str(rxn)]

Processed: Ce(NO3)4 = Ce(NO3)3 : replaced 23  places
Processed: Ni(CH3COO)3 = Ni(CH3COO)2 : replaced 24  places
Processed: Fe(NO3)2 = Fe(NO3)3 : replaced 11  places
Skipped: Fe(NO3)
Processed: LiCO3 = Li2CO3 : replaced 41  places
Processed: Ba2CO3 = BaCO3 : replaced 31  places
Processed: Cu(NO3)3 = Cu(NO3)2 : replaced 2  places
Processed: Ca2CO3 = CaCO3 : replaced 7  places
Skipped: CH3COONi
Skipped: Co(NO3)
Processed: Co(CH3COO)3 = Co(CH3COO)2 : replaced 5  places
Skipped: Mg(CH3COO)
Processed: Mn(NO3)3 = Mn(NO3)2 : replaced 11  places
Processed: Mn(NO3)7 = Mn(NO3)2 : replaced 4  places
Processed: Ni(NO3)3 = Ni(NO3)2 : replaced 15  places


[Formula(left_side=[FormulaPart(amount='-4', material='C'), FormulaPart(amount='2', material='C2H5OH'), FormulaPart(amount='1', material='CaSO4')], right_side=[FormulaPart(amount='1', material='CaS'), FormulaPart(amount='6', material='H2O')], element_substitution={}),
 Formula(left_side=[FormulaPart(amount='-1.625', material='C'), FormulaPart(amount='0.75', material='Dy(NO3)3'), FormulaPart(amount='0.812', material='C2H5OH'), FormulaPart(amount='0.25', material='Ce(NO3)4'), FormulaPart(amount='3.25', material='[OH-]')], right_side=[FormulaPart(amount='1', material='Ce0.25Dy0.75'), FormulaPart(amount='4.062', material='H2O'), FormulaPart(amount='3.25', material='[NO3-]')], element_substitution={}),
 Formula(left_side=[FormulaPart(amount='-1.75', material='C'), FormulaPart(amount='0.5', material='Dy(NO3)3'), FormulaPart(amount='0.875', material='C2H5OH'), FormulaPart(amount='0.5', material='Ce(NO3)4'), FormulaPart(amount='3.5', material='[OH-]')], right_side=[FormulaPart(amount='1', mate

In [21]:
filteredRxns = RemoveBadEntries(reactions, verbose_output=False)

Filtered 20093 reactions out of total 31782


In [22]:
reactions[:5]

[ReactionEntry(doi='10.1149/1.1383553', paragraph_string='High surface area activated carbons were obtained  <...> ce of these materials will be published elsewhere.', synthesis_type='solid-state', reaction_string='2 Li2CO3 + 5 TiO2 == 1 Li4Ti5O12 + 2 CO2', reaction=Formula(left_side=[FormulaPart(amount='5', material='TiO2'), FormulaPart(amount='2', material='Li2CO3')], right_side=[FormulaPart(amount='1', material='Li4Ti5O12'), FormulaPart(amount='2', material='CO2')], element_substitution={}), targets_string=['Li4Ti5O12'], target=Material(material_string='Li4Ti5O12', material_formula='Li4Ti5O12', material_name='', phase=None, is_acronym=False, composition=[Composition(formula='Li4Ti5O12', amount='1', elements={'Li': '4', 'Ti': '5', 'O': '12'})], amount_vars={}, element_vars={}, additives=[], oxygen_deficiency=None), precursors=[Material(material_string='TiO2', material_formula='TiO2', material_name='', phase=None, is_acronym=False, composition=[Composition(formula='TiO2', amount='1', 

In [23]:
from collections import defaultdict

# Assuming 'reactions' is your list of ReactionEntry objects

# Create a defaultdict to store entries grouped by (doi, reaction_string)
entry_dict = defaultdict(list)
for entry in filteredRxns:
    entry_dict[(entry.doi, entry.reaction_string)].append(entry)

# Filter out entries where there are duplicates (keep only the first occurrence)
filtered_reactions_doi = []
seen_keys = set()
for entry in filteredRxns:
    key = (entry.doi, entry.reaction_string)
    if key not in seen_keys:
        seen_keys.add(key)
        filtered_reactions_doi.append(entry)

# Now 'filtered_reactions' contains entries with duplicates removed based on (doi, reaction_string)

len(filtered_reactions_doi)

19318

In [None]:
# Dictionary to store entries grouped by (right_side_tuple, amount_tuple)
duplicate_entries = defaultdict(list)
filtered_reactions_dupForm = []
for entry in filtered_reactions_doi:
    # Create a tuple representation of right_side
    right_side_tuple = tuple((part.amount, part.material) for part in entry.reaction.right_side)

    # Create sets of (amount, material) tuples for target and precursors
    target_materials = {(comp.amount, comp.formula) for comp in entry.target.composition}
    precursor_materials =  {(mat.amount, mat.formula) for composition in (material.composition for material in reactions[0].precursors) for mat in composition}

    # Create a tuple for (target materials, precursor materials)
    materials_tuple = (frozenset(target_materials), frozenset(precursor_materials))

    key = (right_side_tuple, materials_tuple)
    duplicate_entries[key].append(entry)

# Now, find and print duplicates
seen_keys = set()
for key, entries in duplicate_entries.items():
    if len(entries) > 1:
        if key not in seen_keys:
            seen_keys.add(key)
            filtered_reactions_dupForm.append(entry)
        print(len(entries), "\tDuplicates for: ", end='')
        print(f"Right Side: {key[0]}", end='')
        print("Target materials:", end='')
        print(key[1][0], end='')
        print("Precursor materials:", end='')
        print(key[1][1], end='')
        print()
        for i in range(len(entries)):
            entry = entries[i]
            print("Entry #{}: ".format(i), end='')
            calc_operations = [op for op in entry.operations if "calc" in op.token]
            print("No. Calcination Operations: {}, ".format(len(calc_operations)))
            for op in calc_operations:
                print(op)


In [25]:
print("Original Dataset Size: \t\t\t\t", len(reactions))
print("1st level reduction (Revised Bad List): \t", len(filteredRxns))
print("2nd level reduction (DOIs): \t\t\t", len(filtered_reactions_doi))
print("3rd level reduction (affinity + fuzzy): \t", len(filtered_reactions_dupForm))

Original Dataset Size: 				 31782
1st level reduction (Revised Bad List): 	 20093
2nd level reduction (DOIs): 			 19318
3rd level reduction (affinity + fuzzy): 	 2962
