In [17]:
import json
# Load the dataset: one JSON document per line (JSON Lines).
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform default, and whitespace-only lines (e.g. an empty line at the end
# of the file) are skipped instead of making json.loads raise.
with open("mp_data.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

In [15]:
# Print the pretty formula of every record whose 'elements' list has exactly
# six entries. The generator is lazy, so records are checked and printed one
# at a time.
six_element_formulas = (
    record['formula_pretty'] for record in data if len(record['elements']) == 6
)
for formula_pretty in six_element_formulas:
    print(formula_pretty)

H3PbCI3NF3


In [20]:
from collections import Counter
# Length of the 'elements' list of every record, in dataset order.
lengths = list(map(len, (record['elements'] for record in data)))

# Tally how many records share each length.
length_counts = Counter(lengths)

# Report the tally in ascending order of element count.
for n_elements in sorted(length_counts):
    n_samples = length_counts[n_elements]
    print(f"Number of samples with {n_elements} elements: {n_samples}")


Number of samples with 1 elements: 200
Number of samples with 2 elements: 3967
Number of samples with 3 elements: 5971
Number of samples with 4 elements: 361
Number of samples with 5 elements: 20
Number of samples with 6 elements: 1


In [21]:
# Collapse records that share the same combination of elements: every
# 'elements' list becomes a frozenset (hashable and order-insensitive), and
# the frozensets are collected into a set so duplicates disappear.
unique_element_lists = {frozenset(record['elements']) for record in data}

# Size of each distinct combination, followed by the tally per size.
lengths = list(map(len, unique_element_lists))
length_counts = Counter(lengths)

# Report the tally in ascending order of combination size.
print("Number of unique materials lists:")
for size in sorted(length_counts):
    n_unique = length_counts[size]
    print(f"Number of unique materials lists with {size} elements: {n_unique}")

Number of unique materials lists:
Number of unique materials lists with 1 elements: 74
Number of unique materials lists with 2 elements: 1741
Number of unique materials lists with 3 elements: 4794
Number of unique materials lists with 4 elements: 344
Number of unique materials lists with 5 elements: 20
Number of unique materials lists with 6 elements: 1


In [8]:
# Gather every element symbol that occurs in any record's 'elements' list and
# keep the distinct symbols as a sorted list.
elements = sorted({symbol for record in data for symbol in record["elements"]})
print(elements)
print(len(elements))

['Ac', 'Ag', 'Al', 'Ar', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cl', 'Co', 'Cr', 'Cs', 'Cu', 'Dy', 'Er', 'Eu', 'F', 'Fe', 'Ga', 'Gd', 'Ge', 'H', 'He', 'Hf', 'Hg', 'Ho', 'I', 'In', 'Ir', 'K', 'Kr', 'La', 'Li', 'Lu', 'Mg', 'Mn', 'Mo', 'N', 'Na', 'Nb', 'Nd', 'Ne', 'Ni', 'Np', 'O', 'Os', 'P', 'Pa', 'Pb', 'Pd', 'Pm', 'Pr', 'Pt', 'Pu', 'Rb', 'Re', 'Rh', 'Ru', 'S', 'Sb', 'Sc', 'Se', 'Si', 'Sm', 'Sn', 'Sr', 'Ta', 'Tb', 'Tc', 'Te', 'Th', 'Ti', 'Tl', 'Tm', 'U', 'V', 'W', 'Xe', 'Y', 'Zn', 'Zr']
88


In [9]:
import re
from pymatgen.core.composition import Composition

# Collect the set of element symbols found in the formulas of the test prompts.
with open("../reproduce/data/prompt_type_4/ec_desc_test_dataset_with_mpid.json", "r", encoding="utf-8") as f:
    data_test = json.load(f)

# Compiled once instead of on every iteration. Note that \w+ only captures
# formulas made of letters, digits and underscores (so no parentheses or dots).
material_pattern = re.compile(r"The material (\w+) with")

elements_test = set()
# Prompts that contain "The material" but whose formula the pattern could not
# capture. They used to be dropped silently, which could under-report the set.
unparsed_inputs = 0

for item in data_test:
    input_text = item['input']
    if "The material" not in input_text:
        continue  # nothing to parse in this prompt
    material_match = material_pattern.search(input_text)
    if material_match is None:
        unparsed_inputs += 1
        continue
    material_formula = material_match.group(1)
    comp = Composition(material_formula)
    elements_test.update(str(el) for el in comp.elements)

# Only emitted when something was actually skipped, so the output is unchanged
# for datasets where every formula is captured.
if unparsed_inputs:
    print(f"Warning: {unparsed_inputs} inputs contain 'The material' but did not match the formula pattern")
print(elements_test)
print(len(elements_test))


{'Ti', 'Tb', 'Rh', 'C', 'Zn', 'Ga', 'Pr', 'W', 'Te', 'Os', 'Th', 'Bi', 'Ge', 'Cu', 'Ru', 'Fe', 'Y', 'Si', 'Br', 'Np', 'V', 'La', 'Nb', 'Cs', 'Sr', 'Sn', 'Au', 'Mn', 'Sc', 'Cd', 'Tl', 'In', 'Ir', 'Eu', 'Co', 'S', 'O', 'Pb', 'H', 'U', 'Cl', 'N', 'Ho', 'Rb', 'Nd', 'Li', 'Hf', 'Tm', 'Zr', 'Ba', 'F', 'Er', 'Tc', 'K', 'Mg', 'Pd', 'Ne', 'Na', 'I', 'Ac', 'Se', 'Pt', 'Ta', 'Cr', 'B', 'Ce', 'Al', 'Mo', 'Be', 'Sb', 'Pa', 'Ni', 'Sm', 'Ag', 'Lu', 'P', 'As', 'Re', 'Hg', 'Dy', 'Ca'}
81


In [8]:
# Element symbols seen in the test prompts that never occur in `elements`.
elements_not_in_training = elements_test.difference(elements)
if len(elements_not_in_training) > 0:
    print(f"Warning: Found elements in test set that are not in training set: {elements_not_in_training}")

In [1]:
import json
from pymatgen.core.composition import Composition
from collections import Counter

# Read the temperature dataset, then keep only the records with
# pressure == 1 and a non-zero temperature.
with open("../reproduce/data/temp_data/combined_temp_data.json", "r") as f:
    temp_data_raw = json.load(f)
temp_data = [
    record
    for record in temp_data_raw
    if record['pressure'] == 1 and record['temperature'] != 0
]
total_materials = len(temp_data)

# One pass over the records: the number of elements pymatgen's Composition
# reports for the formula, paired with the record's crystal-system label.
per_record = [
    (len(Composition(record['formula']).elements), record['crystal system'])
    for record in temp_data
]
element_counts = [n_elements for n_elements, _ in per_record]
crystal_systems = [system_name for _, system_name in per_record]

# Frequency table of element counts, ordered by element count.
count_distribution = Counter(element_counts)
sorted_counts = sorted(count_distribution.items())

print("Distribution of number of elements in formulas:")
for num_elements, count in sorted_counts:
    share = count / total_materials * 100
    print(f"{num_elements} elements: {count} materials ({share:.2f}%)")

# Frequency table of crystal systems, ordered by label.
crystal_system_distribution = Counter(crystal_systems)
sorted_crystal_systems = sorted(crystal_system_distribution.items())

print("\nDistribution of crystal systems:")
for system, count in sorted_crystal_systems:
    share = count / total_materials * 100
    print(f"{system}: {count} materials ({share:.2f}%)")

print(f"\nTotal number of materials: {total_materials}")


Distribution of number of elements in formulas:
1 elements: 973 materials (76.86%)
2 elements: 293 materials (23.14%)

Distribution of crystal systems:
cubic: 841 materials (66.43%)
hexagonal: 330 materials (26.07%)
orthorhomabic: 67 materials (5.29%)
tetragonal 1: 21 materials (1.66%)
trigonal 1: 7 materials (0.55%)

Total number of materials: 1266
