In [1]:
import json
# Raw inputs for the consistency checks in this notebook (per the original notes):
#   * the stable and unstable crystal data are downloaded from the Materials Project;
#   * the combined data is a combination of the stable and unstable data;
#   * the *_with_desc files only add a 'description' key, which contains the crystal's
#     textual description generated with robocrystallographer.
with open("mp_elastic_stable.json") as stable_fh:
    data_stable = json.load(stable_fh)
with open("mp_elastic_unstable.json") as unstable_fh:
    data_unstable = json.load(unstable_fh)
with open("mp_elastic_stable_with_desc.json") as stable_desc_fh:
    data_stable_desc = json.load(stable_desc_fh)
with open("mp_elastic_unstable_with_desc.json") as unstable_desc_fh:
    data_unstable_desc = json.load(unstable_desc_fh)
with open("mp_elastic_combined.json") as combined_fh:
    data_combined = json.load(combined_fh)

In [2]:
# Report the number of entries in every loaded dataset.
for length_label, dataset in (
    ("Stable data", data_stable),
    ("Stable data with description", data_stable_desc),
    ("Unstable data", data_unstable),
    ("Unstable data with description", data_unstable_desc),
    ("Combined data", data_combined),
):
    print(f"{length_label} length:", len(dataset))
# Expected size of the combined dataset, derived from the loaded data rather than
# from numbers copied out of a previous run's output.
print(len(data_stable) + len(data_unstable))

Stable data length: 6285
Stable data with description length: 6285
Unstable data length: 5842
Unstable data with description length: 5842
Combined data length: 12127
12127


In [3]:
def check_database_version(data, expected_version="2023.11.1"):
    """
    Verify that every entry was built against the expected database version.

    Each entry is looked up as ``entry['builder_meta']['database_version']``.
    An entry with no ``builder_meta`` (or an empty one) is reported as missing
    its version instead of raising ``KeyError``.

    Args:
        data (list): Entries (dicts) to inspect.
        expected_version (str): Version string every entry must carry.
            Defaults to "2023.11.1".

    Prints one line per offending entry, or a single confirmation line when
    every entry matches. Returns nothing.
    """
    all_correct = True
    for entry in data:
        # Tolerate a missing/None 'builder_meta' so one malformed entry cannot abort the scan.
        builder_meta = entry.get('builder_meta') or {}
        if "database_version" not in builder_meta:
            print("Missing database_version in entry:", entry)
            all_correct = False
        elif builder_meta["database_version"] != expected_version:
            print("Unexpected database_version in entry:", entry)
            all_correct = False
    if all_correct:
        print(f"All database versions are {expected_version}")
# Run the version check on every dataset, in the same order as before.
for dataset_to_check in (
    data_combined,
    data_stable,
    data_stable_desc,
    data_unstable,
    data_unstable_desc,
):
    check_database_version(dataset_to_check)

All database versions are 2023.11.1
All database versions are 2023.11.1
All database versions are 2023.11.1
All database versions are 2023.11.1
All database versions are 2023.11.1


In [4]:
# Check whether the material IDs are unique within a dataset.
def check_materials_id(data):
    """Return True when no 'material_id' value occurs more than once in `data`."""
    distinct_ids = {material["material_id"] for material in data}
    return len(data) == len(distinct_ids)
# Print the uniqueness verdict for each dataset.
for dataset_label, dataset in (
    ("Stable data", data_stable),
    ("Stable data with description", data_stable_desc),
    ("Unstable data", data_unstable),
    ("Unstable data with description", data_unstable_desc),
    ("Combined data", data_combined),
):
    print(f"{dataset_label} has unique material IDs:", check_materials_id(dataset))

Stable data has unique material IDs: True
Stable data with description has unique material IDs: True
Unstable data has unique material IDs: True
Unstable data with description has unique material IDs: True
Combined data has unique material IDs: True


In [5]:
# Material IDs of every dataset, in file order.
stable_data_mp_ids = [entry['material_id'] for entry in data_stable]
unstable_data_mp_ids = [entry['material_id'] for entry in data_unstable]
combined_data_mp_ids = [entry['material_id'] for entry in data_combined]
stable_data_mp_ids_desc = [entry['material_id'] for entry in data_stable_desc]
unstable_data_mp_ids_desc = [entry['material_id'] for entry in data_unstable_desc]

In [6]:
# Check that the stable mp ids form the leading part of the combined mp ids, in the same order.
def check_stable_mp_ids_in_combined(stable_mp_ids, combined_mp_ids):
    """
    Return True when `combined_mp_ids` starts with `stable_mp_ids`.

    Args:
        stable_mp_ids (list): IDs expected at the beginning of the combined list.
        combined_mp_ids (list): IDs of the combined dataset.

    Returns:
        bool: True if the first len(stable_mp_ids) combined IDs equal the stable
        IDs position by position. False otherwise, including when the combined
        list is too short to contain them (this used to raise IndexError).
    """
    n_stable = len(stable_mp_ids)
    if len(combined_mp_ids) < n_stable:
        return False
    return all(stable_mp_ids[i] == combined_mp_ids[i] for i in range(n_stable))
# Report whether the stable IDs open the combined ID list in the same order.
print(
    "Stable data mp ids are in combined data mp ids and have the same sequence:",
    check_stable_mp_ids_in_combined(stable_data_mp_ids, combined_data_mp_ids),
)
def check_unstable_mp_ids_in_combined(unstable_mp_ids, combined_mp_ids, offset=None):
    """
    Return True when `unstable_mp_ids` appears in `combined_mp_ids` starting at `offset`.

    Args:
        unstable_mp_ids (list): IDs expected inside the combined list.
        combined_mp_ids (list): IDs of the combined dataset.
        offset (int | None): Index in `combined_mp_ids` where the unstable IDs
            should begin. When None (the default) it falls back to
            len(stable_data_mp_ids), the notebook-level list of stable IDs,
            which is the original behaviour.

    Returns:
        bool: True if every unstable ID equals the combined ID at the shifted
        position. False otherwise, including when the combined list is too
        short (this used to raise IndexError).

    Raises:
        ValueError: If `offset` is negative.
    """
    if offset is None:
        # Original behaviour: the unstable entries follow the stable ones.
        offset = len(stable_data_mp_ids)
    if offset < 0:
        raise ValueError("offset must be non-negative")
    n_unstable = len(unstable_mp_ids)
    if len(combined_mp_ids) < offset + n_unstable:
        return False
    return all(
        unstable_mp_ids[i] == combined_mp_ids[i + offset] for i in range(n_unstable)
    )
# Report whether the unstable IDs follow the stable ones inside the combined ID list.
print(
    "Unstable data mp ids are in combined data mp ids and have the same sequence:",
    check_unstable_mp_ids_in_combined(unstable_data_mp_ids, combined_data_mp_ids),
)
def check_sequence(data1, data2):
    """
    Return True when both sequences hold the same items in the same order.

    Args:
        data1 (list): First sequence.
        data2 (list): Second sequence.

    Returns:
        bool: True only if the lengths are equal and every position matches.
        A length difference yields False; previously a shorter `data2` raised
        IndexError and a longer `data2` was silently accepted.
    """
    if len(data1) != len(data2):
        return False
    return all(data1[i] == data2[i] for i in range(len(data1)))
# Each plain dataset must list the same IDs, in the same order, as its *_with_desc counterpart.
print(
    "Stable data mp ids are in stable data with description mp ids and have the same sequence:",
    check_sequence(stable_data_mp_ids, stable_data_mp_ids_desc),
)
print(
    "Unstable data mp ids are in unstable data with description mp ids and have the same sequence:",
    check_sequence(unstable_data_mp_ids, unstable_data_mp_ids_desc),
)

Stable data mp ids are in combined data mp ids and have the same sequence: True
Unstable data mp ids are in combined data mp ids and have the same sequence: True
Stable data mp ids are in stable data with description mp ids and have the same sequence: True
Unstable data mp ids are in unstable data with description mp ids and have the same sequence: True


In [7]:
import pandas as pd

def save_mp_ids_to_csv(mp_ids, filename):
    """
    Write material IDs to a CSV file with a single 'material_id' column.

    The row index is not written to the file.

    Args:
        mp_ids (list): Material IDs, one per output row.
        filename (str): Path of the CSV file to create.
    """
    pd.DataFrame(mp_ids, columns=['material_id']).to_csv(filename, index=False)

# Export every ID list to its own CSV file.
for id_list, csv_name in (
    (stable_data_mp_ids, 'stable_data_mp_ids.csv'),
    (stable_data_mp_ids_desc, 'stable_data_mp_ids_desc.csv'),
    (unstable_data_mp_ids, 'unstable_data_mp_ids.csv'),
    (unstable_data_mp_ids_desc, 'unstable_data_mp_ids_desc.csv'),
    (combined_data_mp_ids, 'combined_data_mp_ids.csv'),
):
    save_mp_ids_to_csv(id_list, csv_name)

In [8]:
# Compare the material IDs of the local combined dataset with the reproduced copy.
with open("mp_elastic_combined.json", "r") as f:
    data_combined = json.load(f)
with open("../../../reproduce/data/mp_elastic_combined.json", "r") as f:
    data_combined_reproduced = json.load(f)

# `result` is initialised once, before the loop. Resetting it inside the loop (as the
# previous version did) let a later matching entry erase an earlier mismatch, and left
# `result` undefined when the dataset was empty.
result = len(data_combined) == len(data_combined_reproduced)
if not result:
    print("Length mismatch:", len(data_combined), len(data_combined_reproduced))
# zip stops at the shorter dataset, so a length difference cannot raise IndexError here.
for i, (entry, entry_reproduced) in enumerate(zip(data_combined, data_combined_reproduced)):
    if entry['material_id'] != entry_reproduced['material_id']:
        print("Materials ID mismatch at index", i)
        result = False
if result:
    print("All materials IDs match between the two datasets.")

All materials IDs match between the two datasets.


In [3]:
from datasets import Dataset
import json
# Check for data leakage: no material ID may appear in more than one of the
# prompt-type-1 train / validation / test splits.
reproduced_training_set_prompt_type1_real = Dataset.from_json(
    "../../../reproduce/data/prompt_type_1/real_train_dataset_with_mpid.json"
)
reproduced_val_set_prompt_type1_real = Dataset.from_json(
    "../../../reproduce/data/prompt_type_1/real_val_dataset_with_mpid.json"
)
with open("../../../reproduce/data/prompt_type_1/ec_short_test_dataset_with_mpid.json", "r") as test_fh:
    reproduced_test_set_prompt_type1 = json.load(test_fh)

mp_id_train = [row['material_id'] for row in reproduced_training_set_prompt_type1_real]
mp_id_val = [row['material_id'] for row in reproduced_val_set_prompt_type1_real]
mp_id_test = [row['material_id'] for row in reproduced_test_set_prompt_type1]
for split_ids in (mp_id_train, mp_id_val, mp_id_test):
    print(len(split_ids))

# Distinct IDs per split; comparing these sizes with the list lengths printed above
# shows whether a split contains repeated IDs.
train_set = set(mp_id_train)
val_set = set(mp_id_val)
test_set = set(mp_id_test)
print("Train set size:", len(train_set))
print("Validation set size:", len(val_set))
print("Test set size:", len(test_set))

# Pairwise intersections between the splits.
train_val_overlap = train_set & val_set
train_test_overlap = train_set & test_set
val_test_overlap = val_set & test_set

print("Overlap between train and validation sets:", len(train_val_overlap))
print("Overlap between train and test sets:", len(train_test_overlap))
print("Overlap between validation and test sets:", len(val_test_overlap))


9498
500
522
Train set size: 9498
Validation set size: 500
Test set size: 522
Overlap between train and validation sets: 0
Overlap between train and test sets: 0
Overlap between validation and test sets: 0


In [9]:
import json
# Check whether the split data in the dft_dataset folder equals the data in the reproduce/data folder.
def check_split_data(data1, data2):
    """
    Compare two split datasets record by record.

    For every index the 'instruction', 'input' and 'output' fields are compared,
    in that order. The first difference is printed together with both values
    and ends the comparison.

    Args:
        data1 (list): First dataset
        data2 (list): Second dataset

    Returns:
        bool: True if the lengths and all three fields of every record match,
        False otherwise.
    """
    size1, size2 = len(data1), len(data2)
    if size1 != size2:
        print("Length mismatch:", size1, size2)
        return False

    # (dictionary key, label used in the mismatch message), in comparison order.
    compared_fields = (
        ('instruction', "Instruction"),
        ('input', "Input"),
        ('output', "Output"),
    )
    for i in range(size1):
        for field, label in compared_fields:
            left = data1[i][field]
            right = data2[i][field]
            if left != right:
                print(f"{label} mismatch at index", i)
                print("Data1:", left)
                print("Data2:", right)
                return False
    return True

def check_materials_id(data1, data2):
    """
    Check that two datasets carry the same 'material_id' at every position.

    Used to confirm that different prompt types were built from the same
    materials in the same order.

    Args:
        data1 (list): First dataset
        data2 (list): Second dataset

    Returns:
        bool: True if the lengths are equal and the IDs agree index by index.
        Otherwise False, after printing the length mismatch or the index of
        the first differing ID.
    """
    size1, size2 = len(data1), len(data2)
    if size1 != size2:
        print("Length mismatch:", size1, size2)
        return False
    # Index of the first position whose IDs differ, or None when all agree.
    first_mismatch = next(
        (i for i in range(size1) if data1[i]['material_id'] != data2[i]['material_id']),
        None,
    )
    if first_mismatch is None:
        return True
    print("Materials ID mismatch at index", first_mismatch)
    return False

def _read_json(path):
    """Open `path` in text read mode and return the parsed JSON content."""
    with open(path, "r") as handle:
        return json.load(handle)

# Prompt type 1: reproduced splits (with mp ids) and the original splits.
reproduced_training_set_prompt_type1 = _read_json("../../../reproduce/data/prompt_type_1/ec_short_train_dataset_with_mpid.json")
reproduced_test_set_prompt_type1 = _read_json("../../../reproduce/data/prompt_type_1/ec_short_test_dataset_with_mpid.json")
training_set_prompt_type1 = _read_json("../prompt_type_1/ec_short_train_dataset.json")
test_set_prompt_type1 = _read_json("../prompt_type_1/ec_short_test_dataset.json")

# Prompt type 2.
reproduced_training_set_prompt_type2 = _read_json("../../../reproduce/data/prompt_type_2/only_structure_desc_train_with_mpid.json")
reproduced_test_set_prompt_type2 = _read_json("../../../reproduce/data/prompt_type_2/only_structure_desc_test_with_mpid.json")
training_set_prompt_type2 = _read_json("../prompt_type_2/only_structure_desc_train.json")
test_set_prompt_type2 = _read_json("../prompt_type_2/only_structure_desc_test.json")

# Prompt type 3.
reproduced_training_set_prompt_type3 = _read_json("../../../reproduce/data/prompt_type_3/only_comp_desc_train_with_mpid.json")
reproduced_test_set_prompt_type3 = _read_json("../../../reproduce/data/prompt_type_3/only_comp_desc_test_with_mpid.json")
training_set_prompt_type3 = _read_json("../prompt_type_3/only_comp_desc_train.json")
test_set_prompt_type3 = _read_json("../prompt_type_3/only_comp_desc_test.json")

# Prompt type 4.
reproduced_training_set_prompt_type4 = _read_json("../../../reproduce/data/prompt_type_4/ec_desc_train_dataset_with_mpid.json")
reproduced_test_set_prompt_type4 = _read_json("../../../reproduce/data/prompt_type_4/ec_desc_test_dataset_with_mpid.json")
training_set_prompt_type4 = _read_json("../prompt_type_4/ec_desc_train_dataset.json")
test_set_prompt_type4 = _read_json("../prompt_type_4/ec_desc_test_dataset.json")

# The twenty hand-copied if/else blocks are replaced by two data-driven loops that
# print exactly the same messages in exactly the same order.

# Original and reproduced splits, keyed by split name and then by prompt type.
original_splits = {
    "train": {
        1: training_set_prompt_type1,
        2: training_set_prompt_type2,
        3: training_set_prompt_type3,
        4: training_set_prompt_type4,
    },
    "test": {
        1: test_set_prompt_type1,
        2: test_set_prompt_type2,
        3: test_set_prompt_type3,
        4: test_set_prompt_type4,
    },
}
reproduced_splits = {
    "train": {
        1: reproduced_training_set_prompt_type1,
        2: reproduced_training_set_prompt_type2,
        3: reproduced_training_set_prompt_type3,
        4: reproduced_training_set_prompt_type4,
    },
    "test": {
        1: reproduced_test_set_prompt_type1,
        2: reproduced_test_set_prompt_type2,
        3: reproduced_test_set_prompt_type3,
        4: reproduced_test_set_prompt_type4,
    },
}
# (dictionary key, label used in the printed messages)
split_labels = (("train", "Training"), ("test", "Test"))
prompt_types = (1, 2, 3, 4)

# 1) Each original split must equal its reproduced counterpart.
for prompt_type in prompt_types:
    for split_key, split_label in split_labels:
        is_same = check_split_data(
            original_splits[split_key][prompt_type],
            reproduced_splits[split_key][prompt_type],
        )
        verdict = "is the same as" if is_same else "is not the same as"
        print(
            f"{split_label} set prompt type {prompt_type} data {verdict} "
            f"the reproduced {split_label.lower()} set prompt type {prompt_type} data."
        )

# 2) Within a split, every pair of prompt types must list the same material IDs
#    in the same order. Pairs are visited as (1,2), (1,3), (1,4), (2,3), (2,4), (3,4).
prompt_type_pairs = [
    (first, second) for first in prompt_types for second in prompt_types if first < second
]
for split_key, split_label in split_labels:
    for first, second in prompt_type_pairs:
        is_same = check_materials_id(
            reproduced_splits[split_key][first],
            reproduced_splits[split_key][second],
        )
        verdict = "have" if is_same else "do not have"
        print(
            f"{split_label} set prompt type {first} and prompt type {second} data "
            f"{verdict} the same materials ID."
        )
    
# check the mp_ids in the real training and validation datasets for different prompt types
from datasets import Dataset

# Real training / validation datasets for every prompt type.
reproduced_training_set_prompt_type1_real = Dataset.from_json("../../../reproduce/data/prompt_type_1/real_train_dataset_with_mpid.json")
reproduced_val_set_prompt_type1_real = Dataset.from_json("../../../reproduce/data/prompt_type_1/real_val_dataset_with_mpid.json")
reproduced_training_set_prompt_type2_real = Dataset.from_json("../../../reproduce/data/prompt_type_2/real_train_dataset_with_mpid.json")
reproduced_val_set_prompt_type2_real = Dataset.from_json("../../../reproduce/data/prompt_type_2/real_val_dataset_with_mpid.json")
reproduced_training_set_prompt_type3_real = Dataset.from_json("../../../reproduce/data/prompt_type_3/real_train_dataset_with_mpid.json")
reproduced_val_set_prompt_type3_real = Dataset.from_json("../../../reproduce/data/prompt_type_3/real_val_dataset_with_mpid.json")
reproduced_training_set_prompt_type4_real = Dataset.from_json("../../../reproduce/data/prompt_type_4/real_train_dataset_with_mpid.json")
reproduced_val_set_prompt_type4_real = Dataset.from_json("../../../reproduce/data/prompt_type_4/real_val_dataset_with_mpid.json")

# The twelve hand-copied if/else blocks are replaced by one loop that prints exactly
# the same messages in exactly the same order. Each tuple is indexed by prompt type - 1.
real_splits = {
    "training": (
        reproduced_training_set_prompt_type1_real,
        reproduced_training_set_prompt_type2_real,
        reproduced_training_set_prompt_type3_real,
        reproduced_training_set_prompt_type4_real,
    ),
    "validation": (
        reproduced_val_set_prompt_type1_real,
        reproduced_val_set_prompt_type2_real,
        reproduced_val_set_prompt_type3_real,
        reproduced_val_set_prompt_type4_real,
    ),
}
for real_split_name, real_datasets in real_splits.items():
    # Pairs are visited as (1,2), (1,3), (1,4), (2,3), (2,4), (3,4).
    for first_idx in range(len(real_datasets)):
        for second_idx in range(first_idx + 1, len(real_datasets)):
            ids_agree = check_materials_id(real_datasets[first_idx], real_datasets[second_idx])
            verdict = "have" if ids_agree else "do not have"
            print(
                f"Real {real_split_name} set prompt type {first_idx + 1} and "
                f"prompt type {second_idx + 1} data {verdict} the same materials ID."
            )

Training set prompt type 1 data is the same as the reproduced training set prompt type 1 data.
Test set prompt type 1 data is the same as the reproduced test set prompt type 1 data.
Training set prompt type 2 data is the same as the reproduced training set prompt type 2 data.
Test set prompt type 2 data is the same as the reproduced test set prompt type 2 data.
Training set prompt type 3 data is the same as the reproduced training set prompt type 3 data.
Test set prompt type 3 data is the same as the reproduced test set prompt type 3 data.
Training set prompt type 4 data is the same as the reproduced training set prompt type 4 data.
Test set prompt type 4 data is the same as the reproduced test set prompt type 4 data.
Training set prompt type 1 and prompt type 2 data have the same materials ID.
Training set prompt type 1 and prompt type 3 data have the same materials ID.
Training set prompt type 1 and prompt type 4 data have the same materials ID.
Training set prompt type 2 and prompt 

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 9498 examples [00:00, 182170.08 examples/s]
Generating train split: 500 examples [00:00, 116495.50 examples/s]
Generating train split: 9498 examples [00:00, 514072.05 examples/s]
Generating train split: 500 examples [00:00, 151102.53 examples/s]
Generating train split: 9498 examples [00:00, 485136.87 examples/s]
Generating train split: 500 examples [00:00, 81629.83 examples/s]
Generating train split: 9498 examples [00:00, 342281.85 examples/s]
Generating train split: 500 examples [00:00, 84600.10 examples/s]


Real training set prompt type 1 and prompt type 2 data have the same materials ID.
Real training set prompt type 1 and prompt type 3 data have the same materials ID.
Real training set prompt type 1 and prompt type 4 data have the same materials ID.
Real training set prompt type 2 and prompt type 3 data have the same materials ID.
Real training set prompt type 2 and prompt type 4 data have the same materials ID.
Real training set prompt type 3 and prompt type 4 data have the same materials ID.
Real validation set prompt type 1 and prompt type 2 data have the same materials ID.
Real validation set prompt type 1 and prompt type 3 data have the same materials ID.
Real validation set prompt type 1 and prompt type 4 data have the same materials ID.
Real validation set prompt type 2 and prompt type 3 data have the same materials ID.
Real validation set prompt type 2 and prompt type 4 data have the same materials ID.
Real validation set prompt type 3 and prompt type 4 data have the same materi

In [10]:
import json
from datasets import Dataset
# Compare the split datasets of matten with those of the prompt methods
# (run after the matten datasets were revised).
with open("../../../reproduce/data/matten/train_dataset_with_mpid.json", "r") as matten_train_fh:
    matten_train_data = json.load(matten_train_fh)
with open("../../../reproduce/data/matten/validation_dataset_with_mpid.json", "r") as matten_val_fh:
    matten_val_data = json.load(matten_val_fh)
with open("../../../reproduce/data/matten/test_dataset_with_mpid.json", "r") as matten_test_fh:
    matten_test_data = json.load(matten_test_fh)

# Each matten file stores its material IDs as the values of the 'mpid' mapping.
matten_train_data_mpids = [*matten_train_data['mpid'].values()]
matten_val_data_mpids = [*matten_val_data['mpid'].values()]
# NOTE: unlike the two lines above, this rebinds `matten_test_data` itself to the ID list.
matten_test_data = [*matten_test_data['mpid'].values()]

# Prompt-type-1 splits used as the reference.
training_set = Dataset.from_json("../../../reproduce/data/prompt_type_1/real_train_dataset_with_mpid.json")
validation_set = Dataset.from_json("../../../reproduce/data/prompt_type_1/real_val_dataset_with_mpid.json")
with open('../../../reproduce/data/prompt_type_1/ec_short_test_dataset_with_mpid.json', "r") as prompt_test_fh:
    test_set = json.load(prompt_test_fh)

training_set_mpids = [row['material_id'] for row in training_set]
validation_set_mpids = [row['material_id'] for row in validation_set]
test_set_mpids = [row['material_id'] for row in test_set]

def check_materials_id(data1, data2):
    """
    Check that two ID lists have equal length and contain the same IDs, in any order.

    Membership is tested against sets, so the check is linear rather than
    quadratic, and it is performed in both directions: the one-directional
    check used before returned True when `data1` contained duplicates and
    `data2` held an ID that `data1` lacked. How often an ID is repeated is
    not compared.

    Args:
        data1 (list): First list of (hashable) material IDs.
        data2 (list): Second list of (hashable) material IDs.

    Returns:
        bool: True if the lengths match and each list's IDs all occur in the
        other. Otherwise False, after printing the first problem found.
    """
    if len(data1) != len(data2):
        print("Length mismatch:", len(data1), len(data2))
        return False
    ids_in_data2 = set(data2)
    for i, material_id in enumerate(data1):
        if material_id not in ids_in_data2:
            print("Materials ID mismatch at index", i)
            print("Data1:", material_id)
            return False
    ids_in_data1 = set(data1)
    for i, material_id in enumerate(data2):
        if material_id not in ids_in_data1:
            print("Materials ID mismatch at index", i)
            print("Data2:", material_id)
            return False
    return True
# Print, split by split, whether the matten data and the prompt data hold the same material IDs.
for split_label, matten_ids, prompt_ids in (
    ("Training", matten_train_data_mpids, training_set_mpids),
    ("Validation", matten_val_data_mpids, validation_set_mpids),
    ("Test", matten_test_data, test_set_mpids),
):
    verdict = "has" if check_materials_id(matten_ids, prompt_ids) else "does not have"
    print(
        f"{split_label} set data {verdict} the same materials ID "
        f"as matten {split_label.lower()} set data."
    )

Training set data has the same materials ID as matten training set data.
Validation set data has the same materials ID as matten validation set data.
Test set data has the same materials ID as matten test set data.


In [11]:
# Compare the split datasets of random-forest with those of the prompt methods
# (run after the random forest datasets were revised).
with open("../../../reproduce/data/random_forest/training_data.json", "r") as rf_train_fh:
    rf_train_data = json.load(rf_train_fh)
with open("../../../reproduce/data/random_forest/validation_data.json", "r") as rf_val_fh:
    rf_val_data = json.load(rf_val_fh)
with open("../../../reproduce/data/random_forest/test_data.json", "r") as rf_test_fh:
    rf_test_data = json.load(rf_test_fh)
rf_train_data_mpids = [record['material_id'] for record in rf_train_data]
rf_val_data_mpids = [record['material_id'] for record in rf_val_data]
rf_test_data_mpids = [record['material_id'] for record in rf_test_data]

# Prompt-type-1 splits used as the reference.
training_set = Dataset.from_json("../../../reproduce/data/prompt_type_1/real_train_dataset_with_mpid.json")
validation_set = Dataset.from_json("../../../reproduce/data/prompt_type_1/real_val_dataset_with_mpid.json")
with open('../../../reproduce/data/prompt_type_1/ec_short_test_dataset_with_mpid.json', "r") as prompt_test_fh:
    test_set = json.load(prompt_test_fh)

training_set_mpids = [row['material_id'] for row in training_set]
validation_set_mpids = [row['material_id'] for row in validation_set]
test_set_mpids = [row['material_id'] for row in test_set]

def check_materials_id(data1, data2):
    """
    Check that two ID lists have equal length and contain the same IDs, in any order.

    Membership is tested against sets, so the check is linear rather than
    quadratic, and it is performed in both directions: the one-directional
    check used before returned True when `data1` contained duplicates and
    `data2` held an ID that `data1` lacked. How often an ID is repeated is
    not compared.

    Args:
        data1 (list): First list of (hashable) material IDs.
        data2 (list): Second list of (hashable) material IDs.

    Returns:
        bool: True if the lengths match and each list's IDs all occur in the
        other. Otherwise False, after printing the first problem found.
    """
    if len(data1) != len(data2):
        print("Length mismatch:", len(data1), len(data2))
        return False
    ids_in_data2 = set(data2)
    for i, material_id in enumerate(data1):
        if material_id not in ids_in_data2:
            print("Materials ID mismatch at index", i)
            print("Data1:", material_id)
            return False
    ids_in_data1 = set(data1)
    for i, material_id in enumerate(data2):
        if material_id not in ids_in_data1:
            print("Materials ID mismatch at index", i)
            print("Data2:", material_id)
            return False
    return True

# Print, split by split, whether the random forest data and the prompt data hold the same material IDs.
for split_label, rf_ids, prompt_ids in (
    ("Training", rf_train_data_mpids, training_set_mpids),
    ("Validation", rf_val_data_mpids, validation_set_mpids),
    ("Test", rf_test_data_mpids, test_set_mpids),
):
    verdict = "has" if check_materials_id(rf_ids, prompt_ids) else "does not have"
    print(
        f"{split_label} set data {verdict} the same materials ID "
        f"as random forest {split_label.lower()} set data."
    )

Training set data has the same materials ID as random forest training set data.
Validation set data has the same materials ID as random forest validation set data.
Test set data has the same materials ID as random forest test set data.


In [12]:
# check reproduced mixed_dataset
import json
# Reproduced mixed dataset and the original it is compared against.
with open("../../../reproduce/data/mixed_dataset/combined_data.json", "r") as reproduced_fh:
    reproduced_mixed_train_data = json.load(reproduced_fh)
with open("../../mixed_dataset/combined_data.json", "r") as original_fh:
    mixed_train_data = json.load(original_fh)
def check_content(data1, data2):
    """
    Check that two datasets agree on every key and value, position by position.

    All mismatches are reported; the comparison does not stop at the first
    one. When the key sets of two items differ, their values are not compared.

    Parameters:
    data1 -- First dataset (list of dictionaries)
    data2 -- Second dataset (list of dictionaries)

    Returns:
    bool -- True if all data matches, False otherwise (including a length mismatch)
    """
    size1, size2 = len(data1), len(data2)
    if size1 != size2:
        print(f"Data length mismatch: {size1} vs {size2}")
        return False

    mismatch_found = False
    for i, pair in enumerate(zip(data1, data2)):
        item1, item2 = pair
        keys1 = set(item1.keys())
        keys2 = set(item2.keys())
        if keys1 == keys2:
            # Same keys on both sides: compare the values key by key.
            for key in item1.keys():
                if item1[key] != item2[key]:
                    print(f"Value mismatch at index {i} for key '{key}'")
                    mismatch_found = True
        else:
            print(f"Key set mismatch at index {i}")
            print(f"Keys in data1 but not in data2: {keys1 - keys2}")
            print(f"Keys in data2 but not in data1: {keys2 - keys1}")
            mismatch_found = True

    return not mismatch_found

# Compare the reproduced mixed dataset with the original one and print the verdict.
print(
    "All data matches perfectly!"
    if check_content(reproduced_mixed_train_data, mixed_train_data)
    else "Mismatches found."
)
    

All data matches perfectly!
