In [27]:
# from matminer.featurizers.composition.composite import ElementProperty
from pymatgen.core.composition import Composition
import matminer.featurizers.composition as cf
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# Build the element-property featurizer from matminer's "magpie" preset.
# `impute_nan=True` is forwarded to the preset constructor; presumably it makes
# matminer fill in missing elemental data rather than emit NaN features
# (confirm against the matminer documentation).
ep = cf.ElementProperty.from_preset(
    preset_name="magpie",
    impute_nan=True,
)

In [28]:
import json
from datasets import Dataset
# Train and validation splits are loaded as `datasets.Dataset` objects.
training_set = Dataset.from_json("../prompt_type_4/real_train_dataset_with_mpid.json")
validation_set = Dataset.from_json("../prompt_type_4/real_val_dataset_with_mpid.json")

# The test split is parsed with the standard `json` module instead.
# Decode explicitly as UTF-8 so that parsing does not depend on the platform's
# default locale encoding.
with open('../prompt_type_4/ec_desc_test_dataset_with_mpid.json', encoding="utf-8") as f:
    test_set = json.load(f)

In [29]:
# Report the number of records in each split: train, validation, test.
for dataset_split in (training_set, validation_set, test_set):
    print(len(dataset_split))

9498
500
522


In [30]:
import re

def extract_material_description(text):
    """Return the text found between "The material" and the next "with".

    The search is non-greedy, so the capture ends at the first "with" that
    follows "The material". Surrounding whitespace is stripped from the result.

    Args:
        text: Prompt string to search.

    Returns:
        The stripped captured text, or the sentinel string
        "No description found" when the pattern does not occur (a diagnostic
        line is printed in that case).
    """
    found = re.search(r"The material(.*?)with", text)
    if found is None:
        # Report the offending prompt, then hand back the sentinel value.
        print(f"Pattern not found in text: {text}")
        return "No description found"
    return found.group(1).strip()

# Pull the material description out of every record's 'input' prompt,
# one list per split.
train_desc = [
    extract_material_description(record['input'])
    for record in training_set
]
val_desc = [
    extract_material_description(record['input'])
    for record in validation_set
]
test_desc = [
    extract_material_description(record['input'])
    for record in test_set
]


In [31]:
# Turn each extracted description string into a pymatgen Composition.
training_compostion = list(map(Composition, train_desc))
val_compostion = list(map(Composition, val_desc))
test_compostion = list(map(Composition, test_desc))

# Compute the element-property feature vector for every composition.
training_features = ep.featurize_many(training_compostion)
validation_features = ep.featurize_many(val_compostion)
test_features = ep.featurize_many(test_compostion)

ElementProperty: 100%|██████████| 9498/9498 [05:47<00:00, 27.30it/s]
ElementProperty: 100%|██████████| 500/500 [00:13<00:00, 36.38it/s] 
ElementProperty: 100%|██████████| 522/522 [00:16<00:00, 32.34it/s]


In [32]:
import numpy as np

# Find indices where any element in the feature list is NaN
def find_nan_indices(features_list):
    """Return the positions of feature rows that contain at least one NaN.

    Args:
        features_list: Iterable of numeric rows; each row is passed to
            `np.isnan`.

    Returns:
        List of integer positions, in ascending order, of the rows for which
        `np.isnan(row).any()` is true. Empty when no row contains NaN.
    """
    return [
        row_idx
        for row_idx, row in enumerate(features_list)
        if np.isnan(row).any()
    ]

# Report, per split, which feature rows (if any) still contain NaN values.
nan_report = (
    ("Indices with NaN in training features:", training_features),
    ("Indices with NaN in validation features:", validation_features),
    ("Indices with NaN in test features:", test_features),
)
for report_label, report_features in nan_report:
    print(report_label, find_nan_indices(report_features))

Indices with NaN in training features: []
Indices with NaN in validation features: []
Indices with NaN in test features: []


In [33]:
import numpy as np
import ast


def _parse_elastic_tensor(raw_output):
    """Parse a stringified tensor literal into a flat NumPy array.

    `ast.literal_eval` is used instead of `eval` so that strings read from
    the dataset files can only produce Python literals (numbers, lists,
    tuples, ...) and can never execute arbitrary code.

    Args:
        raw_output: String holding a Python literal, e.g. a nested list of
            numbers.

    Returns:
        1-D `numpy.ndarray` with the flattened values.

    Raises:
        ValueError, SyntaxError: If `raw_output` is not a valid Python literal.
    """
    return np.array(ast.literal_eval(raw_output)).flatten()


# Flattened target tensor for every record, one list per split.
training_elastic_tensor = [_parse_elastic_tensor(record['output']) for record in training_set]
val_elastic_tensor = [_parse_elastic_tensor(record['output']) for record in validation_set]
test_elastic_tensor = [_parse_elastic_tensor(record['output']) for record in test_set]

In [34]:
# Report how many flattened target tensors were parsed for each split.
print(len(training_elastic_tensor))
print(len(val_elastic_tensor))
# Previously printed len(test_features), which does not check the tensors.
print(len(test_elastic_tensor))

# Features and targets are paired with zip() when the splits are saved, and
# zip() silently truncates to the shorter input, so fail loudly on a mismatch.
assert len(training_features) == len(training_elastic_tensor)
assert len(validation_features) == len(val_elastic_tensor)
assert len(test_features) == len(test_elastic_tensor)

9498
500
522


In [35]:
import json

def _save_split(path, entries, feature_rows, tensors):
    """Write one split's material ids, features and targets to a JSON file.

    Each output record has three keys: "material_id" (copied from the entry),
    "features" (the `str()` of the feature row) and "elastic_tensor" (the
    `str()` of the tensor converted with `.tolist()`).

    NOTE(review): because features are stored via `str()`, their textual form
    depends on how the element types print themselves; confirm that downstream
    readers can parse it.

    Args:
        path: Destination file path; overwritten if it exists.
        entries: Iterable of mappings that provide a "material_id" key.
        feature_rows: Iterable of feature vectors, aligned with `entries`.
        tensors: Iterable of arrays exposing `.tolist()`, aligned with `entries`.

    Returns:
        The list of record dicts that was written.
    """
    records = [
        {
            "material_id": entry["material_id"],
            "features": str(features),
            "elastic_tensor": str(tensor.tolist()),
        }
        for entry, features, tensor in zip(entries, feature_rows, tensors)
    ]
    with open(path, "w") as f:
        json.dump(records, f)
    return records


# Persist every split; the returned record lists keep their original names.
training_data = _save_split(
    "training_data.json", training_set, training_features, training_elastic_tensor
)
validation_data = _save_split(
    "validation_data.json", validation_set, validation_features, val_elastic_tensor
)
test_data = _save_split(
    "test_data.json", test_set, test_features, test_elastic_tensor
)
