In [1]:
import json
from pymatgen.core.composition import Composition
from torch_geometric.data import Data
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

with open('hea_full.txt', 'r') as f:
    data = json.loads(f.read())

In [2]:
bulk = [dp['Bulk modulus'] for dp in data]
sws = [dp['Wigner-Seitz radius'] for dp in data]
youngs = [dp['Youngs modulus'] for dp in data if 'Youngs modulus' in dp]

In [3]:
elem_properties = {
    'Element': ['Al', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zr', 'Nb', 'Mo', 'Hf', 'Ta', 'W'],
    'Group': [13, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 4, 5, 6],
    'Period': [3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 6, 6, 6],
    'Mass': [26.982, 47.867, 50.942, 51.996, 54.938, 55.845, 58.933, 58.693, 63.546, 91.224, 92.906, 95.94, 178.49, 180.95, 183.84],
    'Radius': [143.2, 144.8, 134, 128, 127, 126, 125.3, 124.6, 127.8, 159, 146, 139, 156.4, 146, 139],
    'Electronegativity': [1.61, 1.54, 1.63, 1.66, 1.55, 1.83, 1.88, 1.91, 1.9, 1.33, 1.6, 2.16, 1.3, 1.5, 1.7],
    'IonizationEnergy': [5.986, 6.828, 6.746, 6.767, 7.434, 7.902, 7.881, 7.640, 7.726, 6.634, 6.759, 7.092, 6.825, 7.550, 7.864],
    'ElectronAffinity': [0.441, 0.079, 0.525, 0.666, 0, 0.163, 0.661, 1.156, 1.228, 0.426, 0.893, 0.746, 0.322, 0.322, 0.815],
    'AtomicVolume': [10.00, 10.64, 8.35, 7.23, 7.39, 7.09, 6.70, 6.59, 7.11, 14.02, 10.84, 9.38, 13.60, 10.87, 9.47]
}

elem_properties = pd.DataFrame(elem_properties).set_index('Element')

In [4]:
elem_properties
group_enc = OneHotEncoder().fit(elem_properties['Group'].values.reshape(-1, 1))
period_enc = OneHotEncoder().fit(elem_properties['Period'].values.reshape(-1, 1))

elem_prop_means = elem_properties.mean()
elem_prop_stds = elem_properties.std()

In [5]:
elem_encodings = {el: np.append(np.concatenate([group_enc.transform(elem_properties.loc[el, 'Group'].reshape(-1, 1)).toarray()[0],
                       period_enc.transform(elem_properties.loc[el, 'Period'].reshape(-1, 1)).toarray()[0]]),
                       [(elem_properties.loc[el, prop]-elem_prop_means[prop])/elem_prop_stds[prop] for prop in elem_properties.columns[2:]])
                       for el in elem_properties.index}

In [6]:
def get_node_features(elems):
    return torch.tensor([elem_encodings[el] for el in elems], dtype=torch.float)

# Create graphs
def make_graphs(elements, fractions):
    # Take element lists and fractions, convert to a list of graphs
    graphs = []
    for i, center in enumerate(elements):
        # Edges are represented as a list of pairs (source, target)
        edges = [(center, node) for node in elements if node != center]
        # Convert edges to torch tensor
        edge_index = torch.tensor([[elements.index(source), elements.index(target)] for source, target in edges], dtype=torch.long).t().contiguous()
        
        edge_attr = torch.tensor([fractions[elements.index(target)] for source, target in edges], dtype=torch.float).view(-1, 1)
        
        x = get_node_features(elements)
        
        # Create the graph data object
        graph = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=fractions[i])
        
        # Append to the list of graphs
        graphs.append(graph)

    return graphs

data_bulk = []
data_sws = []
data_youngs = []

ds_data_bulk = []
ds_data_sws = []
ds_data_youngs = []

for i, dp in enumerate(data):
    bm = bulk[i]
    if bm < 0 or bm > 320:
        continue
    comp = Composition(dp['Composition'])
    elements = [el.symbol for el in comp.elements]
    fractions = [comp[el] for el in elements]

    graphs = make_graphs(elements, fractions)
    data_bulk.append((graphs, bm))
    data_sws.append((graphs, sws[i]))

    x_list = np.stack([elem_encodings[el.symbol]*comp[el] for el in comp.elements]).astype(np.float32)
    ds_data_bulk.append((x_list, bm))
    ds_data_sws.append((x_list, sws[i]))

    if 'Youngs modulus' in dp:
        youngs = dp['Youngs modulus']
        if youngs < 0 or youngs > 900:
            continue
        data_youngs.append((graphs, youngs))
        ds_data_youngs.append((x_list, youngs))


  return torch.tensor([elem_encodings[el] for el in elems], dtype=torch.float)


In [7]:
import pickle
with open('hea_bulk.pkl', 'wb') as f:
    pickle.dump(data_bulk, f)

with open('hea_sws.pkl', 'wb') as f:
    pickle.dump(data_sws, f)

with open('hea_youngs.pkl', 'wb') as f:
    pickle.dump(data_youngs, f)

In [8]:
with open('ds_hea_bulk.pkl', 'wb') as f:
    pickle.dump(ds_data_bulk, f)

with open('ds_hea_sws.pkl', 'wb') as f:
    pickle.dump(ds_data_sws, f)

with open('ds_hea_youngs.pkl', 'wb') as f:
    pickle.dump(ds_data_youngs, f)