In [15]:
import sys

if "google.colab" in sys.modules:
    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')

    # Install required packages
    !pip install ase torch_geometric
    import torch
    from torch_geometric.data import Data, DataLoader

    dataset = "/content/drive/My Drive/Dataset"
else:
    dataset = "Dataset"

In [16]:
import pandas as pd
import numpy as np

In [17]:
# Load the dataset frame from the pickle stored under `dataset`.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
pickle_path = f"{dataset}/absorption_mp_data.pkl"
data_df = pd.read_pickle(pickle_path)
data_df.head()

Unnamed: 0,id,formula,structure,energies,absorption_coefficient,imag_dielectric,real_dielectric,energy_max,bandgap
0,mp-546266,DyBi2IO4,"(Atom('Dy', [np.float64(0.0), np.float64(0.0),...","[0.0, 0.0277, 0.0554, 0.0831, 0.1109, 0.1386, ...","[0.0, 0.8362304735329499, 3.309178082718349, 7...","[0.0, 0.0015666666666666665, 0.0031, 0.0046666...","[6.9163, 6.916533333333334, 6.9172, 6.9182, 6....",55.4328,1.3818
1,mp-9583,K2ZnF4,"(Atom('K', [np.float64(1.2531985199117757), np...","[0.0, 0.0399, 0.0798, 0.1197, 0.1596, 0.1995, ...","[0.0, 0.1362578822994739, 0.4541860898852432, ...","[0.0, 0.0001, 0.00016666666666666666, 0.0002, ...","[2.2021333333333333, 2.202166666666667, 2.2022...",79.7804,4.3866
2,mp-22988,CsGeCl3,"(Atom('Cs', [np.float64(7.78333021211009), np....","[0.0, 0.029, 0.058, 0.087, 0.116, 0.1449, 0.17...","[0.0, 0.515935972729374, 2.0145311056971553, 4...","[0.0, 0.0007, 0.0013666666666666669, 0.0020333...","[3.975733333333333, 3.9758333333333336, 3.9761...",57.9759,2.178
3,mp-861502,AcFeO3,"(Atom('Ac', [np.float64(0.0), np.float64(0.0),...","[0.0, 0.0587, 0.1174, 0.176, 0.2347, 0.2934, 0...","[0.0, 11295.053197761259, 44864.495977063896, ...","[0.0, 30.1914, 49.9196, 37.98413333333333, 21....","[67.54320000000001, 59.6215, 29.6067, 1.281, -...",117.3625,0.0
4,mp-1025029,PrHSe,"(Atom('Pr', [np.float64(-2.0265362), np.float6...","[0.0, 0.0365, 0.073, 0.1095, 0.1459, 0.1824, 0...","[0.0, 1.6849892209331498, 6.697792903781345, 1...","[0.0, 0.0026999999999999997, 0.005366666666666...","[8.784633333333334, 8.7851, 8.786566666666667,...",72.9718,1.9456


In [18]:
def get_mean(row):
    """Attach the unit-cell volume of the row's structure and return the row.

    Despite the name, no averages are computed: the mean-based features
    (absorption coefficient, energies, imaginary/real dielectric) that this
    function used to add are disabled.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row["structure"]``, an object with a ``get_volume()``
        method.

    Returns
    -------
    The same ``row`` object, with ``row["unit_cell_volume"]`` set.
    """
    structure = row["structure"]
    row["unit_cell_volume"] = structure.get_volume()
    return row

# Run get_mean over every row (axis=1) so each row gains "unit_cell_volume".
data_df = data_df.apply(get_mean, axis=1)

In [19]:
# Find correlations
# NOTE: this exploration is currently disabled. The code below is wrapped in a
# triple-quoted string, so nothing is computed; the cell only evaluates to
# (and echoes) that string. Remove the quotes to run it.
'''
data_copy = data_df.copy()
data_copy= data_copy.drop(["id", "formula", "structure", "absorption_coefficient", "energies", "imag_dielectric", "real_dielectric"], axis=1)
sample_corr = data_copy.corr()
sample_corr'''

'\ndata_copy = data_df.copy()\ndata_copy= data_copy.drop(["id", "formula", "structure", "absorption_coefficient", "energies", "imag_dielectric", "real_dielectric"], axis=1)\nsample_corr = data_copy.corr()\nsample_corr'

### Split the data into train, validation, and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, then cut that hold-out in half to obtain the
# validation and test sets (roughly 80/10/10 overall). Both splits use the
# same fixed seed so they are reproducible.
train, the_test = train_test_split(
    data_df,
    test_size=0.2,
    random_state=42,
)
val, testing = train_test_split(
    the_test,
    test_size=0.5,
    random_state=42,
)

In [20]:
# Entries of the "structure" column are already Atoms objects (presumably
# ase.Atoms — see the printed repr), so no conversion is needed.
# The row labelled 1 serves as the worked example below.
structure_column = data_df["structure"]
a_structure = structure_column[1]
print(a_structure)

Atoms(symbols='K2ZnF4', pbc=True, cell=[[3.87855986, 0.0, -1.1469436], [-0.33916743, 3.86370258, -1.1469436], [-0.00357393, -0.00390187, 7.11934679]])


In [21]:
# What can we do with the whole structure?
# Print a few whole-structure queries, in this order: pairwise distances,
# chemical formula, chemical symbols, positions, cell volume.
# Other calls tried during exploration and left switched off: get_cell,
# get_atomic_numbers, get_distance(a0=0, a1=1), get_distances(0, 2),
# get_global_number_of_atoms, get_initial_charges, get_kinetic_energy,
# get_pbc, get_masses, get_momenta, get_moments_of_inertia,
# get_angular_momentum, get_initial_magnetic_moments, get_tags,
# get_angle(0, 1, 2).
for getter_name in (
    "get_all_distances",
    "get_chemical_formula",
    "get_chemical_symbols",
    "get_positions",
    "get_volume",
):
    print(getattr(a_structure, getter_name)())

[[0.         3.95563414 4.62617937 5.04888136 5.04888143 2.86302
  2.60031286]
 [3.95563414 0.         3.4299493  2.77035814 2.77035784 2.60031286
  2.86302   ]
 [4.62617937 3.4299493  0.         2.02229487 2.02229522 5.3266708
  2.02586651]
 [5.04888136 2.77035814 2.02229487 0.         2.85995722 4.92785317
  2.86248483]
 [5.04888143 2.77035784 2.02229522 2.85995722 0.         4.92785296
  2.86248502]
 [2.86302    2.60031286 5.3266708  4.92785317 4.92785296 0.
  3.77756265]
 [2.60031286 2.86302    2.02586651 2.86248483 2.86248502 3.77756265
  0.        ]]
F4K2Zn
['K', 'K', 'Zn', 'F', 'F', 'F', 'F']
[[ 1.25319852  1.36802722  4.23786864]
 [ 2.28261998  2.49177349  0.58759095]
 [ 0.          0.          0.        ]
 [ 1.93927993  0.         -0.5734718 ]
 [-0.16958371  1.93185129 -0.5734718 ]
 [ 2.98702846  3.26072587  2.96963795]
 [ 0.54879004  0.59907484  1.85582164]]
106.65298266178146


## Get nodes, edges, global_attributes, target_attributes

In [9]:
# Get nodes of structure
from ase.data import atomic_masses, covalent_radii
#from ase.data.polarizabilities import atomic_polarizability 

# One feature row per atom: [atomic number, atomic mass, covalent radius].
# Mass and radius are looked up by atomic number in the ase.data tables.
# (A dipole-polarizability feature was considered but is not included.)
all_nodes = [
    [atom.number, atomic_masses[atom.number], covalent_radii[atom.number]]
    for atom in a_structure
]

print(all_nodes)

[[np.int64(19), np.float64(39.0983), np.float64(2.03)], [np.int64(19), np.float64(39.0983), np.float64(2.03)], [np.int64(30), np.float64(65.38), np.float64(1.22)], [np.int64(9), np.float64(18.998403163), np.float64(0.57)], [np.int64(9), np.float64(18.998403163), np.float64(0.57)], [np.int64(9), np.float64(18.998403163), np.float64(0.57)], [np.int64(9), np.float64(18.998403163), np.float64(0.57)]]


In [None]:
# Get total number of atoms in the structure
num_atoms = a_structure.get_global_number_of_atoms()

# Fully connected directed graph without self-loops: every ordered pair
# (source, target) with source != target, grouped by source.
atom_pairs = [
    (source, target)
    for source in range(num_atoms)
    for target in range(num_atoms)
    if source != target
]
from_node = [source for source, _ in atom_pairs]
to_node = [target for _, target in atom_pairs]

# Row 0 holds the source indices, row 1 the matching target indices.
edges = [from_node, to_node]
print(edges)

[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6], [1, 2, 3, 4, 5, 6, 0, 2, 3, 4, 5, 6, 0, 1, 3, 4, 5, 6, 0, 1, 2, 4, 5, 6, 0, 1, 2, 3, 5, 6, 0, 1, 2, 3, 4, 6, 0, 1, 2, 3, 4, 5]]


In [15]:
# Get global attributes
# The only graph-level feature is the unit-cell volume of a single row.
# NOTE(review): a_structure was read from row label 1, while this reads
# row 0 — confirm that the mismatch is intentional.
an_index = 0
volume_column = data_df["unit_cell_volume"]
global_att = [volume_column[an_index]]
print(global_att)

[np.float64(151.13454136559216)]


In [None]:
# Get the target values
# NOTE(review): this selects the entire "absorption_coefficient" column, not
# the single entry at an_index — confirm which one is intended.
targ = data_df.loc[:, "absorption_coefficient"]
print(targ)

[0.00000000e+00 8.36230474e-01 3.30917808e+00 ... 1.81631330e+03
 1.81701042e+03 1.81774592e+03]


## Now as a function that will iterate through the whole dataset

In [None]:
from ase.data import atomic_masses, covalent_radii
def _fully_connected_edges(num_atoms):
    """Return ``[sources, targets]`` for every ordered atom pair (i, j), i != j."""
    pairs = [(i, j) for i in range(num_atoms) for j in range(num_atoms) if i != j]
    return [[i for i, _ in pairs], [j for _, j in pairs]]


def get_graphical(row):
    """Convert one dataset row into a graph ``Data`` object.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row["structure"]`` (an iterable of atoms exposing
        ``.number``), ``row["unit_cell_volume"]`` and
        ``row["absorption_coefficient"]``.

    Returns
    -------
    Data
        ``x``: one ``[atomic number, atomic mass, covalent radius]`` row per
        atom; ``edge_index``: every ordered atom pair (fully connected, no
        self-loops) as a ``[sources, targets]`` integer tensor;
        ``u``: ``[unit_cell_volume]``; ``y``: ``[absorption_coefficient]``.
    """
    # Node features: mass and covalent radius are looked up by atomic number.
    all_nodes = [
        [atom.number, atomic_masses[atom.number], covalent_radii[atom.number]]
        for atom in row["structure"]
    ]

    # Edges: counting the feature rows keeps the indices aligned with `x`.
    edges = _fully_connected_edges(len(all_nodes))

    # Graph-level attribute and regression target.
    global_att = [row["unit_cell_volume"]]
    target_att = [row["absorption_coefficient"]]

    # Represent all these features as tensor objects.
    # edge_index must be an integer tensor: the previous `dtyype= long` was a
    # misspelled keyword and an undefined name, so every call raised.
    # float64 is what Python's `float` maps to when given as a torch dtype.
    the_data = Data(
        x=torch.tensor(all_nodes, dtype=torch.float64),
        edge_index=torch.tensor(edges, dtype=torch.long),
        u=torch.tensor(global_att, dtype=torch.float64),
        y=torch.tensor(target_att, dtype=torch.float64),
    )
    return the_data
    # return [all_nodes, edges, global_att, target_att]

# Build one graph object per dataset row (axis=1 passes rows to the function),
# then print the first one as a quick sanity check.
sample = data_df.apply(get_graphical, axis=1)
print(sample[0])

## Save the data 

In [None]:
# Persist the per-row graph objects under the dataset directory.
graphs_path = f"{dataset}/the_data.pt"
torch.save(sample, graphs_path)

## Split the data