In [None]:
import sys

if "google.colab" in sys.modules:
    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')

    # Install required packages
    !pip install ase torch_geometric
    import torch
    from torch_geometric.data import Data, DataLoader

    dataset = "/content/drive/My Drive/Dataset"
else:
    dataset = "Dataset"

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the pickled absorption dataset from the configured dataset directory
# and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
absorption_pickle_path = f"{dataset}/absorption_mp_data.pkl"
data_df = pd.read_pickle(absorption_pickle_path)
data_df.head()

In [None]:
def get_mean(row):
    """Add the unit-cell volume of the row's structure to the row.

    Despite the name, no mean values are computed at the moment; the
    per-spectrum averages are disabled (see the comment in the body).

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide a "structure" entry that exposes ``get_volume()``.

    Returns
    -------
    The same ``row`` object, with "unit_cell_volume" set.
    """
    # Disabled feature columns, kept for reference:
    #   row["abs_coeff_mean"] = np.average(row["absorption_coefficient"])
    #   row["energies_mean"] = np.average(row["energies"])
    #   row["imag_dielectric_mean"] = np.average(row["imag_dielectric"])
    #   row["real_dielectric_mean"] = np.average(row["real_dielectric"])
    structure = row["structure"]
    row["unit_cell_volume"] = structure.get_volume()
    return row

# Row-wise pass over the frame: get_mean adds the "unit_cell_volume" column.
data_df = data_df.apply(get_mean, axis=1)


In [None]:
# Find correlations (currently disabled).
# Kept as real comments instead of a triple-quoted string: a bare string as the
# last expression of a cell is echoed back as the cell's output.
#
# data_copy = data_df.copy()
# data_copy = data_copy.drop(["id", "formula", "structure", "absorption_coefficient", "energies", "imag_dielectric", "real_dielectric"], axis=1)
# sample_corr = data_copy.corr()
# sample_corr

### Split the data into train, validation, and test sets

In [None]:
from sklearn.model_selection import train_test_split

# 80/10/10 split: hold out 20 % of the rows, then halve that hold-out into the
# validation and test sets. The fixed seed keeps both splits reproducible.
split_seed = 42
train, the_test = train_test_split(data_df, test_size=0.2, random_state=split_seed)
val, testing = train_test_split(the_test, random_state=split_seed, test_size=0.5)

In [None]:
# Pick the entry with index label 1 from the "structure" column to explore.
# NOTE(review): presumably each entry is already an ASE Atoms object -- confirm
# against the dataset.
structure_column = data_df["structure"]
a_structure = structure_column[1]
print(a_structure)

In [None]:
# What can we do with the whole structure?
# Print the result of a handful of its getters, in this order.
inspected_getters = (
    "get_all_distances",
    "get_chemical_formula",
    "get_chemical_symbols",
    "get_positions",
    "get_volume",
)
for getter_name in inspected_getters:
    print(getattr(a_structure, getter_name)())

# Other getters tried during exploration (not printed):
#   a_structure.get_cell()
#   a_structure.get_atomic_numbers()
#   a_structure.get_distance(a0=0, a1=1)
#   a_structure.get_distances(0, 2)
#   a_structure.get_global_number_of_atoms()
#   a_structure.get_initial_charges()
#   a_structure.get_kinetic_energy()
#   a_structure.get_pbc()
#   a_structure.get_masses()              # atomic mass of each atom in the structure
#   a_structure.get_momenta()             # momenta of each atom in the structure
#   a_structure.get_moments_of_inertia()  # moments of inertia of the structure
#   a_structure.get_angular_momentum()
#   a_structure.get_initial_magnetic_moments()
#   a_structure.get_tags()
#   a_structure.get_angle(0, 1, 2)

## Get nodes, edges, global_attributes, target_attributes

In [None]:
# Get nodes of structure
from ase.data import atomic_masses, covalent_radii
#from ase.data.polarizabilities import atomic_polarizability 

# One feature row per atom of the example structure:
#   [atomic number, atomic mass, covalent radius]
# Mass and radius are looked up by atomic number in the ASE tables.
all_nodes = [
    [atom.number, atomic_masses[atom.number], covalent_radii[atom.number]]
    for atom in a_structure
]
# A dipole-polarizability feature is not included yet (it would need a separate
# module or a custom dictionary):
#   polarizability = atomic_polarizability.get(Z, 0.0)

print(all_nodes)

In [None]:
# Get total number of atoms in the structure
num_atoms = a_structure.get_global_number_of_atoms()

# Fully connected, directed edge list: every ordered pair (source, target)
# with source != target, in row-major order.
atom_pairs = [
    (source, target)
    for source in range(num_atoms)
    for target in range(num_atoms)
    if source != target
]
from_node = [source for source, _target in atom_pairs]
to_node = [target for _source, target in atom_pairs]

# Two parallel lists: edges[0][k] -> edges[1][k] is the k-th edge.
edges = [from_node, to_node]
print(edges)


In [None]:
# Get global attributes
# Use the same row as `a_structure` (index label 1) so the example nodes,
# edges, global attributes and target all describe one material; the previous
# value 0 mixed two different rows.
an_index = 1
global_att = [data_df["unit_cell_volume"][an_index]]
print(global_att)

In [None]:
# Get the target values: the "absorption_coefficient" entry of the example row.
absorption_column = data_df["absorption_coefficient"]
targ = absorption_column[an_index]
print(targ)

## Now as a function that will iterate through the whole dataset

In [None]:
from ase.data import atomic_masses, covalent_radii
def get_graphical(row):
    """Convert one dataset row into a fully connected graph ``Data`` object.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide:
        - "structure": iterable of atoms exposing ``.number`` and offering
          ``get_global_number_of_atoms()``,
        - "unit_cell_volume": used as the single global attribute,
        - "absorption_coefficient": used as the target.

    Returns
    -------
    Data
        ``x``          one ``[atomic number, atomic mass, covalent radius]`` row per atom,
        ``edge_index`` 2 x E integer tensor holding every ordered pair i != j,
        ``u``          ``[unit_cell_volume]``,
        ``y``          ``[absorption_coefficient]``.
    """
    structure = row["structure"]

    # Node features: mass and covalent radius are looked up by atomic number
    # in the ASE tables.
    all_nodes = [
        [atom.number, atomic_masses[atom.number], covalent_radii[atom.number]]
        for atom in structure
    ]

    # Edges: fully connected directed graph without self-loops.
    num_atoms = structure.get_global_number_of_atoms()
    atom_pairs = [
        (source, target)
        for source in range(num_atoms)
        for target in range(num_atoms)
        if source != target
    ]
    from_node = [source for source, _target in atom_pairs]
    to_node = [target for _source, target in atom_pairs]
    edges = [from_node, to_node]

    # Get global attributes
    global_att = [row["unit_cell_volume"]]

    # Get target attribute
    target_att = [row["absorption_coefficient"]]

    # Represent all these features as tensor objects.
    # The edge index must be an integer tensor: the original `dtyype= long`
    # was a misspelt keyword and `long` is not defined in Python 3, so the
    # call could never succeed. `dtype=float` (Python float) is kept as-is,
    # which torch treats as float64.
    the_data = Data(
        x=torch.tensor(all_nodes, dtype=float),
        edge_index=torch.tensor(edges, dtype=torch.long),
        u=torch.tensor(global_att, dtype=float),
        y=torch.tensor(target_att, dtype=float),
    )
    return the_data

# Build one graph object per row, then print the first as a sanity check.
sample = data_df.apply(get_graphical, axis=1)
first_graph = sample[0]
print(first_graph)

## Save the data 

In [None]:
# Persist the per-row graph objects next to the raw dataset.
torch.save(obj=sample, f=f"{dataset}/the_data.pt")

## Split the data