In [None]:
import sys
if "google.colab" in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')

    !pip install ase torch_geometric
    import torch
    from torch_geometric.data import Data, DataLoader

    dataset = "/content/drive/My Drive/Dataset"
else:
    dataset = "Dataset"

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the pickled DataFrame from the folder held in `dataset`, then display it.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
data_df = pd.read_pickle(f"{dataset}/absorption_mp_data.pkl")
data_df

## Reading a single structure from the dataset

In [None]:
# Take the "structure" entry labelled 1 as a worked example. The cells below
# iterate over it atom by atom and query it through get_* methods
# (presumably an ase.Atoms object - confirm against the dataset).
a_structure = data_df["structure"][1]
print(a_structure)

## Adding a unit_cell_volume column to the whole dataset

In [None]:
def get_mean(row):
    """Attach the unit-cell volume of the row's structure to the row.

    Despite the name, no mean is computed: the value stored under
    "unit_cell_volume" is whatever row["structure"].get_volume() returns.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide a "structure" entry exposing get_volume().

    Returns
    -------
    The same row object, with "unit_cell_volume" set.
    """
    structure = row["structure"]
    cell_volume = structure.get_volume()
    row["unit_cell_volume"] = cell_volume
    return row

# Row-wise pass (axis=1): every row is handed to get_mean and comes back with
# a "unit_cell_volume" entry.
data_df = data_df.apply(get_mean, axis=1)

## What the whole structure gives us

In [None]:
# Print, in order, what a few accessors of the sample structure return.
for getter_name in (
    "get_all_distances",
    "get_chemical_formula",
    "get_chemical_symbols",
    "get_positions",
    "get_volume",
):
    print(getattr(a_structure, getter_name)())

# Further accessors tried during exploration (left disabled):
#   get_cell(), get_atomic_numbers(), get_distance(a0=0, a1=1), get_distances(0, 2),
#   get_global_number_of_atoms(), get_initial_charges(), get_kinetic_energy(), get_pbc(),
#   get_masses()             -> atomic mass of each atom in the structure
#   get_momenta()            -> momenta of each atom in the structure
#   get_moments_of_inertia() -> moments of inertia in the structure
#   get_angular_momentum(), get_initial_magnetic_moments(), get_tags(), get_angle(0, 1, 2)

## Atoms

In [None]:
import ase
from ase.data import atomic_masses, covalent_radii
# One feature triple per atom of the sample structure:
# [atomic number, atomic_masses entry, covalent_radii entry], both tables
# being indexed by the atomic number.
all_nodes = [
    [atom.number, atomic_masses[atom.number], covalent_radii[atom.number]]
    for atom in a_structure
]

print(all_nodes)

## Interatomic connection

In [None]:
# Connect every atom to every other atom: all ordered index pairs, no self-pairs.
num_atoms = a_structure.get_global_number_of_atoms()
atom_ids = range(num_atoms)
pairs = [(src, dst) for src in atom_ids for dst in atom_ids if src != dst]

from_node = [src for src, _ in pairs]
to_node = [dst for _, dst in pairs]

# First list holds the source index of each edge, second list the destination.
edges = [from_node, to_node]
print(edges)

## Global Attribute

In [None]:
# Use the same row as `a_structure` (label 1) so that the nodes, edges and
# global attribute of this hand-built example all describe one structure.
an_index = 1
global_attr = [data_df["unit_cell_volume"][an_index]]
print(global_attr)

## Target Attribute

In [None]:
# Target of the single example row selected by `an_index`. Wrapping the whole
# column in a list would put every structure's target into this one sample.
targg = [data_df["absorption_coefficient"][an_index]]
print(targg)

## Now as a function that will iterate through the whole dataset

In [None]:
from ase.data import atomic_masses, covalent_radii
def get_graphical(row):
    """Turn one dataset row into a plain-Python graph sample.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Needs the entries "structure" (iterable over atoms exposing `.number`,
        with a get_global_number_of_atoms() method), "unit_cell_volume" and
        "absorption_coefficient".

    Returns
    -------
    list
        [nodes, edges, global_attr, target], where
        nodes is one [atomic number, mass, covalent radius] list per atom,
        edges is [sources, destinations] over every ordered pair of distinct
        atom indices, global_attr is [unit_cell_volume] and
        target is [absorption_coefficient].
    """
    structure = row["structure"]

    # Node features: mass and covalent radius are looked up by atomic number
    # in the tables imported from ase.data.
    atomic_numbers = [atom.number for atom in structure]
    node_features = [
        [z, atomic_masses[z], covalent_radii[z]] for z in atomic_numbers
    ]

    # Edges: every atom index paired with every other index, self-pairs skipped.
    atom_ids = range(structure.get_global_number_of_atoms())
    pairs = [(src, dst) for src in atom_ids for dst in atom_ids if src != dst]
    sources = [src for src, _ in pairs]
    destinations = [dst for _, dst in pairs]

    return [
        node_features,
        [sources, destinations],
        [row["unit_cell_volume"]],        # global attribute
        [row["absorption_coefficient"]],  # target attribute
    ]


# Keep the converted samples under their own name: rebinding `data_df` to the
# result would replace the DataFrame with a Series of lists, after which this
# cell (and every earlier cell that reads `data_df`) could not be re-run.
graph_samples = data_df.apply(get_graphical, axis=1)
print(graph_samples[0])

## Creating the graph representation using PyG

In [None]:
import torch
from torch_geometric.data import Data, InMemoryDataset
from torch_geometric.loader import DataLoader
from ase.data import atomic_masses, covalent_radii
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset again from the pickle so this section starts from the file
# contents rather than from whatever earlier cells bound to `data_df`.
data_df = pd.read_pickle(f"{dataset}/absorption_mp_data.pkl")  # adjust path as needed

# Add global attribute: unit cell volume
def add_volume(row):
    """Record the structure's volume on the row under "unit_cell_volume".

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide a "structure" entry exposing get_volume().

    Returns
    -------
    The same row object, now carrying the "unit_cell_volume" entry.
    """
    cell_volume = row["structure"].get_volume()
    row["unit_cell_volume"] = cell_volume
    return row

# axis=1 hands add_volume one row at a time; the result replaces data_df.
data_df = data_df.apply(add_volume, axis=1)

#Convert each row to PyG graph
def row_to_graph(row):
    """Convert one dataset row into a PyG ``Data`` graph.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Needs the entries "structure" (iterable over atoms exposing `.number`,
        with a get_global_number_of_atoms() method), "unit_cell_volume",
        "absorption_coefficient", "id", "formula" and "energies".

    Returns
    -------
    Data
        x          float tensor, one [atomic number, mass, covalent radius]
                   row per atom (always two-dimensional, even with no atoms)
        edge_index long tensor with 2 rows (source, destination) holding every
                   ordered pair of distinct atom indices
        edge_attr  None
        y          float tensor built from [absorption_coefficient]
        u          float tensor built from [unit_cell_volume]
        plus the row's id, formula, structure and energies as plain attributes.
    """
    structure = row["structure"]
    num_atoms = structure.get_global_number_of_atoms()

    # Node features: [atomic number, mass, covalent radius]
    nodes = [
        [atom.number, atomic_masses[atom.number], covalent_radii[atom.number]]
        for atom in structure
    ]
    # reshape keeps x as (num_atoms, 3) when the node list is empty; for a
    # non-empty list it is a no-op.
    x = torch.tensor(nodes, dtype=torch.float).reshape(-1, 3)

    # Edge indices: fully connected (excluding self-loops). The off-diagonal
    # mask is read out row by row, i.e. (0,1), (0,2), ..., (1,0), (1,2), ...,
    # the same order a nested i/j loop with i != j would produce.
    off_diagonal = ~torch.eye(num_atoms, dtype=torch.bool)
    edge_index = off_diagonal.nonzero().t().contiguous()

    # Global attribute and target
    global_attr = torch.tensor([row["unit_cell_volume"]], dtype=torch.float)
    target = torch.tensor([row["absorption_coefficient"]], dtype=torch.float)

    # Create graph object
    data = Data(
        x=x,
        edge_index=edge_index,
        edge_attr=None,
        y=target,
        u=global_attr,
    )

    # Add extra metadata
    data.id = row["id"]
    data.formula = row["formula"]
    data.structure = row["structure"]
    data.energies = row["energies"]

    return data

# Convert entire DataFrame to graph list
graph_list = data_df.apply(row_to_graph, axis=1).tolist()

# Show a count and a short preview rather than the repr of every graph.
print(f"{len(graph_list)} graphs built")
print(graph_list[:3])