In [1]:
import json
from pymatgen.core import Lattice, Structure, Molecule

In [2]:
def read_pymatgen_dict(file):
    """Parse the JSON document at ``file`` and rebuild a ``Structure`` from it.

    Parameters
    ----------
    file : path-like
        Path of a JSON file; its parsed content is handed to
        ``Structure.from_dict`` unchanged.

    Returns
    -------
    The object returned by ``Structure.from_dict``.
    """
    with open(file, "r") as handle:
        payload = json.load(handle)
    structure = Structure.from_dict(payload)
    return structure

In [3]:
import os

filepath_train = 'dichalcogenides_public/structures/'
# os.listdir returns entries in arbitrary order and also lists stray entries
# (e.g. a `.ipynb_checkpoints` directory). Keep only the JSON structure files
# and sort them so that every run processes the same files in the same order.
# NOTE(review): assumes every structure file ends in '.json' (later cells strip
# a 5-character suffix from the name) -- confirm against the dataset.
train_files = sorted(f for f in os.listdir(filepath_train) if f.endswith('.json'))

filepath_test = 'dichalcogenides_private/structures/'
test_files = sorted(f for f in os.listdir(filepath_test) if f.endswith('.json'))

In [243]:
import numpy as np

def to_coords(crystal, img):
    """List the Se/W sites and the empty grid positions of one structure.

    Parameters
    ----------
    crystal : object with ``as_dict()``
        ``as_dict()['sites']`` must be a sequence of mappings that each have a
        ``'label'`` and a three-component ``'abc'`` entry (presumably the
        fractional coordinates of a pymatgen ``Structure`` -- TODO confirm).
    img : numpy.ndarray
        Array indexed as ``img[ix, iy, iz]``; every position equal to 0 is
        reported as an empty grid position. ``iz`` selects one of the distinct
        third ``abc`` components found in ``crystal`` (ascending order), and
        ``ix`` / ``iy`` select among the distinct first / second components
        found in that layer (ascending order).

    Returns
    -------
    list of [Z, a, b, c]
        First one entry per site labelled ``'Se'`` (Z=34) or ``'W'`` (Z=74) in
        site order, then one entry with Z=0 per zero of ``img`` in
        ``np.argwhere`` order.

    Raises
    ------
    IndexError
        If an index of a zero in ``img`` does not exist in the coordinate grid
        derived from ``crystal``.
    """
    # Serialise the structure once; the site list feeds both the labelled-site
    # scan and the coordinate grid below.
    sites = crystal.as_dict()['sites']

    Z = {'Se': 34, 'W': 74}
    defects = [[Z[site['label']], *site['abc']]
               for site in sites if site['label'] in Z]

    grid = np.array([site['abc'] for site in sites])
    # Distinct values of the third coordinate, ascending: index -> layer value.
    z_map = sorted(set(grid[:, 2]))

    # The x/y axes of a layer depend only on the layer, so compute them once
    # per layer instead of once per empty position.
    layer_axes = {}
    for void_atom in np.argwhere(img == 0):
        layer = int(void_atom[2])
        z = z_map[layer]
        if layer not in layer_axes:
            in_layer = grid[:, 2] == z
            # np.unique returns the sorted distinct values, i.e. the same
            # ascending axis the per-layer coordinate sets describe.
            layer_axes[layer] = (np.unique(grid[in_layer, 0]),
                                 np.unique(grid[in_layer, 1]))
        x_map, y_map = layer_axes[layer]
        defects.append([0, x_map[void_atom[0]], y_map[void_atom[1]], z])
    return defects

In [245]:
!mkdir -p dichalcogenides_public/frac_coordinates

In [246]:
from tqdm.notebook import tqdm

filepath_save_train = 'dichalcogenides_public/frac_coordinates/'
# Create the output directory here so this cell does not depend on a separate
# shell `mkdir` having been run, and stays safe to re-run.
os.makedirs(filepath_save_train, exist_ok=True)
# The image arrays live in an `images/` directory next to the structures
# directory (this replaces the positional slice `filepath_train[:-11]`).
filepath_images_train = os.path.join(os.path.dirname(filepath_train.rstrip('/')), 'images')

train_coords = {}
for file in tqdm(train_files):
    # Key = file name without its extension (replaces `file[:-5]`, which
    # silently assumed a 5-character suffix).
    name = os.path.splitext(file)[0]
    crystal = read_pymatgen_dict(os.path.join(filepath_train, file))
    img = np.load(os.path.join(filepath_images_train, name + '.npy'))
    coords = to_coords(crystal, img)
    train_coords[name] = coords
    # np.save appends the '.npy' extension itself.
    np.save(os.path.join(filepath_save_train, name), coords)

  0%|          | 0/2966 [00:00<?, ?it/s]

In [247]:
!mkdir -p dichalcogenides_private/frac_coordinates

In [248]:
from tqdm.notebook import tqdm

filepath_save_test = 'dichalcogenides_private/frac_coordinates/'
# Create the output directory here so this cell does not depend on a separate
# shell `mkdir` having been run, and stays safe to re-run.
os.makedirs(filepath_save_test, exist_ok=True)
# The image arrays live in an `images/` directory next to the structures
# directory (this replaces the positional slice `filepath_test[:-11]`).
filepath_images_test = os.path.join(os.path.dirname(filepath_test.rstrip('/')), 'images')

test_coords = {}
for file in tqdm(test_files):
    # Key = file name without its extension (replaces `file[:-5]`, which
    # silently assumed a 5-character suffix).
    name = os.path.splitext(file)[0]
    crystal = read_pymatgen_dict(os.path.join(filepath_test, file))
    img = np.load(os.path.join(filepath_images_test, name + '.npy'))
    coords = to_coords(crystal, img)
    test_coords[name] = coords
    # np.save appends the '.npy' extension itself.
    np.save(os.path.join(filepath_save_test, name), coords)

  0%|          | 0/2967 [00:00<?, ?it/s]

In [223]:
import pandas as pd

In [287]:
# One row per key of the coordinate dicts; each value (the list produced by
# to_coords) is laid out across positional columns, one inner entry per column.
train_coords_ = pd.DataFrame(
    list(train_coords.values()),
    index=list(train_coords.keys()),
)
test_coords_ = pd.DataFrame(
    list(test_coords.values()),
    index=list(test_coords.keys()),
)

In [288]:
def create_columns(x):
    """Concatenate the entries of ``x`` into a single flat list.

    Each entry that is not ``None`` contributes its own items; each ``None``
    entry contributes the four placeholders ``-1, -1, -1, -1``.
    """
    placeholder = [-1, -1, -1, -1]
    flat = []
    for entry in x:
        flat.extend(placeholder if entry is None else entry)
    return flat

In [289]:
# Flat layout: three slots per row, each slot holding (Z, x, y, z).
columns = [
    'Z1', 'x1', 'y1', 'z1',
    'Z2', 'x2', 'y2', 'z2',
    'Z3', 'x3', 'y3', 'z3',
]

# create_columns is applied to every row and returns one flat list per row;
# the lists are stacked into a 2-D array before being labelled with `columns`.
train_rows = train_coords_.apply(create_columns, axis=1).to_numpy()
train_coords_final = pd.DataFrame(
    np.array(list(train_rows)),
    columns=columns,
    index=train_coords_.index,
)

test_rows = test_coords_.apply(create_columns, axis=1).to_numpy()
test_coords_final = pd.DataFrame(
    np.array(list(test_rows)),
    columns=columns,
    index=test_coords_.index,
)

In [292]:
# Write both flattened tables (index included) to CSV in the working directory.
for frame, target in (
    (train_coords_final, 'frac_coordinates_train.csv'),
    (test_coords_final, 'frac_coordinates_test.csv'),
):
    frame.to_csv(target)