## Read Jarvis CFID data
Read the CFID data in its original `json` format and save to `pkl`.

In [None]:
import os, json
import pandas as pd
import numpy as np
import pickle

# Parse the raw JARVIS CFID dump. The context manager guarantees the file
# handle is released even if json.load raises on malformed input.
with open(os.path.join('D:\\', 'ET-AL', 'Jarvis_cfid', 'd3-5-16-2021.json'), 'r') as f:
    data3d = json.load(f)

# Cache the parsed data as a pickle next to the source JSON. This is the path
# the data-cleaning cell reads from; the previous target (D:/PSED/) did not
# match it, so a fresh Restart & Run All could not find the file.
df = pd.DataFrame(data3d)
df.to_pickle('D:/ET-AL/Jarvis_cfid/jarvis_cfid.pkl')

## Data cleaning

In [5]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
data = pd.read_pickle('D:/ET-AL/Jarvis_cfid/jarvis_cfid.pkl')  # Replace with your path
data_w_modulus = data.copy()

# Columns that must hold a usable (positive, non-'na') modulus value.
modulus_columns = ('bulk_modulus_kv', 'shear_modulus_gv')

# Pass 1: discard rows whose modulus is the 'na' placeholder string.
# This has to finish for both columns before any numeric comparison is made.
for column in modulus_columns:
    placeholder_rows = data_w_modulus.index[data_w_modulus[column] == 'na']
    data_w_modulus = data_w_modulus.drop(index=placeholder_rows)

# Pass 2: discard rows whose modulus is zero or negative.
for column in modulus_columns:
    nonpositive_rows = data_w_modulus.index[data_w_modulus[column] <= 0]
    data_w_modulus = data_w_modulus.drop(index=nonpositive_rows)

### Graph embeddings
Prepare `id_prop.csv` and the `cif` files for CGCNN, then obtain the graph embeddings of the materials that have modulus data.

In [None]:
# Target table for CGCNN: one row per material, keyed by the DataFrame index,
# with the formation energy per atom stored under the column name 'props'.
# NOTE(review): to_csv writes a header row here; stock CGCNN reads id_prop.csv
# without skipping a header -- confirm the consumer expects this layout.
id_prop = data_w_modulus.formation_energy_peratom.to_frame(name='props')
id_prop.to_csv('id_prop.csv')

from jarvis.core.atoms import Atoms

# Export every structure as <index label>.cif so the file names line up with
# the ids written to id_prop.csv.
for ind, atoms_dict in data_w_modulus.atoms.items():
    cif_path = 'D:/ET-AL/jarvis_cifs/' + str(ind) + '.cif'
    structure = Atoms.from_dict(atoms_dict)
    structure.write_cif(cif_path)

In [19]:
import pandas as pd
# Load the CGCNN embeddings and key them by the 'cif_id' column.
cgcnn_features = pd.read_pickle('D:/ET-AL/Jarvis_cfid/cgcnn_features.pkl').set_index('cif_id')

# Cast the ids to integers and clear the index name.
integer_ids = cgcnn_features.index.astype('int')
integer_ids.name = None
cgcnn_features.index = integer_ids

# Persist the embeddings ordered by id.
cgcnn_features = cgcnn_features.sort_index()
cgcnn_features.to_pickle('cgcnn_features_sorted.pkl')

Down-select the materials according to elements of interest or other criteria.

In [7]:
# Compounds containing any of these elements are removed: hydrogen, the
# halogens, the noble gases, the lanthanides, Tc and the actinides.
# Built once up front instead of on every loop iteration.
EXCLUDED_ELEMENTS = frozenset({
    'H', 'F', 'Cl', 'Br', 'I', 'At', 'He', 'Ne', 'Ar', 'Kr', 'Xe', 'Rn',
    'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Tc',
    'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr',
})

# One boolean per row: True when the structure's element list shares no symbol
# with EXCLUDED_ELEMENTS. Filtering once with this mask replaces the previous
# row-by-row in-place drop, which rebuilt the frame for every removed row.
keep_mask = [
    EXCLUDED_ELEMENTS.isdisjoint(atoms_dict['elements'])
    for atoms_dict in data_w_modulus.atoms
]

# .copy() keeps data_downselect independent of data_w_modulus, as before.
data_downselect = data_w_modulus.loc[keep_mask].copy()

data_downselect.to_pickle('D:/ET-AL/Jarvis_cfid/data_cleaned.pkl')

In [9]:
# Obtain the graph embeddings
cgcnn_features = pd.read_pickle('D:/PSED/Jarvis_cfid/cgcnn_features_sorted.pkl')
# sort_index() returns a new frame rather than sorting in place, so the result
# must be assigned back; the bare call used previously had no effect.
cgcnn_features = cgcnn_features.sort_index()
# Pick the embeddings of the down-selected materials by index label; the rows
# come back in the order of data_downselect.index.
cgcnn_features_down = cgcnn_features.loc[data_downselect.index]