In [10]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdDistGeom
from typing import List, Tuple
import numpy as np
import rdkit
import pandas as pd
from pymatgen.core import Structure, Lattice, Molecule, Element, Species

%load_ext autoreload
%autoreload 2

smiles = 'CC(C)C'

# Parse the SMILES string and attach explicit hydrogens in a single expression;
# the hydrogens are necessary for consistent conformer generation.
m = Chem.AddHs(Chem.MolFromSmiles(smiles))





The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import json
import gzip
from pymatgen.core.structure import Structure, Molecule

from utils import download_url

#raw_url= "https://ml.materialsproject.org/projects/matbench_mp_is_metal.json.gz"
#raw_dir= "../data/matbench/mp_is_metal"
#download_url(raw_url, raw_dir)


json_filename = "../old_data/matbench/mp_is_metal/raw/matbench_mp_is_metal.json.gz"

# Open the gzip archive in text mode so decompression, UTF-8 decoding and JSON
# parsing happen in one step, without keeping the raw bytes and the decoded
# string alive as notebook-level variables.
with gzip.open(json_filename, "rt", encoding="utf-8") as fin:
    data = json.load(fin)

# Each record in data["data"] is read positionally: [0] -> structure dict,
# [1] -> target value.
records = data["data"]
df = pd.DataFrame(
    {
        "struct": [record[0] for record in records],
        "target": [record[1] for record in records],
    }
)
# `df` keeps its default integer index; the structure entries stay an ordinary column.

# Rebuild the first record as a pymatgen Structure for the inspection cells below.
struct = Structure.from_dict(records[0][0])

In [15]:
# Display the site list of whatever Structure is currently bound to `struct`.
# NOTE(review): this cell's execution count (15) is lower than the loading cell's (21)
# and the saved output lists Na/K sites, so the output appears to come from an
# earlier `struct` -- re-run to refresh.
struct.sites

[PeriodicSite: Na (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000],
 PeriodicSite: K (2.1000, 2.1000, 2.1000) [0.5000, 0.5000, 0.5000],
 PeriodicSite: K (0.0000, 0.0000, 2.1000) [0.0000, 0.0000, 0.5000]]

## Figuring out whether bond information is available and, if not, how we will compute it

In [26]:
from pymatgen.io.babel import BabelMolAdaptor
from pymatgen.core.bonds import CovalentBond

from pymatgen.core.lattice import Lattice

# Use the first matbench record for the bond-data check.
struct = Structure.from_dict(data["data"][0][0])

print(struct.sites)

# pymatgen looks bond lengths up per element pair, so a pair that has no data is
# reported once rather than once for every pair of sites with those elements.
pairs_without_data = set()
sites = struct.sites
for idx, site_a in enumerate(sites):
    # Visit every unordered pair of *distinct* sites: comparing a site with
    # itself always gives distance 0 and says nothing about bonding.
    for site_b in sites[idx + 1:]:
        pair = tuple(sorted((site_a.species_string, site_b.species_string)))
        if pair in pairs_without_data:
            continue
        try:
            bonded = CovalentBond.is_bonded(site1=site_a, site2=site_b)
            # get_bond_order is an instance method, so the bond object has to be
            # constructed first (is_bonded, by contrast, is a static method).
            order = CovalentBond(site1=site_a, site2=site_b).get_bond_order()
        except ValueError:
            # pymatgen raises ValueError when it has no bond-length data for the
            # element pair; any other exception is a real error and propagates.
            pairs_without_data.add(pair)
            print("No data", pair)
            continue
        print(pair, "is_bond", bonded)
        print(pair, "bond_order", order)
        

[PeriodicSite: K (-0.0081, 0.0248, -0.0170) [0.0009, 0.0026, 0.0054], PeriodicSite: K (-0.7711, -2.7366, -1.5006) [0.5046, 0.0027, 0.0057], PeriodicSite: K (-2.8598, 0.0025, -2.0168) [0.4965, 0.4985, 0.4933], PeriodicSite: K (-5.6848, -0.0017, -0.5570) [0.4967, 0.9950, 0.4936], PeriodicSite: Mn (-2.8534, 0.0087, -5.5176) [0.9933, 0.4933, 0.9868], PeriodicSite: Mn (0.7133, 2.8014, -2.0991) [0.0059, 0.0060, 0.5120], PeriodicSite: Mn (-2.1323, 2.7958, -0.6253) [0.0057, 0.5060, 0.5116], PeriodicSite: Mn (-2.0977, 2.7508, -4.0481) [0.4937, 0.4935, 0.9871], PeriodicSite: O (-3.8175, 1.7094, -0.4023) [0.1909, 0.7497, 0.4992], PeriodicSite: O (-3.5912, 1.4136, -4.2721) [0.7499, 0.6911, 0.9997], PeriodicSite: O (-2.8918, -4.1163, -1.2998) [0.7501, 0.3087, 0.0001], PeriodicSite: O (-4.7507, -1.6787, -2.2169) [0.8079, 0.7493, 0.4986], PeriodicSite: O (-0.6151, 4.1133, -3.8617) [0.2415, 0.2990, 0.9831], PeriodicSite: O (-1.9403, -1.6202, -3.7550) [0.8153, 0.2579, 0.5157], PeriodicSite: O (-1.0083,

In [34]:
# Look at a single site (index 2) of the current `struct`.
struct.sites[2]

PeriodicSite: K (-2.8598, 0.0025, -2.0168) [0.4965, 0.4985, 0.4933]

: 

## Using Element featurization 

In [63]:
from pymatgen.io.babel import BabelMolAdaptor
from utils import from_smiles_to_molecule_and_coordinates

smiles = 'O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2ccccc2C)C)CC1(C)C'

# Project helper; presumably returns the RDKit molecule and its coordinates
# (see utils for the exact contract). No hydrogens are requested here.
m, pos = from_smiles_to_molecule_and_coordinates(
    smile=smiles,
    add_hydrogen=False,
    seed=12,
)

# Hand the molecule over to pymatgen by serialising it to a MOL block and
# parsing that text through the OpenBabel adaptor.
mol_file = Chem.MolToMolBlock(m)
babel_adaptor = BabelMolAdaptor.from_string(string_data=mol_file, file_format="mol")
pymatgen_mol = babel_adaptor.pymatgen_mol

# Atomic numbers in RDKit atom order, for comparison with the pymatgen species.
atomic_numbers = [atom.GetAtomicNum() for atom in m.GetAtoms()]
print(atomic_numbers)

## need to check atom ordering
elem, elem1 = pymatgen_mol.species[0], pymatgen_mol.species[1]

#print(pymatgen_mol.species[1].__dict__)

[8, 6, 6, 6, 7, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]


# Testing existence of node features for all elements

In [5]:
# Element attributes used as node features; the loop below checks that each one
# can be read on every element.
features = ["X","atomic_radius","atomic_mass","average_ionic_radius", "average_cationic_radius", "average_anionic_radius", "max_oxidation_state",
            "min_oxidation_state", "row","group", "is_noble_gas", "is_post_transition_metal", "is_rare_earth_metal", "is_metal", "is_metalloid",
            "is_alkali", "is_alkaline", "is_halogen","is_chalcogen", "is_lanthanoid","is_actinoid", "is_quadrupolar"]
# Electronegativity (`X`) of every element, averaged at the end of the cell.
arr=[]

# Atomic numbers run from 1 (H) through 118 (Og); range() excludes its upper
# bound, so the stop value must be 119 to include the last element.
for i in range(1, 119):
    elem = Element.from_Z(i)
    arr.append(elem.X)
    for feature in features:
        try:
            getattr(elem, feature)
        except Exception as err:
            # Report the offending feature name and keep the original error
            # chained; KeyboardInterrupt/SystemExit are deliberately not caught.
            raise ValueError(f"{feature}") from err


# Mean electronegativity, ignoring elements whose value is NaN.
np.nanmean(arr)


1.713

In [58]:


# Print one row of feature values per species, for the first ten species only.
for elem in pymatgen_mol.species[:10]:
    feature_values = []
    for key in features:
        feature_values.append(getattr(elem, key))
    print(feature_values)

[0.6, 15.9994, 1.26, 0.0, 1.26, 2, -2, 2, 16, False, False, False, False, False, False, False, False, True, False, False, True]
[0.7, 12.0107, 0.3, 0.3, 0.0, 4, -4, 2, 14, False, False, False, False, False, False, False, False, False, False, False, True]
[0.7, 12.0107, 0.3, 0.3, 0.0, 4, -4, 2, 14, False, False, False, False, False, False, False, False, False, False, False, True]
[0.7, 12.0107, 0.3, 0.3, 0.0, 4, -4, 2, 14, False, False, False, False, False, False, False, False, False, False, False, True]
[0.65, 14.0067, 0.63, 0.28500000000000003, 1.32, 5, -3, 2, 15, False, False, False, False, False, False, False, False, False, False, False, True]
[0.7, 12.0107, 0.3, 0.3, 0.0, 4, -4, 2, 14, False, False, False, False, False, False, False, False, False, False, False, True]
[0.6, 15.9994, 1.26, 0.0, 1.26, 2, -2, 2, 16, False, False, False, False, False, False, False, False, True, False, False, True]
[0.7, 12.0107, 0.3, 0.3, 0.0, 4, -4, 2, 14, False, False, False, False, False, False, Fals

In [56]:
# Inspect two attributes of the current `elem` together with its full data dictionary.
elem.atomic_radius, elem.atomic_mass, elem.data

(0.6,
 15.9994,
 {'Atomic mass': 15.9994,
  'Atomic no': 8,
  'Atomic orbitals': {'1s': -18.758245, '2p': -0.338381, '2s': -0.871362},
  'Atomic radius': 0.6,
  'Atomic radius calculated': 0.48,
  'Boiling point': '90.2 K',
  'Brinell hardness': 'no data MN m<sup>-2</sup>',
  'Bulk modulus': 'no data GPa',
  'Coefficient of linear thermal expansion': 'no data x10<sup>-6</sup>K<sup>-1</sup>',
  'Common oxidation states': [-2],
  'Critical temperature': '154.6 K',
  'Density of solid': 'no data kg m<sup>-3</sup>',
  'Electrical resistivity': 'no data 10<sup>-8</sup> &Omega; m',
  'Electronic structure': '[He].2s<sup>2</sup>.2p<sup>4</sup>',
  'ICSD oxidation states': [-2],
  'Ionic radii': {'-2': 1.26},
  'Liquid range': '35.4 K',
  'Melting point': '54.8 K',
  'Mendeleev no': 101,
  'Mineral hardness': 'no data',
  'Molar volume': '17.36 cm<sup>3</sup>',
  'Name': 'Oxygen',
  'Oxidation states': [-2, -1, 1, 2],
  'Poissons ratio': 'no data',
  'Reflectivity': 'no data %',
  'Refractive 

In [34]:
# Print pymatgen's periodic-table layout to see which elements `Element` covers.
Element.print_periodic_table()

H                                                                   He 
Li  Be                                          B   C   N   O   F   Ne 
Na  Mg                                          Al  Si  P   S   Cl  Ar 
K   Ca  Sc  Ti  V   Cr  Mn  Fe  Co  Ni  Cu  Zn  Ga  Ge  As  Se  Br  Kr 
Rb  Sr  Y   Zr  Nb  Mo  Tc  Ru  Rh  Pd  Ag  Cd  In  Sn  Sb  Te  I   Xe 
Cs  Ba      Hf  Ta  W   Re  Os  Ir  Pt  Au  Hg  Tl  Pb  Bi  Po  At  Rn 
Fr  Ra      Rf  Db  Sg  Bh  Hs  Mt  Ds  Rg  Cn  Nh  Fl  Mc  Lv  Ts  Og 
        La  Ce  Pr  Nd  Pm  Sm  Eu  Gd  Tb  Dy  Ho  Er  Tm  Yb  Lu     
        Ac  Th  Pa  U   Np  Pu  Am  Cm  Bk  Cf  Es  Fm  Md  No  Lr     
