In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdDistGeom
from typing import List, Tuple
import numpy as np
import rdkit
import pandas as pd
from pymatgen.core import Structure, Lattice, Molecule, Element, Species

%load_ext autoreload
%autoreload 2

smiles = 'CC(C)C'

m = Chem.MolFromSmiles(smiles)
# necessary to add hydrogen for consistent conformer generation
m = Chem.AddHs(m)





In [1]:
import json
import gzip
from pymatgen.core.structure import Structure, Molecule

from utils import download_url

raw_url= "https://ml.materialsproject.org/projects/matbench_mp_e_form.json.gz"
raw_dir= "../old_data/matbench/mp_e_form"
download_url(raw_url, raw_dir)


json_filename = "../old_data/matbench/mp_e_form/matbench_mp_e_form.json.gz"


with gzip.open(json_filename, 'r') as fin:        # 4. gzip
    json_bytes = fin.read()                      # 3. bytes (i.e. UTF-8)

json_str = json_bytes.decode('utf-8')            # 2. string (i.e. JSON)
data = json.loads(json_str) 



#df = pd.DataFrame({"struct":[data_list[0] for data_list in data["data"]], "target":[data_list[1] for data_list in data["data"]]})
#
#df.set_index("struct")

struct= Structure.from_dict(data["data"][0][0])

Using existing file matbench_mp_e_form.json.gz


In [4]:
data['columns']

['structure', 'e_form']

In [15]:
struct.sites

[PeriodicSite: Na (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000],
 PeriodicSite: K (2.1000, 2.1000, 2.1000) [0.5000, 0.5000, 0.5000],
 PeriodicSite: K (0.0000, 0.0000, 2.1000) [0.0000, 0.0000, 0.5000]]

## Figuring out whether bond information is available, if no, how we will compute it

In [3]:
from pymatgen.io.babel import BabelMolAdaptor
from pymatgen.core.bonds import CovalentBond

from pymatgen.core.lattice import Lattice

struct = Structure(
    Lattice.cubic(4.2),
    ["C", "C", "C"],
    [[0, 0, 0], [0.5, 0.5, 0.5], [0,0,0.5]],
    site_properties={"magmom": [-2, 2,2]},
)

#struct= Structure.from_dict(data["data"][0][0])

print(struct.sites)

for species1 in struct.sites:
    # bond = CovalentBond(site1=species1, site2=species2)
    try:
        print("is_bond",CovalentBond.is_bonded(site1=species1, site2=species1))
        print("bond_order",CovalentBond.get_bond_order(site1=species1, site2=species1))
        print("bond length", CovalentBond.get_bond_length(sp1= species1, sp2=species1))
    except:
        print("No data")
        

[PeriodicSite: C (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000], PeriodicSite: C (2.1000, 2.1000, 2.1000) [0.5000, 0.5000, 0.5000], PeriodicSite: C (0.0000, 0.0000, 2.1000) [0.0000, 0.0000, 0.5000]]
is_bond True
No data
is_bond True
No data
is_bond True
No data


In [34]:
struct.sites[2]

PeriodicSite: K (-2.8598, 0.0025, -2.0168) [0.4965, 0.4985, 0.4933]

: 

## Using Element featurization 

In [63]:
from utils import from_smiles_to_molecule_and_coordinates

smiles = 'O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2ccccc2C)C)CC1(C)C'

m,pos = from_smiles_to_molecule_and_coordinates(smile=smiles, add_hydrogen=False, seed=12)

from pymatgen.io.babel import BabelMolAdaptor

mol_file = Chem.MolToMolBlock(m)

pymatgen_mol = BabelMolAdaptor.from_string(string_data=mol_file, file_format="mol").pymatgen_mol

print([atom.GetAtomicNum() for atom in m.GetAtoms()])

## need to check atom ordering
elem = pymatgen_mol.species[0]
elem1 = pymatgen_mol.species[1]

#print(pymatgen_mol.species[1].__dict__)

[8, 6, 6, 6, 7, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]


# Testing exsitence of node features for all elements

In [16]:
from collections import defaultdict
features = ["atomic_radius","atomic_mass","average_ionic_radius", "average_cationic_radius", "average_anionic_radius", "max_oxidation_state",
            "min_oxidation_state", "row","group", "is_noble_gas", "is_post_transition_metal", "is_rare_earth_metal", "is_metal", "is_metalloid",
            "is_alkali", "is_alkaline", "is_halogen","is_chalcogen", "is_lanthanoid","is_actinoid", "is_quadrupolar"] 

feature_none_count = defaultdict(list)

for i in range(1,118):
    elem = Element.from_Z(i)
    for feature in features:
        try:
            attr = getattr(elem, feature)
            
            if feature is "max_oxidation_state":
                print(attr)
            
            if attr is None:
                feature_none_count[feature].append(i)
                
        except:
            raise ValueError(f"{feature}")
        
        

1
0
1
2
3
4
5
2
-1
0
1
2
3
4
5
6
7
0
1
2
3
4
5
6
7
6
5
4
4
2
3
4
5
6
7
0
1
2
3
4
5
6
7
8
6
4
3
2
3
4
5
6
7
0
1
2
3
4
4
3
3
3
3
3
4
3
3
3
3
3
3
4
5
6
7
8
6
6
5
4
3
4
5
6
5
0
1
2
3
4
5
6
7
7
6
4
4
4
3
3
3
3
3
0
0
0
0
0
0
0
0
0
0
0
0
0
0


  if feature is "max_oxidation_state":


In [10]:
feature_none_count['atomic_radius']

[2,
 10,
 36,
 54,
 85,
 86,
 87,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117]

In [5]:

for i in range(1,118):
    elem = Element.from_Z(i)
    print(elem.atomic_radius)

0.25 ang
None
1.45 ang
1.05 ang
0.85 ang
0.7 ang
0.65 ang
0.6 ang
0.5 ang
None
1.8 ang
1.5 ang
1.25 ang
1.1 ang
1.0 ang
1.0 ang
1.0 ang
0.71 ang
2.2 ang
1.8 ang
1.6 ang
1.4 ang
1.35 ang
1.4 ang
1.4 ang
1.4 ang
1.35 ang
1.35 ang
1.35 ang
1.35 ang
1.3 ang
1.25 ang
1.15 ang
1.15 ang
1.15 ang
None
2.35 ang
2.0 ang
1.8 ang
1.55 ang
1.45 ang
1.45 ang
1.35 ang
1.3 ang
1.35 ang
1.4 ang
1.6 ang
1.55 ang
1.55 ang
1.45 ang
1.45 ang
1.4 ang
1.4 ang
None
2.6 ang
2.15 ang
1.95 ang
1.85 ang
1.85 ang
1.85 ang
1.85 ang
1.85 ang
1.85 ang
1.8 ang
1.75 ang
1.75 ang
1.75 ang
1.75 ang
1.75 ang
1.75 ang
1.75 ang
1.55 ang
1.45 ang
1.35 ang
1.35 ang
1.3 ang
1.35 ang
1.35 ang
1.35 ang
1.5 ang
1.9 ang
1.8 ang
1.6 ang
1.9 ang
None
None
None
2.15 ang
1.95 ang
1.8 ang
1.8 ang
1.75 ang
1.75 ang
1.75 ang
1.75 ang
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


In [58]:


for elem in pymatgen_mol.species[:10]:
    print([getattr(elem, key) for key in features])

[0.6, 15.9994, 1.26, 0.0, 1.26, 2, -2, 2, 16, False, False, False, False, False, False, False, False, True, False, False, True]
[0.7, 12.0107, 0.3, 0.3, 0.0, 4, -4, 2, 14, False, False, False, False, False, False, False, False, False, False, False, True]
[0.7, 12.0107, 0.3, 0.3, 0.0, 4, -4, 2, 14, False, False, False, False, False, False, False, False, False, False, False, True]
[0.7, 12.0107, 0.3, 0.3, 0.0, 4, -4, 2, 14, False, False, False, False, False, False, False, False, False, False, False, True]
[0.65, 14.0067, 0.63, 0.28500000000000003, 1.32, 5, -3, 2, 15, False, False, False, False, False, False, False, False, False, False, False, True]
[0.7, 12.0107, 0.3, 0.3, 0.0, 4, -4, 2, 14, False, False, False, False, False, False, False, False, False, False, False, True]
[0.6, 15.9994, 1.26, 0.0, 1.26, 2, -2, 2, 16, False, False, False, False, False, False, False, False, True, False, False, True]
[0.7, 12.0107, 0.3, 0.3, 0.0, 4, -4, 2, 14, False, False, False, False, False, False, Fals

In [56]:
elem.atomic_radius, elem.atomic_mass, elem.data

(0.6,
 15.9994,
 {'Atomic mass': 15.9994,
  'Atomic no': 8,
  'Atomic orbitals': {'1s': -18.758245, '2p': -0.338381, '2s': -0.871362},
  'Atomic radius': 0.6,
  'Atomic radius calculated': 0.48,
  'Boiling point': '90.2 K',
  'Brinell hardness': 'no data MN m<sup>-2</sup>',
  'Bulk modulus': 'no data GPa',
  'Coefficient of linear thermal expansion': 'no data x10<sup>-6</sup>K<sup>-1</sup>',
  'Common oxidation states': [-2],
  'Critical temperature': '154.6 K',
  'Density of solid': 'no data kg m<sup>-3</sup>',
  'Electrical resistivity': 'no data 10<sup>-8</sup> &Omega; m',
  'Electronic structure': '[He].2s<sup>2</sup>.2p<sup>4</sup>',
  'ICSD oxidation states': [-2],
  'Ionic radii': {'-2': 1.26},
  'Liquid range': '35.4 K',
  'Melting point': '54.8 K',
  'Mendeleev no': 101,
  'Mineral hardness': 'no data',
  'Molar volume': '17.36 cm<sup>3</sup>',
  'Name': 'Oxygen',
  'Oxidation states': [-2, -1, 1, 2],
  'Poissons ratio': 'no data',
  'Reflectivity': 'no data %',
  'Refractive 

In [34]:
Element.print_periodic_table()

H                                                                   He 
Li  Be                                          B   C   N   O   F   Ne 
Na  Mg                                          Al  Si  P   S   Cl  Ar 
K   Ca  Sc  Ti  V   Cr  Mn  Fe  Co  Ni  Cu  Zn  Ga  Ge  As  Se  Br  Kr 
Rb  Sr  Y   Zr  Nb  Mo  Tc  Ru  Rh  Pd  Ag  Cd  In  Sn  Sb  Te  I   Xe 
Cs  Ba      Hf  Ta  W   Re  Os  Ir  Pt  Au  Hg  Tl  Pb  Bi  Po  At  Rn 
Fr  Ra      Rf  Db  Sg  Bh  Hs  Mt  Ds  Rg  Cn  Nh  Fl  Mc  Lv  Ts  Og 
        La  Ce  Pr  Nd  Pm  Sm  Eu  Gd  Tb  Dy  Ho  Er  Tm  Yb  Lu     
        Ac  Th  Pa  U   Np  Pu  Am  Cm  Bk  Cf  Es  Fm  Md  No  Lr     
