In [1]:
import pandas as pd
import os 
from ase.io import read,write
import random

from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem import Draw
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Descriptors3D
from rdkit.Chem import AllChem


from pymatgen.core.structure import Structure,Molecule
from pyxtal.io import search_molecules_in_crystal
from ase import Atoms

import numpy as np 

import spglib as spg


# Handle to RDKit's periodic table. Not referenced elsewhere in the visible cells.
Ptable = Chem.GetPeriodicTable()

# Element symbol -> electronegativity lookup.
# NOTE(review): the values look like Pauling-scale electronegativities — confirm the source.
# Not referenced elsewhere in the visible cells.
Enegs = {"O": 3.44,
        "C": 2.55,
        "N": 3.04,
        "F": 3.98,
        "Cl": 3.16,
        "H": 2.20,
        "P": 2.19,
        "S": 2.58}


In [2]:

def rotate_coords(Rmat,coords):
    """Apply the transpose of ``Rmat`` to every coordinate vector.

    Parameters
    ----------
    Rmat : numpy.ndarray
        Square matrix; its transpose is what multiplies each vector.
    coords : iterable of array-like
        Vectors to transform, one per entry.

    Returns
    -------
    list
        One transformed vector (``Rmat.T @ vector``) per input entry, in order.
    """
    transposed = Rmat.T
    transformed = []
    for point in coords:
        transformed.append(transposed @ point)
    return transformed

def rotate_matrix(angles,m_i):
    """Rotate the three vectors stored in ``m_i`` by a Z-Y-X Euler rotation.

    Parameters
    ----------
    angles : sequence of float
        ``(theta, psi, phi)`` in radians: ``theta`` rotates about y,
        ``psi`` about x and ``phi`` about z.
    m_i : array-like
        ``m_i[0]``, ``m_i[1]`` and ``m_i[2]`` are the 3-vectors to rotate.

    Returns
    -------
    numpy.ndarray
        3x3 array whose *columns* are the rotated vectors. For
        ``m_i = np.eye(3)`` this is the total rotation ``Rz(phi) Ry(theta) Rx(psi)``.
    """
    theta, psi, phi = angles[0], angles[1], angles[2]  # radians, no degree conversion

    cos_psi, sin_psi = np.cos(psi), np.sin(psi)
    about_x = np.array([[1, 0, 0],
                        [0, cos_psi, -sin_psi],
                        [0, sin_psi, cos_psi]])

    cos_theta, sin_theta = np.cos(theta), np.sin(theta)
    about_y = np.array([[cos_theta, 0, sin_theta],
                        [0, 1, 0],
                        [-sin_theta, 0, cos_theta]])

    cos_phi, sin_phi = np.cos(phi), np.sin(phi)
    about_z = np.array([[cos_phi, -sin_phi, 0],
                        [sin_phi, cos_phi, 0],
                        [0, 0, 1]])

    # x-rotation is applied first, then y, then z.
    total = np.matmul(about_z, np.matmul(about_y, about_x))

    # Rotate each stored vector, then transpose so the results become columns.
    rotated_rows = [np.matmul(total, m_i[axis]) for axis in range(3)]
    return np.array(rotated_rows).T

def get_euler_angles(m_i):
    """Extract Euler angles from a rotation matrix ``R = Rz(phi) Ry(theta) Rx(psi)``.

    Follows G. Slabaugh, "Computing Euler angles from a rotation matrix".

    Parameters
    ----------
    m_i : numpy.ndarray
        3x3 rotation matrix (indexed as ``m_i[row, col]``).

    Returns
    -------
    numpy.ndarray
        Six values ``[theta_1, psi_1, phi_1, theta_2, psi_2, phi_2]``: the two
        equivalent solutions, each ordered (theta, psi, phi) in radians.
        In the gimbal-lock case (``m_i[2,0] == +/-1``) only ``psi - phi`` (or
        ``psi + phi``) is determined; ``phi`` is fixed to 0 and the single
        solution is repeated so the return shape and ordering stay the same.
    """
    # Clip guards arcsin against round-off pushing the entry marginally outside [-1, 1].
    r31 = np.clip(m_i[2,0], -1.0, 1.0)

    # Both inequalities must hold. With `or` the test is always true, which made
    # the gimbal-lock branch unreachable.
    if r31 != 1 and r31 != -1:
        theta_1 = -np.arcsin(r31)
        theta_2 = np.pi - theta_1

        psi_1 = np.arctan2(m_i[2,1]/np.cos(theta_1),
                            m_i[2,2]/np.cos(theta_1))
        psi_2 = np.arctan2(m_i[2,1]/np.cos(theta_2),
                            m_i[2,2]/np.cos(theta_2))

        phi_1 = np.arctan2(m_i[1,0]/np.cos(theta_1),
                            m_i[0,0]/np.cos(theta_1))
        phi_2 = np.arctan2(m_i[1,0]/np.cos(theta_2),
                            m_i[0,0]/np.cos(theta_2))

        return np.array([theta_1,psi_1,phi_1,theta_2,psi_2,phi_2])

    # Gimbal lock: cos(theta) == 0, so phi and psi are coupled; choose phi = 0.
    phi = 0
    if r31 == -1:
        theta = np.pi/2
        psi = phi + np.arctan2(m_i[0,1],m_i[0,2])
    else:
        theta = -np.pi/2
        psi = -phi + np.arctan2(-m_i[0,1],-m_i[0,2])
    # Same (theta, psi, phi) ordering and length as the regular branch.
    return np.array([theta,psi,phi,theta,psi,phi])


def params_to_matrix(params):
    """Build a 3x3 lattice matrix from ``(a, b, c, alpha, beta, gamma)``.

    Parameters
    ----------
    params : sequence of float
        Cell lengths ``a, b, c`` followed by the angles ``alpha, beta, gamma``
        in degrees. Only the first six entries are read.

    Returns
    -------
    numpy.ndarray
        Lower-triangular 3x3 matrix with lattice vectors as rows: ``a`` along
        x, ``b`` in the xy-plane, ``c`` completing the cell.
    """
    len_a, len_b, len_c = params[0], params[1], params[2]
    # Degrees -> radians.
    alpha, beta, gamma = (angle / 180 * np.pi for angle in (params[3], params[4], params[5]))

    # y-component of the unit c-vector.
    c_y = (np.cos(alpha) - np.cos(gamma) * np.cos(beta)) / np.sin(gamma)
    c_z = np.sqrt(np.sin(beta) ** 2 - c_y ** 2)

    return np.array([
        [len_a, 0, 0],
        [len_b * np.cos(gamma), len_b * np.sin(gamma), 0],
        [len_c * np.cos(beta), len_c * c_y, len_c * c_z],
    ])


class ProcessCell:
    """Standardize a structure with spglib and collect its symmetry dataset.

    Usage: call ``standard_cell()`` first (sets ``standard_structure``), then
    ``get_dataset()`` (sets ``standard_dataset`` for the standardized cell).
    """

    # Symmetry-finding tolerance passed to spglib for both steps.
    SYMPREC = 1E-3

    def __init__(self,s):
        # s: input structure exposing .sites, .lattice.matrix and .frac_coords
        self.structure = s

    @staticmethod
    def _spglib_cell(structure):
        """Return the ``(lattice, positions, numbers)`` tuple spglib expects."""
        numbers = [site.specie.number for site in structure.sites]
        return (structure.lattice.matrix, structure.frac_coords, numbers)

    def standard_cell(self):
        """Build ``self.standard_structure`` from spglib's standardized cell."""
        cell = self._spglib_cell(self.structure)
        dataset = spg.get_symmetry_dataset(cell, symprec=self.SYMPREC)
        self.standard_structure = Structure(dataset['std_lattice'], dataset['std_types'], dataset['std_positions'])

    def get_dataset(self):
        """Store the symmetry dataset of ``self.standard_structure``.

        Requires ``standard_cell()`` to have been called. The symmetry search
        runs once; previously it ran twice and the first result was discarded.
        """
        cell = self._spglib_cell(self.standard_structure)
        self.standard_dataset = spg.get_symmetry_dataset(cell, symprec=self.SYMPREC)


def get_full_cell_parameters(structure):
    """Extract the 12 rigid-molecule cell parameters from a standardized structure.

    The molecule whose centre of mass lies closest to the Cartesian origin is
    taken as the canonical conformer. It is centred on its centre of mass and
    rotated with the (right-handed) principal-axes matrix.

    Parameters
    ----------
    structure : pymatgen Structure
        Standardized cell; must expose ``lattice.matrix``, ``lattice.abc`` and
        ``lattice.angles``.

    Returns
    -------
    full_cell_parameters : numpy.ndarray or None
        ``[a, b, c, alpha, beta, gamma, com_a, com_b, com_c, theta, psi, phi]``:
        lattice lengths and angles, fractional centre of mass, and the first
        Euler solution returned by ``get_euler_angles``.
    ase_canon : ase.Atoms or None
        The canonical conformer, centred and rotated as described above.
    m_i : numpy.ndarray or None
        Principal-axes matrix after ``right_hand_rule``.

    All three are ``None`` when the molecule search fails or finds nothing.
    """

    try:
        molecs = search_molecules_in_crystal(structure)
    except Exception:
        # Best-effort: skip structures pyxtal cannot handle, but let
        # KeyboardInterrupt / SystemExit propagate (a bare except swallowed them).
        print('Pyxtal sucks.')
        return None, None, None
    if len(molecs) == 0:
        # Nothing to pick a canonical conformer from.
        return None, None, None

    coms = [np.linalg.norm(m.center_of_mass) for m in molecs]
    canon = molecs[coms.index(min(coms))] #Closest to origin
    species = [n.as_dict()['name'] for n in canon.sites]
    coords = [n.as_dict()['xyz'] for n in canon.sites]

    ase_canon = Atoms(species,coords)

    com = ase_canon.get_center_of_mass()
    # Cartesian -> fractional: x_frac = inv(L)^T x_cart for a row-vector lattice L.
    abc_com = np.matmul(np.linalg.inv(structure.lattice.matrix).T,com)
    ase_canon.set_positions(ase_canon.get_positions()-com)
    m_i = ase_canon.get_moments_of_inertia(vectors=True)[1]

    # Enforce a proper (right-handed) basis so Euler angles are well defined.
    m_i = right_hand_rule(m_i)

    # Express the centred coordinates in the principal-axes frame.
    aligned_canon = [np.matmul(m_i,c) for c in ase_canon.get_positions()]
    ase_canon.set_positions(aligned_canon)

    latt_abc = structure.lattice.abc
    latt_angles = structure.lattice.angles

    euler_angles = get_euler_angles(m_i)

    # Only the first Euler solution (theta, psi, phi) is kept.
    full_cell_parameters = np.array([latt_abc[0],latt_abc[1],latt_abc[2],latt_angles[0],latt_angles[1],latt_angles[2],
                            abc_com[0],abc_com[1],abc_com[2],euler_angles[0],euler_angles[1],euler_angles[2]])

    return full_cell_parameters, ase_canon,m_i


def right_hand_rule(m_i):
    """Return a right-handed version of the row-vector basis ``m_i``.

    The basis is right-handed when ``m_i[0] x m_i[1]`` equals ``m_i[2]`` to
    within 1e-10. A right-handed input is returned unchanged (same object);
    otherwise the first row is negated by left-multiplying with
    ``diag(-1, 1, 1)``.
    """
    tolerance = 1E-10
    handedness_error = np.linalg.norm(np.cross(m_i[0], m_i[1]) - m_i[2])

    if not handedness_error < tolerance:
        # Flip the first axis to switch handedness.
        flip_first = np.diag([-1, 1, 1])
        return flip_first @ m_i
    return m_i


def get_cell_parameters(struc):
    """Standardize ``struc`` and return its rigid-molecule parameters.

    Parameters
    ----------
    struc : pymatgen Structure
        Input crystal structure.

    Returns
    -------
    tuple
        ``(full_cell_parameters, canonical_conformer)`` as produced by
        ``get_full_cell_parameters`` on the standardized cell; the
        principal-axes matrix is dropped.
    """
    cell_processor = ProcessCell(struc)
    cell_processor.standard_cell()
    cell_processor.get_dataset()
    parameters, conformer, _principal_axes = get_full_cell_parameters(
        cell_processor.standard_structure
    )
    return parameters, conformer



def pull_dataset(path):
    """Collect cell parameters and canonical conformers for every file in ``path``.

    Parameters
    ----------
    path : str
        Directory whose entries are each passed to ``Structure.from_file``.

    Returns
    -------
    cell_parms : numpy.ndarray
        ``(n_files, 12)`` array; row ``i`` holds the parameters of the i-th
        directory entry. Rows for files that could not be parsed or processed
        are left as zeros.
    canon_confs : list
        Canonical conformer per file, ``None`` where processing failed. Use
        this to tell valid rows from zero-filled ones before computing statistics.
    """
    print('Pulling dataset statistics...')
    # List once so the array size and the iteration cannot disagree.
    files = os.listdir(path)
    Ncifs = len(files)
    cell_parms = np.zeros([Ncifs,12])
    canon_confs = [None] * Ncifs
    for i,file in enumerate(files):
        ciffile = os.path.join(path,file)
        try:
            struc = Structure.from_file(ciffile)
        except Exception:
            # Skip unreadable files and say which one was skipped.
            print('Cannot Parse {}'.format(file))
            continue
        C,conf = get_cell_parameters(struc)
        if C is None:
            continue
        cell_parms[i] = C
        canon_confs[i] = conf
    return cell_parms, canon_confs


def distort(c,averages,scale=0.00,stddev=0.05,center=0,shape=12):
    """Add scaled Gaussian noise to ``c`` in a variance-normalized space.

    ``c`` is shifted by ``averages`` and divided by ``np.std(c)**2``, noise
    drawn from ``N(0, stddev)`` times ``scale`` is added, and the transform is
    undone. With the default ``scale=0.00`` no noise is added.

    NOTE(review): the normalizer is the squared spread of ``c`` itself, across
    all of its entries — confirm this is intended rather than a per-parameter
    standard deviation.

    Parameters
    ----------
    c : numpy.ndarray
        Parameter vector to perturb.
    averages : array-like
        Offsets subtracted before and added back after the noise.
    scale : float
        Multiplier applied to the noise.
    stddev : float
        Standard deviation of the Gaussian noise.
    center : unused
    shape : int or tuple
        Shape of the noise sample.
    """
    jitter = np.random.normal(0, stddev, shape)
    spread_sq = np.std(c) ** 2
    normalized = (c - averages) / spread_sq
    perturbed = normalized + jitter * scale
    return perturbed * spread_sq + averages

def distort_std(c,stddevs,averages,scale=0.01,stddev=0.5,center=0,shape=12):
    """Perturb ``c`` with Gaussian noise scaled per parameter by ``stddevs**2``.

    ``c`` is shifted by ``averages`` and divided by ``stddevs**2``, noise drawn
    from ``N(0, stddev)`` times ``scale`` is added, and the transform is undone.

    NOTE(review): the normalizer is the squared ``stddevs`` (a variance) —
    confirm this is intended rather than the standard deviation itself.

    Parameters
    ----------
    c : numpy.ndarray
        Parameter vector to perturb.
    stddevs : numpy.ndarray
        Per-parameter spread used (squared) as the normalizer.
    averages : array-like
        Offsets subtracted before and added back after the noise.
    scale : float
        Multiplier applied to the noise.
    stddev : float
        Standard deviation of the Gaussian noise.
    center : unused
    shape : int or tuple
        Shape of the noise sample.
    """
    jitter = np.random.normal(0, stddev, shape)
    normalized = (c - averages) / stddevs ** 2
    perturbed = normalized + jitter * scale
    return perturbed * stddevs ** 2 + averages
    

In [3]:
#Standardizing cifs

#Full parameter set for Z'=1 crystal:
"""
    latt_a
    latt_b
    latt_c
    latt_alpha
    latt_beta
    latt_gamma
    com_a
    com_b
    com_c
    mol_theta
    mol_psi
    mol_phi

"""

#transform back to cartesian
"""
rotate matrix: eigenvectors of the principal moments of inertia form an orthonormal basis that represents the rigid
    body's orientation in 3D. The Euler angles describe the orientation of these vectors with respect
    to the Cartesian unit vectors. This allows us to reorient the molecule to its original placement in the lattice, as well
    as apply small perturbations to just these 3 angles for generating "false" xtals.

Using the 12 cell parameters to generate an asymmetric cell. The unit cell is regenerated using the original space group
TODO:    
    Gaussian noise/variability in params. 
"""

#Replace cell as is 
os.makedirs('replaced',exist_ok=True)

# Round trip: extract the 12 parameters from each CIF, rebuild the unit cell
# from them unchanged, and write the result to 'replaced/'.
for file in os.listdir('cif'):
    print(file)
    ciffile = os.path.join('cif',file)
    try:
        struc = Structure.from_file(ciffile)
    except Exception:
        # Same skip-on-parse-failure policy as the perturbation loop.
        print('parser failed')
        continue
    standardized = ProcessCell(struc)
    standardized.standard_cell()
    standardized.get_dataset()
    std_struc = standardized.standard_structure
    std_dataset = standardized.standard_dataset

    full_cell_parameters,canonical_conformer,m_i = get_full_cell_parameters(std_struc)

    # get_full_cell_parameters returns None values when the molecule search fails.
    if canonical_conformer is None:
        continue

    canonical_coords = canonical_conformer.get_positions()
    canonical_species = list(canonical_conformer.get_chemical_symbols())

    # Lattice and molecular orientation rebuilt purely from the parameter vector.
    matrix = params_to_matrix(full_cell_parameters[0:6])
    m_i_rev = rotate_matrix(full_cell_parameters[9:],np.eye(3,3))

    # Undo the principal-axes alignment, then go Cartesian -> fractional
    # (row-vector form of inv(matrix).T @ xyz) and shift by the fractional COM.
    rotated_coords = np.array(rotate_coords(m_i_rev,canonical_coords))
    abc_coords = rotated_coords @ np.linalg.inv(matrix) + full_cell_parameters[6:9]

    # Apply every symmetry operation (r @ abc + t) to the whole molecule at once.
    images = [abc_coords @ r.T + t
              for r,t in zip(std_dataset['rotations'],std_dataset['translations'])]
    uc_coords = np.concatenate(images)
    uc_species = canonical_species * len(images)

    new_struc = Structure(lattice=matrix,species=uc_species,coords=uc_coords)
    new_struc.to(os.path.join('replaced',file))

    


KABGUR.cif
KAGCIE.cif
XUHZEF.cif
XYANAC.cif
DIFSAO.cif
AKENOU.cif


In [4]:
#Pull the dataset statistics for distortion stuff

os.makedirs('perturbed',exist_ok=True)

cell_parms, canon_confs = pull_dataset('cif')

# pull_dataset leaves an all-zero row (and a None conformer) for every file it
# could not process; exclude those rows so they do not bias the statistics.
# If nothing was processed, fall back to the raw array.
processed_rows = np.array([conf is not None for conf in canon_confs], dtype=bool)
stat_parms = cell_parms[processed_rows] if processed_rows.any() else cell_parms

# Per-parameter mean and spread over the dataset (12 values each).
averages = stat_parms.mean(axis=0)
stddevs = stat_parms.std(axis=0)
# Avoid division by zero for parameters that never vary.
stddevs[stddevs == 0] = 1E-5


Pulling dataset statistics...


In [5]:
#Distort stuff
print('STARTING DISTORTED SET')
# For each CIF: extract the 12 parameters, perturb them with distort_std and
# write the rebuilt unit cell to 'perturbed/'.
# NOTE(review): no random seed is set, so the perturbed set differs on every run.
for file in os.listdir('cif'):
    print(file)
    ciffile = os.path.join('cif',file)
    try:
        struc = Structure.from_file(ciffile)
    except Exception:
        # Skip unreadable files without swallowing KeyboardInterrupt.
        print('parser failed')
        continue
    standardized = ProcessCell(struc)
    standardized.standard_cell()
    standardized.get_dataset()
    std_struc = standardized.standard_structure
    std_dataset = standardized.standard_dataset

    full_cell_parameters,canonical_conformer,m_i = get_full_cell_parameters(std_struc)

    # None values are returned when the molecule search fails.
    if canonical_conformer is None:
        continue

    canonical_coords = canonical_conformer.get_positions()
    canonical_species = list(canonical_conformer.get_chemical_symbols())

    distorted = distort_std(full_cell_parameters,stddevs,averages,scale=0.05)

    # Lattice and molecular orientation from the perturbed parameter vector.
    matrix = params_to_matrix(distorted[0:6])
    m_i_rev = rotate_matrix(distorted[9:],np.eye(3,3))

    # Undo the principal-axes alignment, then go Cartesian -> fractional
    # (row-vector form of inv(matrix).T @ xyz) and shift by the perturbed COM.
    rotated_coords = np.array(rotate_coords(m_i_rev,canonical_coords))
    abc_coords = rotated_coords @ np.linalg.inv(matrix) + distorted[6:9]

    # Apply every symmetry operation (r @ abc + t) to the whole molecule at once.
    images = [abc_coords @ r.T + t
              for r,t in zip(std_dataset['rotations'],std_dataset['translations'])]
    uc_coords = np.concatenate(images)
    uc_species = canonical_species * len(images)

    new_struc = Structure(lattice=matrix,species=uc_species,coords=uc_coords)
    new_struc.to(os.path.join('perturbed',file))

STARTING DISTORTED SET
KABGUR.cif
KAGCIE.cif
XUHZEF.cif
XYANAC.cif
DIFSAO.cif
AKENOU.cif


In [6]:
# Inspect the perturbed parameter vector left over from the last iteration of the perturbation loop.
distorted

array([ 8.13289706,  8.62159996, 27.89151857, 89.69679913, 93.01618232,
       91.97594987,  0.71861568,  0.6978528 , -0.10831718, -1.32606854,
       -2.83322767, -0.39418809])

In [22]:
# Inspect the unperturbed parameter vector of the last processed file.
# NOTE(review): the execution count is out of sequence with the cells above — re-run in order.
full_cell_parameters

In [70]:
# NOTE(review): `c` is never assigned at notebook scope in the visible cells (it is only a
# parameter of distort/distort_std); this cell relies on leftover kernel state.
c

array([ 3.87130000e+00,  1.27700000e+01,  9.97400000e+00,  9.00000000e+01,
        9.33970000e+01,  9.00000000e+01, -5.16913645e-02,  2.97063827e-01,
        3.11630562e-01, -1.05015053e+00,  1.25060516e+00, -2.93231987e+00])

In [71]:
# NOTE(review): `C_new` is only a local of distort/distort_std in the visible cells;
# this cell relies on leftover kernel state.
C_new

array([ 3.64617212e+00,  1.29647991e+01,  9.85146573e+00,  9.04330003e+01,
        9.36529008e+01,  9.05343384e+01, -1.35764475e-01,  8.15190959e-02,
        2.59314496e-01, -7.10855802e-01,  1.25412045e+00, -2.98831753e+00])

In [32]:
# NOTE(review): `gaussian_noise` is not defined anywhere in the visible cells;
# this cell relies on leftover kernel state.
gaussian_noise

array([ 0.09327337,  0.00446796,  0.00769386, -0.09934378, -0.006463  ,
        0.00412857, -0.03663375, -0.02819695, -0.0789602 ,  0.06380148,
        0.15123125, -0.0983077 ])

In [40]:
# NOTE(review): `noise` is only a local of distort/distort_std in the visible cells;
# this cell relies on leftover kernel state.
noise

array([-0.13224706,  0.01122095, -0.00760959, -0.0459745 ,  0.01275027,
        0.1422896 ,  0.1427657 ,  0.04481617,  0.25067536,  0.26835434,
       -0.09690747,  0.04769525])

In [41]:
# NOTE(review): `C_std` is only a local of distort/distort_std in the visible cells;
# this cell relies on leftover kernel state.
C_std

array([-1.36859972e-03, -6.27591934e-04, -4.52247048e-04,  0.00000000e+00,
       -4.63043572e-03,  0.00000000e+00, -2.65399420e-04,  2.93866748e-05,
       -8.07285164e-05, -5.46935189e-04,  8.86354820e-04, -1.22046183e-03])