Install libraries

In [None]:
! pip install sympy 1.3.0 
! pip install chempy 
! pip install schnetpack
! pip install tensorflow

Import & extract trajectory file

In [None]:
import os
if not os.path.exists('./UC-MM-PBC.zip'):
    print("UC-MM-PBC.zip havent imported")
    exit()
if os.path.exists('./UC-MM-PBC.xyz'):
    !unzip ./UC-MM-PBC.zip

Import libraries

In [None]:
# open big file
# recognize integer in front --> go to line 1000th
# start read & write new small files
#   9989 jump each time
import re, math, os,timeit
import numpy as np
import pandas as pd
import chempy as cp 

Split trajectory to geometry files

In [None]:
# ----------------------------------- generate xyz geometry files from trajectory -----------------------------------
def _readXyzFrame(reader):
  '''Read one xyz frame (count line, comment line, atom lines) from an open file.

  Returns the list of atom lines, or None when the file holds no further
  complete frame.
  '''
  countLine = reader.readline()
  # tolerate blank lines between frames or at the end of the file
  while countLine and not countLine.strip():
    countLine = reader.readline()
  if not countLine:
    return None
  numAtoms = int(countLine.strip())
  reader.readline()  # comment line (e.g. " generated by VMD") is not part of the geometry
  atomLines = [reader.readline() for _ in range(numAtoms)]
  if numAtoms and not atomLines[-1]:
    return None  # end of file reached inside the frame: truncated, do not emit it
  return atomLines


def splitToMultiples(fileToRead, numGeomToSkip, maxGeomToRead):
  '''Split a multi-frame xyz trajectory into one coordinate file per geometry.

  Input:
    fileToRead     -- path of the xyz trajectory file
    numGeomToSkip  -- number of leading geometries to discard
    maxGeomToRead  -- maximum number of geometries to write
  Output:
    up to <maxGeomToRead> files ./output/geom_<i>.xyz (i starting at 0), each
    holding only the atom lines of one geometry (no count or comment line).
    Fewer files are written when the trajectory ends early.
  '''
  outputDir = os.path.join(".", "output")
  os.makedirs(outputDir, exist_ok=True)  # create 'output' folder if not existed yet

  with open(fileToRead, "r", encoding='utf-8') as fileReader:
    # discard the leading geometries
    for _ in range(numGeomToSkip):
      if _readXyzFrame(fileReader) is None:
        return

    for fileCount in range(maxGeomToRead):
      atomLines = _readXyzFrame(fileReader)
      if atomLines is None:
        break  # end of trajectory
      fileWriteName = os.path.join(outputDir, "geom_" + str(fileCount) + ".xyz")
      # "w" (not "a"): re-running must overwrite, not append a second copy
      with open(fileWriteName, "w", encoding='utf-8') as currFileWriter:
        currFileWriter.writelines(atomLines)

Split the trajectory: skip the first 10 geometries (not yet stabilized), then read the next 1000 (or any chosen number *n* of) geometries.

In [None]:
# Trajectory to split; the first 10 geometries are discarded as non-stabilized
# and the following 1000 are read.
fileToRead = "UC-MM-PBC.xyz"
splitToMultiples(
    fileToRead,
    numGeomToSkip=10,
    maxGeomToRead=1000,
)

Result check

In [None]:
# ------------------------- Check number of output files -------------------------
# List every entry of the output folder (except the Jupyter checkpoint folder)
# and print how many there are.
mydir = "output"
count = 0
for f in os.listdir(mydir):
    if f == ".ipynb_checkpoints":
        continue
    print(f)
    count += 1
print(count)

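Generate Gaussian *.inp* files. Including the water molecules is optional: the water-reading section of `genGaussianInput` is commented out by default; uncomment it to include waters.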

In [None]:
# ----------------------------------- Gaussian input configuration -----------------------------------
# Cutoff compared against the indole-water centre-of-mass distance by the
# (currently commented-out) water filter in genGaussianInput.
# NOTE(review): the original note mentions 6.00 angstrom while the value is 0 - confirm.
RADIUS = 0
# Number of leading atom lines in each geometry that belong to the indole molecule.
INDOLE_NUM_ATOM = 16
# Text written at the top of every .inp file; blank entries produce the blank
# separator lines. Change it when a different molecule is used.
_HEADER_LINES = [
    '%mem=1gb',
    '%nproc=1',
    '#P TD(NStates=3) CAM-B3LYP/6-31G(d) SP',
    '',
    'C8H7N-excited',
    '',
    '0 1',
]
HEADER = '\n'.join(_HEADER_LINES) + '\n'

In [None]:
def genGaussianInput(fileName):
  '''
    Write a Gaussian input file for one geometry file.

    Input:  fileName -- name of a coordinate file inside ./output
                        (rows of: atom label, x, y, z)
    Output: ./GaussInput/<fileName without extension>.inp holding HEADER
            followed by the first INDOLE_NUM_ATOM atom lines, with each atom
            label reduced to its first character.
    Raises: ValueError when the geometry has fewer than INDOLE_NUM_ATOM rows.

    The water section is commented out; uncomment it to also write the waters
    whose centre of mass lies within RADIUS of the indole centre of mass.
  '''
  filePath = os.path.join(".", "output",fileName)
  print("Parsing file" + filePath)
  data = pd.read_csv(filePath, sep="\\s+", header=None)
  # atomic mass looked up from the first character of the atom label
  data[0] = data[0].apply(lambda x: cp.util.periodic.mass_from_composition(cp.util.parsing.formula_to_composition(x[0])))

  np_mass = data[0].to_numpy(dtype=np.float64)
  np_x = data[1].to_numpy(dtype=np.float64)
  np_y = data[2].to_numpy(dtype=np.float64)
  np_z = data[3].to_numpy(dtype=np.float64)
  #----------------------------------------------------
  start_indx = INDOLE_NUM_ATOM
  if len(np_mass) < start_indx:
    # fail before the output file is created, so no half-written .inp is left behind
    raise ValueError("{} holds {} atoms, expected at least {}".format(filePath, len(np_mass), start_indx))

  # mass-weighted centre of the indole atoms; only the water section below uses it
  indole_mass = np_mass[:start_indx]
  indole_CoM = np.asarray([np.dot(indole_mass, axis[:start_indx])/np.sum(indole_mass)
                           for axis in (np_x, np_y, np_z)])

  # -------------------- start writing --------------------
  fileWriteName = os.path.join(".", "GaussInput",os.path.splitext(fileName)[0]+".inp")
  os.makedirs(os.path.dirname(fileWriteName), exist_ok=True) # create folder if not existed yet

  # context managers close both files even when a line fails to parse
  with open(filePath,"r",encoding='utf-8') as fileReader, open(fileWriteName,"w") as fileWriter:
    # write header
    fileWriter.write(HEADER)

    # read INDOLE over
    for _ in range(start_indx):
      line = fileReader.readline()
      first = line.split()[0]
      # count=1: rewrite only the label, never a later occurrence in the line
      line = line.replace(first,first[0],1)
      fileWriter.write(line)

    # # ----------------------------------- read WATERs over -----------------------------------
    # while start_indx < len(np_mass):
    #   '''
    #   compute CoM
    #   satisfy, write lines
    #   '''
    #   water_CoM = np.asarray([np.dot(np_mass[start_indx:start_indx+3],np_x[start_indx:start_indx+3])/np.sum(np_mass[start_indx:start_indx+3]), \
    #                           np.dot(np_mass[start_indx:start_indx+3],np_y[start_indx:start_indx+3])/np.sum(np_mass[start_indx:start_indx+3]), \
    #                           np.dot(np_mass[start_indx:start_indx+3],np_z[start_indx:start_indx+3])/np.sum(np_mass[start_indx:start_indx+3])])
    #   # ---- filter-----
    #   numline = 0
    #   if np.linalg.norm(indole_CoM - water_CoM) <= RADIUS:
    #     while numline < 3:
    #       line = fileReader.readline()
    #       first = line.split()[0]
    #       line = line.replace(first,first[0])
    #       fileWriter.write(line)
    #       numline=numline+1
    #   else:
    #     while numline < 3:
    #       line = fileReader.readline()
    #       numline=numline+1
    #   start_indx = start_indx + 3

def addEndline(fileName):
  '''
    Append one newline character to the end of the given file
    (the file is created when it does not exist yet).
  '''
  with open(fileName, 'a+') as handle:
    handle.write("\n")

Process each *xyz* input file ---> Gaussian *.inp* files

In [None]:
# ----------------------------- process each xyz input file - get Gaussian inp files ----------------------------- 
# can use pythonmultiprocessing for this
mydir = "output"
for f in os.listdir(mydir):
    if f == ".ipynb_checkpoints": 
        continue
    else:
        genGaussianInput(f)
        addEndline(f)

!zip -r GaussInput.zip GaussInput/ 

Next step: send *GaussInput.zip* to a computing cluster with Gaussian installed and run the quantum simulations there to obtain the *.log* files.





Check results after Gaussian run

In [None]:
# ---------------- check results -----------------
import os, os.path
def countFiles(directory):
    '''Return the number of regular files directly inside `directory`.

    A missing directory is reported and counted as 0, so the cell also runs
    before that directory has been created.
    '''
    if not os.path.isdir(directory):
        print(directory + " not found")
        return 0
    return sum(1 for name in os.listdir(directory)
               if os.path.isfile(os.path.join(directory, name)))

# number of files in the Gaussian input folder
DIR = './GaussInput'
print(countFiles(DIR))
# number of files in the Gaussian output folder
DIR2 = './GaussOutput'
print(countFiles(DIR2))