In [2]:
import json
from dataclasses import dataclass, asdict
from timeit import default_timer
import numpy as np

In [3]:
# Read the raw 3-21G basis-set file; every entry keeps its trailing newline.
with open('basis_sets/3-21G') as f:
    lines = list(f)

In [4]:
# Strip surrounding whitespace, then drop blank lines and '!' comment lines
# in a single pass over the raw lines.
lines = [
    text
    for text in (raw.strip() for raw in lines)
    if text and not text.startswith("!")
]

In [5]:
# Show the cleaned (stripped, non-blank, non-comment) lines of the basis-set file.
lines

['{',
 'keys= { turbomole= }',
 'data= {',
 'H:3-21G',
 '{',
 '2  s',
 '5.4471780              0.1562850',
 '0.8245470              0.9046910',
 '1  s',
 '0.1831920              1.0000000',
 '}',
 'He:3-21G',
 '{',
 '2  s',
 '13.6267000              0.1752300',
 '1.9993500              0.8934830',
 '1  s',
 '0.3829930              1.0000000',
 '}',
 'Li:3-21G',
 '{',
 '3  s',
 '36.8382000              0.0696686',
 '5.4817200              0.3813460',
 '1.1132700              0.6817020',
 '2  s',
 '0.5402050             -0.2631270',
 '0.1022550              1.1433900',
 '1  s',
 '0.0285650              1.0000000',
 '2  p',
 '0.5402050              0.1615460',
 '0.1022550              0.9156630',
 '1  p',
 '0.0285650              1.0000000',
 '}',
 'Be:3-21G',
 '{',
 '3  s',
 '71.8876000              0.0644263',
 '10.7289000              0.3660960',
 '2.2220500              0.6959340',
 '2  s',
 '1.2954800             -0.4210640',
 '0.2688810              1.2240700',
 '1  s',
 '0.0773500 

In [6]:
# Load the lookup table stored in periodic_table.json
# (element symbol -> atomic number, as shown in the output below).
with open("periodic_table.json") as f:
    periodic_table = json.loads(f.read())

In [7]:
# Show the loaded element-symbol -> atomic-number mapping.
periodic_table

{'H': 1,
 'He': 2,
 'Li': 3,
 'Be': 4,
 'B': 5,
 'C': 6,
 'N': 7,
 'O': 8,
 'F': 9,
 'Ne': 10,
 'Na': 11,
 'Mg': 12,
 'Al': 13,
 'Si': 14,
 'P': 15,
 'S': 16,
 'Cl': 17,
 'Ar': 18,
 'K': 19,
 'Ca': 20,
 'Sc': 21,
 'Ti': 22,
 'V': 23,
 'Cr': 24,
 'Mn': 25,
 'Fe': 26,
 'Co': 27,
 'Ni': 28,
 'Cu': 29,
 'Zn': 30,
 'Ga': 31,
 'Ge': 32,
 'As': 33,
 'Se': 34,
 'Br': 35,
 'Kr': 36,
 'Rb': 37,
 'Sr': 38,
 'Y': 39,
 'Zr': 40,
 'Nb': 41,
 'Mo': 42,
 'Tc': 43,
 'Ru': 44,
 'Rh': 45,
 'Pd': 46,
 'Ag': 47,
 'Cd': 48,
 'In': 49,
 'Sn': 50,
 'Sb': 51,
 'Te': 52,
 'I': 53,
 'Xe': 54,
 'Cs': 55,
 'Ba': 56,
 'La': 57,
 'Ce': 58,
 'Pr': 59,
 'Nd': 60,
 'Pm': 61,
 'Sm': 62,
 'Eu': 63,
 'Gd': 64,
 'Tb': 65,
 'Dy': 66,
 'Ho': 67,
 'Er': 68,
 'Tm': 69,
 'Yb': 70,
 'Lu': 71,
 'Hf': 72,
 'Ta': 73,
 'W': 74,
 'Re': 75,
 'Os': 76,
 'Ir': 77,
 'Pt': 78,
 'Au': 79,
 'Hg': 80,
 'Tl': 81,
 'Pb': 82,
 'Bi': 83,
 'Po': 84,
 'At': 85,
 'Rn': 86,
 'Fr': 87,
 'Ra': 88,
 'Ac': 89,
 'Th': 90,
 'Pa': 91,
 'U': 92,
 'Np': 93,


In [8]:
with open("bipy.xyz", "r") as f:
    bpy = f.readlines()

# Collect the distinct element symbols found in the first column of the file.
# Lines whose first token is not a known symbol (e.g. an atom-count header)
# are ignored.
atoms = set()
for i in bpy:
    tokens = i.split()
    # A blank or whitespace-only line splits to [], so guard before indexing
    # instead of raising IndexError.
    if tokens and (atom := tokens[0]) in periodic_table:
        atoms.add(atom)

In [9]:
# Show the set of element symbols found in bipy.xyz.
atoms

{'C', 'Cu', 'F', 'H', 'N', 'O'}

In [10]:
# Same result as the explicit loop, written as a set comprehension.
# `tokens` is checked for truthiness first so blank lines (which split to [])
# are skipped instead of raising IndexError on `[0]`.
atoms = {
    tokens[0]
    for i in bpy
    if (tokens := i.split()) and tokens[0] in periodic_table
}
atoms

{'C', 'Cu', 'F', 'H', 'N', 'O'}

In [11]:
# Scratch state for brace matching, plus the whole basis-set file as one string.
matches, stack, start_index = [], [], -1

with open('basis_sets/3-21G') as f:
    file_str = f.read()

In [13]:
from typing import Union

# Shell letter -> angular momentum quantum number: s=0, p=1, d=2, f=3, g=4, h=5.
angular_momentum_dict = {
    letter: value for value, letter in enumerate("spdfgh")
}

@dataclass(init=False)
class Shell:
    """One shell of a basis set: an angular momentum plus its primitives.

    The attributes are declared as dataclass fields so that the generated
    ``__eq__`` compares real data and ``dataclasses.asdict`` returns the
    stored values. Without declared fields, every instance compared equal to
    every other one and ``asdict`` returned an empty dict.

    Args:
        angular_momentum: Angular momentum number of the shell.
        matrix: 2-D array-like; column 0 holds the exponents and column 1
            holds the coefficients.
    """

    angular_momentum: int
    exponents: list
    coefficients: list
    function_type: str
    region: str

    def __init__(self, angular_momentum: int, matrix: np.ndarray):
        self.exponents = []
        self.coefficients = []
        self.function_type = "gto"
        self.region = "valence"
        self.angular_momentum = angular_momentum
        self.parse_matrix(matrix)

    def parse_matrix(self, matrix: np.ndarray):
        """Store column 0 of ``matrix`` as exponents and column 1 as coefficients.

        Both are stored as plain Python lists so they can be serialised to JSON.
        """
        # asarray lets nested lists be passed as well as ndarrays.
        matrix = np.asarray(matrix)
        self.exponents = matrix[:, 0].tolist()
        self.coefficients = matrix[:, 1].tolist()

    def as_dict(self):
        """Return the shell as a plain dict.

        ``angular_momentum`` and ``coefficients`` are each wrapped in a
        one-element list; ``exponents`` is a flat list.
        """
        retdict = {
            "function_type": self.function_type,
            "region": self.region,
            "angular_momentum": [self.angular_momentum],
            "exponents": self.exponents,
            "coefficients": [self.coefficients]
        }
        return retdict

    def __repr__(self):
        return f"{self.as_dict()}"

@dataclass(init=False)
class Basis:
    """Accumulates per-element shells and exports them as a nested dict.

    ``elements`` maps an atomic number (as a string) to a dict holding an
    ``"electron_shells"`` list of shell dicts. It is declared as a dataclass
    field so that the generated ``__eq__`` compares the stored data.
    """

    elements: dict

    def __init__(self):
        self.elements = {}

    def add_shell(self, atom: Union[str, int], shell: dict):
        """Append one shell to ``atom``.

        Args:
            atom: Key of the element; it is converted to ``str`` before use.
            shell: Single-entry dict ``{shell_letter: matrix}``. The letter is
                looked up in ``angular_momentum_dict`` and the matrix is
                passed to ``Shell``.

        Raises:
            IndexError: If ``shell`` is empty.
            KeyError: If the shell letter is not in ``angular_momentum_dict``.
        """
        atom = str(atom)
        element = self.elements.setdefault(atom, {})
        electron_shells = element.setdefault("electron_shells", [])
        skey = list(shell)[0]
        angular_momentum = angular_momentum_dict[skey]
        electron_shells.append(Shell(angular_momentum, shell[skey]).as_dict())

    def add_element(self, atom: Union[str, int], shells: list):
        """Add every shell in ``shells`` to ``atom``.

        Args:
            atom: Atomic number (``int`` or digit string), or an element
                symbol that is translated through ``periodic_table``.
            shells: List of single-entry ``{shell_letter: matrix}`` dicts.

        Raises:
            KeyError: If ``atom`` is a symbol missing from ``periodic_table``.
        """
        # `atom is str` compared the value with the type object and was never
        # true; isinstance is the correct check. Digit strings are already
        # atomic numbers and must not be looked up as symbols.
        if isinstance(atom, str) and not atom.isdigit():
            atom = periodic_table[atom]
        atom = str(atom)
        for shell in shells:
            self.add_shell(atom, shell)

    def as_dict(self):
        """Return the basis wrapped in a ``molssi_bse_schema`` header."""
        return {
            "molssi_bse_schema": {
                "schema_type": "complete",
                "schema_version": "0.1"
            },
            "elements": dict(self.elements)
        }

def to_dict(file_str: str) -> tuple:
    """Parse a brace-delimited basis-set file into per-element shells.

    Lines starting with ``!`` are treated as comments and ignored. Every
    ``{ ... }`` block whose preceding text line contains ``:`` (for example
    ``H:3-21G``) is treated as one element; the part before the colon is
    looked up in ``periodic_table``. Inside a block, a line of the form
    ``<count>  <letter>`` starts a shell whose next ``count`` lines are
    whitespace-separated numeric rows. Every letter known to
    ``angular_momentum_dict`` is accepted, not only ``s`` and ``p``.

    Args:
        file_str: Full text of the basis-set file.

    Returns:
        ``(ret, basis_obj)``: ``ret`` maps each ``periodic_table`` value to a
        list of ``{letter: float64 ndarray}`` dicts; ``basis_obj`` is a
        ``Basis`` filled with the same shells.

    Raises:
        KeyError: If an element label is not in ``periodic_table``.
        ValueError: If a shell has fewer rows than declared, or its rows are
            not a numeric table with at least two columns.
    """

    def find_match(text: str) -> list:
        """Return ``(open_index, close_index)`` for every matched brace pair."""
        stack = []
        pairs = []
        for index, char in enumerate(text):
            if char == '{':
                stack.append(index)
            elif char == '}' and stack:
                pairs.append((stack.pop(), index))
        return pairs

    def label_before(text: str, brace_index: int) -> str:
        """Return the text on the last non-blank line before ``brace_index``."""
        stop = brace_index
        while stop > 0 and text[stop - 1].isspace():
            stop -= 1
        begin = text.rfind("\n", 0, stop) + 1
        return text[begin:stop].strip()

    def parse_shells(block: str, label: str) -> list:
        """Parse the inside of one element block into ``{letter: matrix}`` dicts."""
        rows = [row.strip() for row in block.split("\n") if row.strip()]
        shells = []
        z = 0
        while z < len(rows):
            tokens = rows[z].split()
            count = 0
            if len(tokens) == 2 and tokens[1].lower() in angular_momentum_dict:
                try:
                    count = int(tokens[0])
                except ValueError:
                    count = 0
            if count < 1:
                # Not a shell header (or an empty shell): skip this line.
                z += 1
                continue

            sp = tokens[1].lower()
            data = rows[z + 1:z + 1 + count]
            if len(data) < count:
                raise ValueError(
                    f"{label}: shell '{rows[z]}' declares {count} rows "
                    f"but only {len(data)} follow"
                )
            try:
                matrix = np.array([row.split() for row in data]).astype(np.float64)
            except ValueError as err:
                raise ValueError(
                    f"{label}: shell '{rows[z]}' has non-numeric or ragged rows"
                ) from err
            if matrix.ndim != 2 or matrix.shape[1] < 2:
                raise ValueError(
                    f"{label}: shell '{rows[z]}' needs at least two columns per row"
                )
            shells.append({sp: matrix})
            z += count + 1
        return shells

    # Drop comment lines first so braces or colons inside them cannot confuse
    # the brace matching or the label lookup.
    text = "\n".join(
        line for line in file_str.split("\n")
        if not line.lstrip().startswith("!")
    )

    ret = dict()
    basis_obj = Basis()

    # Element blocks are identified by their own label rather than by a fixed
    # number of leading wrapper blocks, so the header layout may vary.
    for start, end in sorted(find_match(text)):
        label = label_before(text, start)
        if ":" not in label:
            continue
        ckey = periodic_table[label.split(":")[0].strip()]
        shells = parse_shells(text[start + 1:end], label)
        ret[ckey] = shells
        basis_obj.add_element(ckey, shells)
    return ret, basis_obj



In [14]:
# Parse the 3-21G file. to_dict returns (per-element shell dict, Basis object);
# the first item is bound to `matches` and the Basis object to `bc`.
with open('basis_sets/3-21G') as f:
    parsed = to_dict(f.read())
matches, bc = parsed

In [26]:
# Serialise the collected basis to 3-21g.json with 4-space indentation.
with open("3-21g.json", "w") as f:
    f.write(json.dumps(bc.as_dict(), indent=4))