Pulling Data from Multiple Files
=========================

In [1]:
import glob
import os
import re

import pandas as pd

In [2]:
# Glob pattern for the M0001 entry inside every "*.spardir" directory
# under data/rxnpredict/spartan_molecules.
SPARTAN_FOLDER_GLOB = "data/rxnpredict/spartan_molecules/*.spardir/M0001"
folders = glob.glob(SPARTAN_FOLDER_GLOB)

In [3]:
# Sanity check: count how many M0001 folders the glob pattern matched.
len(folders)

45

In [4]:
# Regexes used to scrape each "output" file.
#   number_pattern: "<label>: <decimal number>" pairs. The label must start
#       with a letter and the number must contain a decimal point.
#   name_pattern:   the molecule name, i.e. the "<name>.spardir" directory
#       directly under "spartan_molecules".
# Raw strings keep "\s", "\d" and "\." from being read as (invalid) Python
# string escapes.
# NOTE(review): the key group is greedy, so a label can swallow an earlier
# colon on the same line (e.g. "Surface computation Wall Time: 000:00").
# Confirm whether that is intended before tightening the pattern.
# `pattern` is kept as an alias in case other cells refer to it.
number_pattern = pattern = re.compile(r"(?P<key>[A-Za-z].*):\s*(?P<value>-?\d*\.\d+)\s+")
# Accept "/" as well as "\" so the paths glob returns on Windows match too.
name_pattern = re.compile(r"spartan_molecules[\\/](.*)\.spardir[\\/]")

# One single-row frame per molecule; they are concatenated once after the
# loop instead of growing `full_df` on every iteration.
molecule_frames = []

for folder in folders:
    name = name_pattern.findall(folder)[0]
    print(name)

    file_path = os.path.join(folder, "output")

    with open(file_path) as f:
        data = f.read()

    match_list = [match.groupdict() for match in number_pattern.finditer(data)]

    # Naming the columns up front keeps "key" available even when a file
    # produced no matches, so the molecule still gets a (name-only) row.
    df = (
        pd.DataFrame(match_list, columns=["key", "value"])
        .drop_duplicates(subset="key")  # keep the first occurrence of each key
        .set_index("key")
        .T  # keys become columns; the single row is labelled "value"
    )
    df["name"] = name

    molecule_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when
# no folders were found.
full_df = pd.concat(molecule_frames) if molecule_frames else pd.DataFrame()
        

1-bromo-4-ethylbenzene
3-methyl-5-phenylisoxazole
ethyl-5-methylisoxazole-3-carboxylate
MTBD
5-phenylisoxazole
ethyl-5-methylisoxazole-4-carboxylate
benzo[c]isoxazole
ethyl-3-methylisoxazole-5-carboxylate
methyl-5-(thiophen-2-yl)isoxazole-3-carboxylate
3-methylisoxazole
N,N-dibenzylisoxazol-5-amine
XPhos
ethyl-isoxazole-3-carboxylate
3-bromopyridine
ethyl-isoxazole-4-carboxylate
methyl-5-(furan-2-yl)isoxazole-3-carboxylate
5-phenyl-1,2,4-oxadiazole
1-bromo-4-(trifluoromethyl)benzene
1-iodo-4-(trifluoromethyl)benzene
methyl-isoxazole-5-carboxylate
benzo[d]isoxazole
1-iodo-4-methoxybenzene
1-ethyl-4-iodobenzene
3-chloropyridine
t-BuXPhos
AdBrettPhos
N,N-dibenzylisoxazol-3-amine
ethyl-3-methoxyisoxazole-5-carboxylate
4-phenylisoxazole
3-iodopyridine
5-methylisoxazole
1-chloro-4-methoxybenzene
5-methyl-3-(1H-pyrrol-1-yl)isoxazole
1-chloro-4-(trifluoromethyl)benzene
3-phenylisoxazole
2-bromopyridine
t-BuBrettPhos
1-chloro-4-ethylbenzene
BTMG
5-(2,6-difluorophenyl)isoxazole
1-bromo-4-methoxy

In [5]:
# Display the combined table: one row per molecule folder, one column per parsed key.
full_df

key,Conformer Program CPU Time,Conformer Program Wall Time,SCF total energy,Quantum Calculation CPU Time : 49,Quantum Calculation Wall Time: 59,Memory Used,Semi-Empirical Program CPU Time,Semi-Empirical Program Wall Time,Surface computation Wall Time: 000:00,Surface computation CPU Time: 000:00,...,C12 H35,H15,N4,N5,N6,N7,P1 C5,P1 P2,N1 H7,N1 H20
value,0.28,0.07,-2884.1616715,15.41,29.99,870.98,0.33,0.03,4.2,4.2,...,,,,,,,,,,
value,0.3,0.08,-516.4202748,,,1.385,0.39,0.04,5.3,5.0,...,,,,,,,,,,
value,0.3,0.07,-552.5505976,,,1.187,0.38,0.04,4.4,4.2,...,,,,,,,,,,
value,0.3,0.09,-478.1320714,,,1.668,0.36,0.05,5.2,5.2,...,,,,,,,,,,
value,0.3,0.08,-477.0990963,,,1.047,0.36,0.04,4.0,3.9,...,,,,,,,,,,
value,0.27,0.07,-552.5579025,,,1.187,0.33,0.04,4.3,4.3,...,,,,,,,,,,
value,0.3,0.06,-399.6653105,,,646.52,0.33,0.03,2.4,2.4,...,,,,,,,,,,
value,0.31,0.08,-552.5506634,,,1.187,0.33,0.04,4.5,4.4,...,,,,,,,,,,
value,0.3,0.07,-1025.7150874,,,1.66,0.44,0.05,7.8,7.5,...,,,,,,,,,,
value,0.31,0.06,-285.3558148,,,325.77,0.33,0.02,1.4,1.4,...,,,,,,,,,,


In [6]:
# Keep only the columns that hold a non-missing value in at least this many rows.
MIN_NON_NULL_ROWS = 40
df_trim = full_df.dropna(axis="columns", thresh=MIN_NON_NULL_ROWS)

In [7]:
# Summarize the remaining columns with their non-null counts and dtypes.
df_trim.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45 entries, value to value
Data columns (total 41 columns):
 #   Column                                    Non-Null Count  Dtype 
---  ------                                    --------------  ----- 
 0   Conformer Program CPU Time                41 non-null     object
 1   Conformer Program Wall Time               41 non-null     object
 2   SCF total energy                          45 non-null     object
 3   Memory Used                               40 non-null     object
 4   Semi-Empirical Program CPU Time           40 non-null     object
 5   Semi-Empirical Program Wall Time          40 non-null     object
 6   Surface computation Wall Time: 000:00     42 non-null     object
 7   Surface computation CPU Time: 000:00      41 non-null     object
 8   QSAR CPU Time:  000:00                    45 non-null     object
 9   QSAR Wall Time: 000:00                    42 non-null     object
 10  Molecular volume                          41 non-n

In [8]:
# Display the trimmed table.
df_trim

key,Conformer Program CPU Time,Conformer Program Wall Time,SCF total energy,Memory Used,Semi-Empirical Program CPU Time,Semi-Empirical Program Wall Time,Surface computation Wall Time: 000:00,Surface computation CPU Time: 000:00,QSAR CPU Time: 000:00,QSAR Wall Time: 000:00,...,Qxx,Qyy,Qzz,Qxy,Qxz,Qyz,RMS fit,Properties CPU Time,Properties Wall Time,name
value,0.28,0.07,-2884.1616715,870.98,0.33,0.03,4.2,4.2,0.7,4.9,...,1.741376,-2.157368,0.415992,-0.015867,-0.006774,0.081263,2.781968,0.77,0.45,1-bromo-4-ethylbenzene
value,0.3,0.08,-516.4202748,1.385,0.39,0.04,5.3,5.0,0.7,6.2,...,1.408209,0.137213,-1.545421,-0.000178,-0.109366,0.095875,1.724891,0.78,0.51,3-methyl-5-phenylisoxazole
value,0.3,0.07,-552.5505976,1.187,0.38,0.04,4.4,4.2,0.7,5.0,...,1.131328,0.773168,-1.904496,-0.292758,0.144527,0.249134,1.761835,0.72,0.45,ethyl-5-methylisoxazole-3-carboxylate
value,0.3,0.09,-478.1320714,1.668,0.36,0.05,5.2,5.2,0.7,6.4,...,-0.006387,0.002471,0.003917,-0.021038,-0.010289,-0.004669,2.126557,0.91,0.58,MTBD
value,0.3,0.08,-477.0990963,1.047,0.36,0.04,4.0,3.9,0.7,4.8,...,1.001715,0.253612,-1.255327,-0.00977,-0.088235,-0.027177,1.868316,0.8,0.45,5-phenylisoxazole
value,0.27,0.07,-552.5579025,1.187,0.33,0.04,4.3,4.3,0.7,5.0,...,0.164423,1.458377,-1.6228,0.111382,-0.214553,0.073053,1.800951,0.73,0.48,ethyl-5-methylisoxazole-4-carboxylate
value,0.3,0.06,-399.6653105,646.52,0.33,0.03,2.4,2.4,0.6,3.0,...,-0.466435,1.975396,-1.50896,-0.07236,-0.017347,0.066108,1.785648,0.63,0.31,benzo[c]isoxazole
value,0.31,0.08,-552.5506634,1.187,0.33,0.04,4.5,4.4,0.7,5.4,...,1.318543,0.050256,-1.368799,-0.074529,0.062812,0.252966,1.809514,0.72,0.46,ethyl-3-methylisoxazole-5-carboxylate
value,0.3,0.07,-1025.7150874,1.66,0.44,0.05,7.8,7.5,0.8,8.6,...,1.062404,0.155307,-1.21771,0.083239,-0.008575,0.099978,2.192071,1.0,0.73,methyl-5-(thiophen-2-yl)isoxazole-3-carboxylate
value,0.31,0.06,-285.3558148,325.77,0.33,0.02,1.4,1.4,0.6,2.0,...,0.388443,1.049275,-1.437718,-0.586958,-0.000968,0.02065,1.602218,0.5,0.22,3-methylisoxazole
