In [5]:
" Get GDB9 (QM9) Molecular properties """

# import libraries
import os
import pandas as pd
from pyscf.data import nist

#conversion en eV
au2ev = nist.HARTREE2EV
# paths to data
main_dir = os.getcwd()
path_to_gdb9 = os.path.join(main_dir + '/GDB-9') 
path_to_unch = os.path.join(main_dir + '/uncharacterized.txt')

# open and read the uncharacterized.txt file, changing the encoding to 'utf-8'
unch_file = open(path_to_unch, encoding='utf-8')

# read the lines in the file
lin = unch_file.readlines()

# create an empty list for storing uncharacterized molecules via their ids
unch_ids = []

# store uncharacterized molecules
for i, line in enumerate(lin):
    if i >= 9:
       mol = line.split()
       ids = mol[0]
       
       unch_ids.append(ids)
       
# get gdb9 data files as ordered list
gdb9_files = sorted([f for f in os.listdir(path_to_gdb9) if f.endswith('.xyz')])

# collect gdb9 properties
gdb9_prop = []
for i, file in enumerate(gdb9_files):
    gdb9_path = os.path.join(path_to_gdb9, file)
    
    # verbose 
    if i%10000 == 0:
       print('Processing the file: %s' %gdb9_path)
       
    # open files with 'utf-8' encoding
    fp = open(gdb9_path, encoding='utf-8')
    # read the lines in the file
    l = fp.readlines()
    prop = l[1].split()
    _smiles = l[-2].split()
    
    # get molecular id, homo, lumo, gap and smiles
    id, homo, lumo, gap = f'{prop[0]} {prop[1]}', prop[7], prop[8], prop[9]
    smiles = _smiles[0]
    HOMO=float(homo)*au2ev
    LUMO=float(lumo)*au2ev
    GAP=float(gap)*au2ev
    # remove uncharacterized molecules
    if id in unch_ids: continue
    
    # fill the list
    gdb9_prop.append((id, smiles, float(homo), float(lumo), float(gap),HOMO,LUMO,GAP))
    
# create a dataframe for property storage
gdb9_df = pd.DataFrame(gdb9_prop)
gdb9_df.columns = ['smiles_key','SMILES','HOMO', 'LUMO', 'Gap','HOMO(eV)', 'LUMO(eV)', 'Gap(eV)'] 

gdb9_df.to_pickle(main_dir + '/GDB9Prop.pkl')  
gdb9_df.to_csv('validationGDB9.csv')

Processing the file: /home/mvotokps/Downloads/NEW xTB_Crest Validation Modele/GDB-9/dsgdb9nsd_000001.xyz
Processing the file: /home/mvotokps/Downloads/NEW xTB_Crest Validation Modele/GDB-9/dsgdb9nsd_010001.xyz
Processing the file: /home/mvotokps/Downloads/NEW xTB_Crest Validation Modele/GDB-9/dsgdb9nsd_020001.xyz
Processing the file: /home/mvotokps/Downloads/NEW xTB_Crest Validation Modele/GDB-9/dsgdb9nsd_030001.xyz
Processing the file: /home/mvotokps/Downloads/NEW xTB_Crest Validation Modele/GDB-9/dsgdb9nsd_040001.xyz
Processing the file: /home/mvotokps/Downloads/NEW xTB_Crest Validation Modele/GDB-9/dsgdb9nsd_050001.xyz
Processing the file: /home/mvotokps/Downloads/NEW xTB_Crest Validation Modele/GDB-9/dsgdb9nsd_060001.xyz
Processing the file: /home/mvotokps/Downloads/NEW xTB_Crest Validation Modele/GDB-9/dsgdb9nsd_070001.xyz
Processing the file: /home/mvotokps/Downloads/NEW xTB_Crest Validation Modele/GDB-9/dsgdb9nsd_080001.xyz
Processing the file: /home/mvotokps/Downloads/NEW xTB_C

In [6]:
df1=gdb9_df

Unnamed: 0,smiles_key,SMILES,HOMO,LUMO,Gap,HOMO(eV),LUMO(eV),Gap(eV)
0,gdb 1,C,-0.3877,0.1171,0.5048,-10.549854,3.186453,13.736308
1,gdb 2,N,-0.2570,0.0829,0.3399,-6.993326,2.255824,9.249150
2,gdb 3,O,-0.2928,0.0687,0.3615,-7.967494,1.869422,9.836916
3,gdb 4,C#C,-0.2845,0.0506,0.3351,-7.741639,1.376896,9.118535
4,gdb 5,C#N,-0.3604,0.0191,0.3796,-9.806984,0.519737,10.329442
...,...,...,...,...,...,...,...,...
133880,gdb 133881,C1C2C3C4C5OC14C5N23,-0.2254,0.0588,0.2842,-6.133446,1.600029,7.733476
133881,gdb 133882,C1N2C3C2C2C4OC12C34,-0.2393,0.0608,0.3002,-6.511685,1.654452,8.168858
133882,gdb 133883,C1N2C3C4C5C2C13CN45,-0.2233,0.0720,0.2953,-6.076302,1.959220,8.035522
133883,gdb 133884,C1N2C3C4C5CC13C2C45,-0.2122,0.0881,0.3003,-5.774256,2.397323,8.171579


In [None]:
#counter = 0
df1=df1[(df1['HOMO(eV)']< 0) &(df1['LUMO(eV)']<0)]
# # Utiliser une boucle for pour filtrer les données et les ajouter à la nouvelle DataFrame
# for i in range(len(df1)):
#     df=df1.loc[i, 'HOMO(eV)'] < 0 and df1.loc[i, 'LUMO(eV)'] < 0
#     df.loc[counter] = df1.loc[i]
#     counter += 1

# # Couper la DataFrame à la taille réelle des données filtrées
# df = df[:counter]

# # Afficher la nouvelle DataFrame filtrée
# print(df)
df1.to_csv('best_GDB9.csv', index=False)