## Purpose of this notebook

Prior to analysis, we performed several steps to clean and format data files from experiment. The following notebook demonstrates the step-by-step process we took and provides many general-use functions that can be repurposed for other datasets. 

Input:
- `del_hits.csv`: 116K compounds with activity
- `del_inactives.csv`: 10M compounds without activity

Output:
- `total_compounds.csv`: combined file of 4.85M compounds
- `bb1_list.csv`: SMILES of all building blocks in position 1 of the library
- `bb2_list.csv`: SMILES of all building blocks in position 2 of the library
- `bb3_list.csv`: SMILES of all building blocks in position 3 of the library

In [1]:
import numpy as np
import pandas as pd
from openeye import oechem
from IPython.display import Image, display

#### Load in original datafiles

In [2]:
hits = pd.read_csv('../files/del_hits.csv')
inactives = pd.read_csv('../files/del_inactives.csv')
inactives = inactives.rename(columns={'count': 'read_count'})

# View how the information for our hit compounds is stored
display(hits.head(5))
print(len(hits))

# View how the information for our inactive compounds is stored
display(inactives.head(5))
print(len(inactives))

Unnamed: 0,bb1,bb2,bb3,structure,read_count
0,NCC1=C(F)C(Cl)=CC=C1F,COC(=O)C1=NN(CCC(=O)O)C=C1,NCCC(C1=CC=CC=C1)C1=CC=CC=C1,CN(Cc1c(F)ccc(Cl)c1F)C(=O)CCn1ccc(C(=O)NCCC(c2...,172
1,NCCOC1=CC=CC=C1,CCOC(=O)C1CCC(CC(=O)O)CC1,NCCC1CCCCCCC1,CN(CCOc1ccccc1)C(=O)CC1CCC(C(=O)NCCC2CCCCCCC2)CC1,611
2,CCCCCCN,COC(=O)COC1=C(C)C=C(C(=O)O)C=C1C,NCCC(C1=CC=C(F)C=C1)C1=CC=CO1,CCCCCCN(C)C(=O)c1cc(C)c(OCC(=O)NCCC(c2ccc(F)cc...,1401
3,NCC1=C(Cl)C=CC(F)=C1,COC(=O)C1=CSC2=C1SC(C(=O)O)=C2,CCOC1=CC=CC(CCN)=C1,CCOc1cccc(CCNC(=O)c2csc3cc(C(=O)N(C)Cc4cc(F)cc...,86
4,COC1=CC(C[C@H](NC(=O)OCC2C3=CC=CC=C3C3=C2C=CC=...,O=C(O)[C@H]1CC[C@H](C(=O)O)CC1,NCCC(C1=CC=C(F)C=C1)C1=CC=CO1,CNC(=O)[C@H](Cc1cccc(OC)c1)NC(=O)[C@H]1CC[C@H]...,380


116666


Unnamed: 0,bb1,bb2,bb3,structure,read_count
0,CC(C)(C)OC(=O)NC1CCC(N)CC1,CC1=CC(B2OC(C)(C)C(C)(C)O2)=CN=C1,CN1C(=O)C[C@@H](C(=O)O)NC1=O,CNc1ncc(-c2cncc(C)c2)c(NC2CCC(NC(=O)[C@@H]3CC(...,0
1,CCC(NC(=O)OCC1C2=C(C=CC=C2)C2=C1C=CC=C2)C(=O)O,COC(=O)C1=NC=C(C(=O)O)C=C1,CC[C@@H](N)CO,CCC(NC(=O)c1ccc(C(=O)N[C@H](CC)CO)nc1)C(=O)NC,0
2,CC(N)CN1CCCCC1,COC(=O)CC[C@@H]1CC[C@H](C(=O)O)O1,NCC1=CC(Cl)=C(C(F)(F)F)C=C1,CC(CN1CCCCC1)N(C)C(=O)[C@H]1CC[C@@H](CCC(=O)NC...,0
3,CN1CCC(CN)C1,O=C(O)C1=CN(C2CC2)C2=C(C=C(F)C(Cl)=C2)C1=O,COC1=CC(B(O)O)=CC=N1,COc1cc(-c2cc3c(cc2F)c(=O)c(C(=O)NCC2CCN(C)C2)c...,0
4,CC(C)(C)OC(=O)NC1CCNC1,CCNS(=O)(=O)C1=CC(C)=C(B2OC(C)(C)C(C)(C)O2)C=C1,O=C(O)CNC(=O)C1=C(F)C=CC=C1,CCNS(=O)(=O)c1ccc(-c2cnc(NC)nc2N2CCC(NC(=O)CNC...,0


10010000


#### Remove null entries and inactives with measured counts > 0

In [3]:
## First remove any building blocks that contain the word null
null_hits = hits[ (hits['bb1'].str.contains("Null|null")) | (hits['bb2'].str.contains("Null|null")) | (hits['bb3'].str.contains("Null|null")) ]
null_inactives = inactives[ (inactives['bb1'].str.contains("Null|null")) | (inactives['bb2'].str.contains("Null|null")) | (inactives['bb3'].str.contains("Null|null")) ]

## For inactive compounds, only consider compounds with 
count_off = inactives[inactives['read_count'] != 0]


In [4]:
## Left join dataframes and keep entries that are only in the left dataframe
hits_new = hits.merge(null_hits, how='left', indicator=True)
hits_new = hits_new.loc[hits_new['_merge'] == 'left_only']

inactives_new = inactives.merge(pd.concat([null_inactives, count_off]), how='left', indicator=True)
inactives_new = inactives_new.loc[inactives_new['_merge'] == 'left_only']

In [6]:
inactives_new

Unnamed: 0,bb1,bb2,bb3,structure,read_count,_merge
0,CC(C)(C)OC(=O)NC1CCC(N)CC1,CC1=CC(B2OC(C)(C)C(C)(C)O2)=CN=C1,CN1C(=O)C[C@@H](C(=O)O)NC1=O,CNc1ncc(-c2cncc(C)c2)c(NC2CCC(NC(=O)[C@@H]3CC(...,0,left_only
1,CCC(NC(=O)OCC1C2=C(C=CC=C2)C2=C1C=CC=C2)C(=O)O,COC(=O)C1=NC=C(C(=O)O)C=C1,CC[C@@H](N)CO,CCC(NC(=O)c1ccc(C(=O)N[C@H](CC)CO)nc1)C(=O)NC,0,left_only
2,CC(N)CN1CCCCC1,COC(=O)CC[C@@H]1CC[C@H](C(=O)O)O1,NCC1=CC(Cl)=C(C(F)(F)F)C=C1,CC(CN1CCCCC1)N(C)C(=O)[C@H]1CC[C@@H](CCC(=O)NC...,0,left_only
3,CN1CCC(CN)C1,O=C(O)C1=CN(C2CC2)C2=C(C=C(F)C(Cl)=C2)C1=O,COC1=CC(B(O)O)=CC=N1,COc1cc(-c2cc3c(cc2F)c(=O)c(C(=O)NCC2CCN(C)C2)c...,0,left_only
4,CC(C)(C)OC(=O)NC1CCNC1,CCNS(=O)(=O)C1=CC(C)=C(B2OC(C)(C)C(C)(C)O2)C=C1,O=C(O)CNC(=O)C1=C(F)C=CC=C1,CCNS(=O)(=O)c1ccc(-c2cnc(NC)nc2N2CCC(NC(=O)CNC...,0,left_only
...,...,...,...,...,...,...
10015682,N#CC1=CC(C[C@H](NC(=O)OCC2C3=CC=CC=C3C3=C2C=CC...,CC(C)(C)OC(=O)N(CCC(=O)O)CC1=CC=CC=C1,CC(CC(=O)O)C(F)(F)F,CNC(=O)[C@H](Cc1cccc(C#N)c1)NC(=O)CCN(Cc1ccccc...,0,left_only
10015683,CC(C)(C)OC(=O)NC1CCCNC1,CC1(C)OB(C2=CC3=C(C=C2)SC=N3)OC1(C)C,CC1=CN2C(C(=O)O)=C(C)N=C2C=C1,CNc1ncc(-c2ccc3scnc3c2)c(N2CCCC(NC(=O)c3c(C)nc...,0,left_only
10015684,O=C(N[C@H](CC1=CC=CS1)C(=O)O)OCC1C2=CC=CC=C2C2...,COC(=O)C1(C(=O)O)CC1C,NC1=C(OCCN2CCOCC2)C=CC=C1,CNC(=O)[C@@H](Cc1cccs1)NC(=O)C1(C(=O)Nc2ccccc2...,0,left_only
10015685,CCCC(NC(=O)OCC1C2=C(C=CC=C2)C2=C1C=CC=C2)C(=O)O,COC(=O)C1=CC=C(CCC(=O)O)[NH]1,CC(C)CN1CCC(CN)CC1,CCCC(NC(=O)CCc1ccc(C(=O)NCC2CCN(CC(C)C)CC2)[nH...,0,left_only


#### Remove compounds with building blocks containing boron

Our force fields cannot parameterize boron, meaning we are unable to generate conformers for compounds containing boron. 

In [27]:
comp_2B = hits.loc[hits['bb2'].str.contains('B')]
comp_2Br = hits.loc[hits['bb2'].str.contains('Br')]

In [31]:
def get_boron_ind(df):
    '''
    Returns the indices for compounds containing boron in any of its three building blocks
    '''
    # Pattern matching for boron by using the character ('B') also returns results with bromine ('Br')
    # so we need to remove that subset from the total results
    ind_1B = np.where(df['bb1'].apply(lambda x: 'B' in x) == True)[0]
    ind_1Br = np.where(df['bb1'].apply(lambda x: 'Br' in x) == True)[0]

    ind_2B = np.where(df['bb2'].apply(lambda x: 'B' in x) == True)[0]
    ind_2Br = np.where(df['bb2'].apply(lambda x: 'Br' in x) == True)[0]

    ind_3B = np.where(df['bb3'].apply(lambda x: 'B' in x) == True)[0]
    ind_3Br = np.where(df['bb3'].apply(lambda x: 'Br' in x) == True)[0]
    
    ind_1_B = set(ind_1B) - set(ind_1Br)
    ind_2_B = set(ind_2B) - set(ind_2Br)
    ind_3_B = set(ind_3B) - set(ind_3Br)
    
    return list(ind_1_B | ind_2_B | ind_3_B)

In [47]:
ind_1B = np.where(hits['bb1'].apply(lambda x: 'B' in x) == True)[0]
ind_1Br = np.where(hits['bb1'].apply(lambda x: 'Br' in x) == True)[0]

ind_2B = np.where(hits['bb2'].apply(lambda x: 'B' in x) == True)[0]
ind_2Br = np.where(hits['bb2'].apply(lambda x: 'Br' in x) == True)[0]

ind_3B = np.where(hits['bb3'].apply(lambda x: 'B' in x) == True)[0]
ind_3Br = np.where(hits['bb3'].apply(lambda x: 'Br' in x) == True)[0]

ind_1_B = set(ind_1B) - set(ind_1Br)
ind_2_B = set(ind_2B) - set(ind_2Br)
ind_3_B = set(ind_3B) - set(ind_3Br)

print(len(ind_1_B))
print(len(ind_2_B))
print(len(ind_3_B))

len(ind_1_B | ind_2_B | ind_3_B)

0
4290
5937


10227

In [49]:
len(get_boron_ind(hits.drop_duplicates()))

8831

In [32]:
ind_to_drop_hits = get_boron_ind(hits_new)
hits_dropped = hits_new.reset_index().drop(ind_to_drop_hits)

ind_to_drop_inactives = get_boron_ind(inactives_new)
inactives_dropped = inactives_new.reset_index().drop(ind_to_drop_inactives)

NameError: name 'hits_new' is not defined

#### Remove duplicate compounds

In [9]:
# Group by structure name and extract the index of the first entry with that structure
# (allows us to remove duplicates)
hits_dropped['RANK'] = hits_dropped.groupby('structure')['index'].rank(method='first', ascending=True)
total_hits = hits_dropped.loc[hits_dropped['RANK'] == 1.0]

inactives_dropped['RANK'] = inactives_dropped.groupby('structure')['index'].rank(method='first', ascending=True)
total_inactives = inactives_dropped.loc[inactives_dropped['RANK'] == 1.0]

In [10]:
print(len(total_hits))
print(len(total_inactives))

105543
4745191


### Deprotect building blocks
Source notebooks: 
- `notebooks/09_14_21_substructure_query_and_deprotecting.ipynb`
- `notebooks/11_18_21_deprotecting_inactive_bbs.ipynb`

A few different protecting groups were added in experiment to control building block reactions. Building blocks in the raw dataset were reported with these protecting groups attached, but we hope to avoid protecting groups from biasing similarity scoring methods. 

The groups we will deprotect are: Fmoc and nBoc for amine protection and methyl and ethyl esters for carboxylic acid protection. Although the following functions can generalize to any deprotecting reaction, as long as the proper SMIRKS string is provided.

For those unfamiliar with SMIRKS, they can be generated via the ChemDraw software
1. Draw the general reaction structure on ChemDraw
2. Use the Reaction Atom-Atom map (A->A symbol) to connect atoms in reactant to corresponding ones in product
3. Highlight the entire scheme and copy as SMILES to extract SMIRKS with relevant atoms given indices

Suggestion taken from answer given here: https://forum.knime.com/t/bioisosteric-replacement-using-smarts-knime-and-rdkit/27979/4

#### Generate list of unique building blocks 

In [11]:
# Sort by structure SMILES so that ordering stays consistent
total_hits = total_hits.sort_values(by='structure').drop(columns=['index']).reset_index(drop=True)
total_inactives = total_inactives.sort_values(by='structure').drop(columns=['index']).reset_index(drop=True)

In [12]:
# Create a dataframe of all unique building blocks
hit_bbs = set(total_hits['bb1']) | set(total_hits['bb2']) | set(total_hits['bb3'])
nohit_bbs = set(total_inactives['bb1']) | set(total_inactives['bb2']) | set(total_inactives['bb3'])

all_bbs = pd.DataFrame({'SMILES': list(hit_bbs | nohit_bbs)})

In [13]:
def pg_query(compound_SMILES, pg):
    '''
    Returns a list of SMILES of all compounds containing the protecting group of interest
    '''
    ss = oechem.OESubSearch(pg)
    
    found_mols = []
    for smi in compound_SMILES:
        mol = oechem.OEGraphMol()
        oechem.OESmilesToMol(mol, smi)
        oechem.OEPrepareSearch(mol, ss)
        if ss.SingleMatch(mol):
            found_mols.append(smi)
        mol.Clear()
    return found_mols

def has_pg(compound_SMILES, pg):
    '''
    Returns True if the SMILES string contains the protecting group of interest
    '''
    ss = oechem.OESubSearch(pg)
    mol = oechem.OEGraphMol()
    oechem.OESmilesToMol(mol, compound_SMILES)
    oechem.OEPrepareSearch(mol, ss)
    return ss.SingleMatch(mol)

def deprotectGroup(compound_smi, pg_SMIRKS):
    '''
    Returns the SMILES of the deprotected compound after SMIRKS reaction. If the protecting group is not present
    in the compound, the input SMILES is returned.
    '''
    libgen = oechem.OELibraryGen(pg_SMIRKS)
    libgen.SetValenceCorrection(True)

    ## Rewrite the SMILES to remove kekulization for Fmoc specifically
    mol = oechem.OEGraphMol()
    oechem.OESmilesToMol(mol, compound_smi)
    rewrite_smi = oechem.OECreateIsoSmiString(mol)
    
    new_mol = oechem.OEGraphMol()
    oechem.OEParseSmiles(new_mol, rewrite_smi)
    libgen.SetStartingMaterial(new_mol, 0)

    if libgen.NumPossibleProducts() > 0:
        for product in libgen.GetProducts():
            new_smi = oechem.OECreateIsoSmiString(product)
            
        ## If a different pattern than expected got caught by the query and split
        ## we would prefer to just leave that compound as is
        if '.' in new_smi:
            return rewrite_smi
        else:
            return new_smi
            
    return rewrite_smi


In [14]:
# SMILES and SMIRKS patterns extracted by drawing functional groups and reactions in ChemDraw
fmoc = 'O=C([N])OCC1c2ccccc2c3ccccc31'
nboc = 'CC(C)(C)OC([N])=O'
methyl_ester = 'C(OC)=O'
ethyl_ester = 'C(OCC)=O'

subsearch_patterns = {'fmoc': fmoc, 'nboc': nboc, 'methyl_ester': methyl_ester, 'ethyl_ester': ethyl_ester}

nboc_SMIRKS = 'O=C(OC(C)(C)C)[N:1]>>[N:1]'
fmoc_SMIRKS = 'O=C([N:1])OCC1c2ccccc2c3ccccc31>>[N:1]'
methyl_ester_SMIRKS = '[C:1]([O:3]C)=[O:2]>>[C:1]([OH:3])=[O:2]'
ethyl_ester_SMIRKS = '[C:1]([O:3]CC)=[O:2]>>[C:1]([OH:3])=[O:2]'

SMIRKS_patterns = {'nboc': nboc_SMIRKS, 'fmoc': fmoc_SMIRKS, 'methyl_ester': methyl_ester_SMIRKS, 'ethyl_ester': ethyl_ester_SMIRKS}


In [15]:
def return_deprotect(data, subsearch, SMIRKS):
    '''
    Returns a table of all SMILES with an additional column of their deprotected SMILES
    '''
    table = pd.DataFrame(columns=['SMILES', 'nBoc', 'Fmoc', 'ethyl_ester', 'methyl_ester', 'deprot_SMILES'])
    table['SMILES'] = data['SMILES']
    table['deprot_SMILES'] = table['SMILES']
    
    ## Record whether a given building block has each of the following protecting groups
    table['nBoc'] = table['SMILES'].apply(lambda x: has_pg(x, subsearch['nboc']))
    table['Fmoc'] = table['SMILES'].apply(lambda x: has_pg(x, subsearch['fmoc']))
    table['ethyl_ester'] = table['SMILES'].apply(lambda x: has_pg(x, subsearch['ethyl_ester']))
    table['methyl_ester'] = table['SMILES'].apply(lambda x: has_pg(x, subsearch['methyl_ester']))
    
    ## Sequentially try to deprotect groups and save finalized structure to the column "deprot_SMILES"
    nboc_deprot = table.loc[table['nBoc']]['deprot_SMILES'].apply(lambda x: deprotectGroup(x, SMIRKS['nboc']))
    table.loc[table['nBoc'], 'deprot_SMILES'] = nboc_deprot

    fmoc_deprot = table.loc[table['Fmoc']]['deprot_SMILES'].apply(lambda y: deprotectGroup(y, SMIRKS['fmoc']))
    table.loc[table['Fmoc'], 'deprot_SMILES'] = fmoc_deprot

    ethyl_deprot = table.loc[table['ethyl_ester']]['deprot_SMILES'].apply(lambda w: deprotectGroup(w, SMIRKS['ethyl_ester']))
    table.loc[table['ethyl_ester'], 'deprot_SMILES'] = ethyl_deprot

    methyl_deprot = table.loc[table['methyl_ester']]['deprot_SMILES'].apply(lambda z: deprotectGroup(z, SMIRKS['methyl_ester']))
    table.loc[table['methyl_ester'], 'deprot_SMILES'] = methyl_deprot
    
    return table


In [16]:
# Call function to return a dataframe with SMILES of deprotected building blocks
all_bbs_deprot = return_deprotect(all_bbs, subsearch_patterns, SMIRKS_patterns)
display(all_bbs_deprot.head(5))
print(len(all_bbs_deprot))

Unnamed: 0,SMILES,nBoc,Fmoc,ethyl_ester,methyl_ester,deprot_SMILES
0,CC1=NN=C(C2=CC=C(N)C=C2)[NH]1,False,False,False,False,CC1=NN=C(C2=CC=C(N)C=C2)[NH]1
1,O=C(O)C1=C(C2=CC=C(Cl)C=C2)OC=N1,False,False,False,False,O=C(O)C1=C(C2=CC=C(Cl)C=C2)OC=N1
2,COC(=O)C1=C2C=CC=C(C(=O)O)C2=CC=C1,False,False,False,True,c1cc2c(cccc2C(=O)O)c(c1)C(=O)O
3,NCC(C1=C(F)C=CC=C1)N1CCCC1,False,False,False,False,NCC(C1=C(F)C=CC=C1)N1CCCC1
4,COC1=CC2=C(C=C1)C(N)CC(C)(C)O2,False,False,False,False,COC1=CC2=C(C=C1)C(N)CC(C)(C)O2


5142


In [17]:
### Check that all compounds with a PG are successfully deprotected
table = all_bbs_deprot
print('for all BBs in hit compounds')
print('nBoc check:', len(np.where((table.loc[table['nBoc'], 'SMILES'] == table.loc[table['nBoc'], 'deprot_SMILES']) == True)[0]))
print('Fmoc check:', len(np.where((table.loc[table['Fmoc'], 'SMILES'] == table.loc[table['Fmoc'], 'deprot_SMILES']) == True)[0]))
print('methyl_ester check:', len(np.where((table.loc[table['methyl_ester'], 'SMILES'] == table.loc[table['methyl_ester'], 'deprot_SMILES']) == True)[0]))
print('ethyl_ester check:', len(np.where((table.loc[table['ethyl_ester'], 'SMILES'] == table.loc[table['ethyl_ester'], 'deprot_SMILES']) == True)[0]))


for all BBs in hit compounds
nBoc check: 0
Fmoc check: 0
methyl_ester check: 0
ethyl_ester check: 0


#### Combine active and inactive compounds and merge deprotected BBs into dataframe

In [18]:
# Merge cleaned active and inactive compounds into a single dataframe
total_compounds = pd.concat([total_hits, total_inactives])
total_compounds = total_compounds.drop(columns=['_merge', 'RANK']).reset_index(drop=True)
display(total_compounds.head(5))
print(len(total_compounds))

Unnamed: 0,bb1,bb2,bb3,structure,read_count
0,C=CCC(O)(CN)CC=C,COC(=O)C1(C)CC(C(=O)O)C1,NCCC(C1=CC=C(F)C=C1)C1=CC=CO1,C=CCC(O)(CC=C)CN(C)C(=O)C1CC(C)(C(=O)NCCC(c2cc...,198
1,C=CCC(O)(CN)CC=C,COC(=O)C1(C)CC(C(=O)O)C1,NCCC(C1=CC=CC=C1)C1=CC=CC=C1,C=CCC(O)(CC=C)CN(C)C(=O)C1CC(C)(C(=O)NCCC(c2cc...,121
2,C=CCC(O)(CN)CC=C,COC(=O)C1C(F)(F)C12CC(C(=O)O)C2,NCCC(C1=CC=C(F)C=C1)C1=CC=CO1,C=CCC(O)(CC=C)CN(C)C(=O)C1CC2(C1)C(C(=O)NCCC(c...,85
3,C=CCC(O)(CN)CC=C,COC(=O)C1CC2(CC(C(=O)O)C2)C1,CC(C)CC1(CN)CCC1,C=CCC(O)(CC=C)CN(C)C(=O)C1CC2(CC(C(=O)NCC3(CC(...,108
4,C=CCC(O)(CN)CC=C,COC(=O)C1CC2(CC(C(=O)O)C2)C1,NCC1(C2=CC=C(Cl)C=C2)CCC1,C=CCC(O)(CC=C)CN(C)C(=O)C1CC2(CC(C(=O)NCC3(c4c...,1587


4850734


In [22]:
# Replace the SMILES of the building blocks at each position with the deprotected version (if applicable)
# In each line, we match up the SMILES of the BB at that position with the SMILES of the deprotected version of that BB
total_deprot = pd.merge(total_compounds[['bb1', 'bb2', 'bb3', 'structure', 'read_count']], all_bbs_deprot[['SMILES', 'deprot_SMILES']], how='left', left_on='bb1', right_on='SMILES')\
        .drop(columns=['SMILES', 'bb1']).rename(columns={'deprot_SMILES': 'bb1'})\
        .merge(all_bbs_deprot[['SMILES', 'deprot_SMILES']], left_on='bb2', how='left', right_on='SMILES')\
        .drop(columns=['SMILES', 'bb2']).rename(columns={'deprot_SMILES': 'bb2'})\
        .merge(all_bbs_deprot[['SMILES', 'deprot_SMILES']], left_on='bb3', how='left', right_on='SMILES')\
        .drop(columns=['SMILES', 'bb3']).rename(columns={'deprot_SMILES': 'bb3'})

total_deprot.to_csv('../../4_calculating_pactive_values/files/total_compounds.csv', index=False)
        

#### Export list of unique BBs at each position for conformer generation

In [20]:
bb1_list = pd.DataFrame({'SMILES': np.unique(total_deprot['bb1'])})
bb2_list = pd.DataFrame({'SMILES': np.unique(total_deprot['bb2'])})
bb3_list = pd.DataFrame({'SMILES': np.unique(total_deprot['bb3'])})

In [21]:
# These output files will be the inputs for the next step, so we can save into that directory
bb1_list.to_csv('../../2_generating_conformers/files/bb1_list.csv', index=False)
bb2_list.to_csv('../../2_generating_conformers/files/bb2_list.csv', index=False)
bb3_list.to_csv('../../2_generating_conformers/files/bb3_list.csv', index=False)