# In [78]:
#Script name: zzz.combine_headers
#    Purpose: *********** FOR GA XOVER RUNS *********** 
#             Combine the rescored and pre-rescored headers for the parents and children of a crossover,
#             and order them so that the combined parents come directly before their child.
#             Also includes the SMILES strings at the end of every header.



# Import modules for our exercise
import os              # operational system interface package

# Import modules for dealing with chemical information
from rdkit import Chem as Chem
from rdkit.Chem import AllChem as Chem2
from rdkit.Chem import Descriptors as Desc
from rdkit.Chem import rdmolfiles as RDFile

# In [75]:
#Returns a list of all the lines of the molecules in a given multi-mol2
def extract_all_molecules(filename):
    '''Split a multi-molecule MOL2 file into one list of lines per molecule.

    A molecule record starts at a line containing the text "Name" and runs
    up to (but not including) the next such line, or to the end of the file.
    Lines that appear before the first "Name" line belong to no molecule and
    are discarded.

    Variable         I/O          dtype           default value?
    ------------------------------------------------------------
    filename          I           string                  None
    molecule_list     O           list of list of string  N/A

    Returns an empty list when the file contains no "Name" line.
    '''
    molecule_list = []
    # None until the first "Name" line is seen, so that a leading preamble
    # (or nothing at all) is never reported as a molecule.
    line_list = None
    with open(filename, "r") as f:
        for line in f:
            if "Name" in line:
                # A new record begins: flush the previous one, if any.
                if line_list is not None:
                    molecule_list.append(line_list)
                line_list = [line]
            elif line_list is not None:
                line_list.append(line)
    # The last record has no following "Name" line to trigger the flush,
    # so it must be stored explicitly or it would be lost.
    if line_list is not None:
        molecule_list.append(line_list)
    return molecule_list


# Function that reads multi-molecule MOL2 files. Adapted from:
# https://chem-workflows.com/articles/2019/07/18/building-a-multi-molecule-mol2-reader-for-rdkit/
# further adapted from function written by Guilherme Duarte, Rizzo Lab
def mol2_mol_supplier_loop(file):
    ''' This function extracts all the molecules in the multi-molecule
        MOL2 file `file` and returns, for every molecule, an RDKit mol
        object, its group label and its header descriptors.

        A record starts at a line containing "Group" and ends at the first
        following line containing "ROOT" (both lines included). While a
        record is open, every line containing "###" is additionally parsed
        as a `key: value` header descriptor.

        Variable                I/O     dtype                   default value?
        ----------------------------------------------------------------------
        file                     I      string                  None
        mols                     O      list                    N/A
        mols[i]                  O      rdkit.Chem.rdchem.mol   N/A
        mol_names                O      list of string          N/A
        compiled_descriptors     O      list of dict            N/A

        The three lists are returned as the tuple
        (mols, mol_names, compiled_descriptors).

        mol_names[i] is the text after the first ":" on the "Group" line.
        compiled_descriptors[i] maps the second whitespace-separated token
        of each "###" line to the text between its first and second ":".

        NOTE(review): mols[i] is whatever Chem.MolFromMol2Block returns for
        the record; presumably None when RDKit cannot parse it - confirm
        against the RDKit documentation before relying on it.
    '''
    mols=[]
    mol_names=[]
    mol_descriptors={}
    compiled_descriptors=[]
    recording=False
    # Defined up front so that a "ROOT" line seen before any "Group" line
    # cannot hit an unbound name.
    mol=[]
    with open(file, 'r') as f:
        for line in f:

            # A line containing "Group" marks the start of a molecule
            # record: remember its label and start collecting lines.
            if "Group" in line:
                mol_names.append(line.split(":")[1].strip())
                recording=True
                mol = []

            # Header lines ("###...  key: value") are stored as descriptors.
            # They are not consumed here: the final `if recording` below
            # still adds them to the text block of the molecule.
            if recording and "###" in line:
                mol_descriptors[line.split(":")[0].split()[1].strip()]=line.split(":")[1].strip()

            # A line containing "ROOT" marks the end of the open record.
            # Only an open record is closed; a stray "ROOT" line outside a
            # record is ignored so that mols and mol_names stay aligned.
            elif recording and "ROOT" in line:
                mol.append(line)
                recording=False

                # Concatenate the collected lines verbatim so the block
                # looks like the MOL2 file of a single molecule. (Joining
                # with "," and then removing every "," would also delete
                # commas that are part of the data.)
                block = "".join(mol)

                # Converts the data of a single molecule to a
                # rdkit.Chem.rdchem.mol object.
                m=Chem.MolFromMol2Block(block,
                                        sanitize=False,
                                        cleanupSubstructures=False)
                mols.append(m)
                compiled_descriptors.append(mol_descriptors)
                mol_descriptors={}
                continue

            if recording:
                mol.append(line)

    return(mols,mol_names,compiled_descriptors)
    
#Combines the headers of two files - only pulls some lines from the original mol2
#and takes everything else from the rescored file
def combined_headers(original_file,rescored_file,outfile):
    '''Merge the header of each molecule in `original_file` with the
    rescored data of the matching molecule in `rescored_file`.

    Molecules are paired purely by position in the two files. For each
    molecule the output receives:
      1. the lines of the original file from its "Name" line up to (not
         including) its "Pharmacophore_Score" line, with any
         "Descriptor_Score" line left out;
      2. from the rescored file, any "Molecular_Weight" line, and every
         line from its "Descriptor_Score" line up to the next "Name" line.

    Variable         I/O          dtype           default value?
    ------------------------------------------------------------
    original_file     I           string                  None
    rescored_file     I           string                  None
    outfile           I           string                  None

    Nothing is returned; the result is written to `outfile`. A warning is
    printed (and processing continues) when the two inputs do not hold
    the same number of molecules.
    '''
    # Molecule counts are only used for the sanity check below.
    rescored_count = len(extract_all_molecules(rescored_file))
    original_count = len(extract_all_molecules(original_file))
    if rescored_count != original_count:
        print("MISMATCH IN NUMBER OF MOLECULES - CHECK THAT YOU'RE USING THE RIGHT FILES")

    kept_headers = []
    with open(outfile, "w") as destination:
        current_header = []
        copying = False

        # Pass 1: collect the part of each original header that is kept.
        with open(original_file, "r") as source:
            for text in source:
                if "Name" in text:
                    copying = True
                if "Descriptor_Score" in text:
                    continue
                if "Pharmacophore_Score" in text:
                    # End of the kept section for this molecule.
                    kept_headers.append(current_header)
                    current_header = []
                    copying = False
                elif copying:
                    current_header.append(text)

        # Pass 2: interleave the kept headers with the rescored lines.
        # `copying` deliberately carries over from pass 1 unchanged.
        with open(rescored_file, "r") as source:
            header_index = 0
            for text in source:
                if "Name" in text:
                    # The rescored "Name" line itself is not copied; the
                    # stored original header (which has one) replaces it.
                    copying = False
                    destination.writelines(kept_headers[header_index])
                    header_index += 1
                if "Descriptor_Score" in text:
                    copying = True
                if "Molecular_Weight" in text:
                    destination.write(text)
                # NOTE(review): a "Molecular_Weight" line located after
                # "Descriptor_Score" is written twice - confirm intended.
                if copying:
                    destination.write(text)
                
##takes all of the parents and moves them before the child in a multimol2,
##and renames the child molecule to be the crossover name (p1_X_p2)
def order_parents_and_children(parent_filename,child_filename,outfile):
    '''Write every crossover child directly after its two parents.

    For each child molecule, its "Parents" line (expected to look like
    "...: <parent1>_X_<parent2>") is split into the two parent names. The
    parent file is scanned in order and every parent whose "Name" line
    contains one of the two names is written out, preceded by a "Group"
    line. Once two parents have been written, the child follows with the
    same "Group" number, and its own "Name" line is replaced by
    "<parent1>_X_<parent2>". The group number then increases by one.

    Variable           I/O          dtype           default value?
    --------------------------------------------------------------
    parent_filename     I           string                  None
    child_filename      I           string                  None
    outfile             I           string                  None

    Nothing is returned; the result is written to `outfile`.

    A child is not written, and the group number does not advance, unless
    exactly two parent matches are found.

    NOTE(review): parents are matched by substring against the "Name"
    line, so a name that is a prefix of another name can match both -
    confirm the names in use cannot collide.
    '''
    ##Orders everything as parent, parent, child, repeat.
    parent_list=extract_all_molecules(parent_filename)
    child_list=extract_all_molecules(child_filename)

    with open(outfile,"w") as of:
        grouped_mol_counter=0
        for entry in child_list:
            for line in entry:
                if "Parents" not in line:
                    continue

                crossover_name=line.split(":")[1]
                parent_pair=[crossover_name.split("_X_")[0].strip(),crossover_name.split("_X_")[1].strip()]
                # Parents and child of one crossover share the same group.
                group_line="##########                               Group: %s\n" % (grouped_mol_counter)

                parents_found=0
                for parent in parent_list:
                    if parents_found==2:
                        break
                    for pline in parent:
                        if "Name" not in pline:
                            continue
                        # A parent matching both names (a self-cross) is
                        # written once per name.
                        for parent_name in parent_pair:
                            if parent_name in pline:
                                parents_found+=1
                                of.write(group_line)
                                of.writelines(parent)
                        if parents_found==2:
                            break

                # Checked after the scan rather than at the top of the next
                # iteration, so the child is still written when the second
                # parent happens to be the last molecule in the parent file.
                if parents_found==2:
                    of.write(group_line)
                    of.write(("##########                                Name: %s_X_%s\n") % (str(parent_pair[0]),str(parent_pair[1])))
                    for childwriteline in entry:
                        # The original child name is replaced by the
                        # crossover name written just above.
                        if "Name" in childwriteline:
                            continue
                        of.write(childwriteline)
                    grouped_mol_counter+=1

#Calculates the smiles strings for all molecules in a multimol2 and appends them to the
#end of the header.
def add_smiles_strings(pre_smiles_filename,outfile):
    '''Copy a grouped multi-MOL2 file, adding a SMILES header line to
    every molecule record.

    Records run from a line containing "Group" to the next line containing
    "ROOT". Inside each record, wherever a blank line directly follows a
    line containing "####", the text
    "##########  ...  SMILES: <smiles>" is written in front of that blank
    line. No newline is added after the SMILES text, so the blank line
    that follows becomes the line ending of the SMILES entry.

    Variable              I/O          dtype           default value?
    -----------------------------------------------------------------
    pre_smiles_filename    I           string                  None
    outfile                I           string                  None

    Nothing is returned; the result is written to `outfile`. Lines outside
    any record are not copied.
    '''
    rd_mols, _group_names, _descriptors = mol2_mol_supplier_loop(pre_smiles_filename)

    # One SMILES string per molecule, in file order.
    smiles_per_mol = [Chem.MolToSmiles(rd_mol) for rd_mol in rd_mols]

    # Re-read the file to keep the raw text of every record.
    records = []
    pending = []
    with open(pre_smiles_filename, "r") as source:
        capturing = False
        for text in source:
            if "Group" in text:
                capturing = True
            if capturing:
                pending.append(text)
            if "ROOT" in text:
                capturing = False
                records.append(pending)
                pending = []

    with open(outfile, "w") as destination:
        for mol_index, record in enumerate(records):
            for position, text in enumerate(record):
                # A blank line right after a header line marks the end of
                # the header: the SMILES entry goes there.
                if text == "\n" and "####" in record[position - 1]:
                    destination.write("##########                              SMILES: " + smiles_per_mol[mol_index])

                destination.write(text)

# In [77]:
#def combined_headers(original_file,rescored_file,outfile):
#def order_parents_and_children(parent_filename,child_filename,outfile):
#def add_smiles_strings(pre_smiles_filename,outfile):


################ PARENTS COMBINE
# Inputs and output for merging the parents' original and rescored headers.
p_rescored_file="parents_rescored.mol2_scored.mol2"
p_original_file="6m71_purchasable.restart0001.mol2"
p_outfile_combined="parents_mols_combined_scores.mol2"

################ CHILDREN COMBINE
# Inputs and output for merging the children's original and rescored headers.
c_rescored_file="crossover_molecules_rescored.mol2_scored.mol2"
c_original_file="6m71_purchasable_unique_crossovergen.mol2"
c_outfile_combined="xover_mols_combined_scores.mol2"

################ ADDING SMILES
# The ordered parent/parent/child file, before and after SMILES are added.
pre_smiles_file="zzz.ordered_parents_and_children.mol2"
post_smiles_file="zzz.ordered_parents_and_children_with_smiles.mol2"

################ CALLING THE FUNCTIONS
# 1. Combine headers for the parents.
combined_headers(p_original_file,p_rescored_file,p_outfile_combined)
# 2. Combine headers for the children. This must use the children's files:
#    step 3 reads c_outfile_combined, which is only produced here.
combined_headers(c_original_file,c_rescored_file,c_outfile_combined)
# 3. Place each child directly after its two parents.
order_parents_and_children(p_outfile_combined,c_outfile_combined,pre_smiles_file)
# 4. Append a SMILES line to every header.
add_smiles_strings(pre_smiles_file,post_smiles_file)