In [185]:
# Load the compound table: each line of smiles.txt is "<name>\t<SMILES>".
# The result maps the first tab-separated field to the second one.
cmps = {}
# A context manager closes the file handle even if a record fails to parse.
with open('smiles.txt','r') as smiles_file:
    for line_number, line in enumerate(smiles_file, start=1):
        tmp = line.strip().split('\t')
        if tmp == ['']:
            # Blank line (for example a trailing newline): nothing to store.
            continue
        if len(tmp) < 2:
            # Fail with the offending line instead of a bare IndexError.
            raise ValueError(
                "smiles.txt line %d: expected '<name><TAB><SMILES>', got %r"
                % (line_number, line)
            )
        cmps[tmp[0]] = tmp[1]
# Last expression of the cell: display the loaded mapping.
cmps

{'ACT-451840': 'CC(C)(C)C1=CC=C(C=C1)S(=O)(=O)NC2=C(C(=NC(=N2)C3=NC=CC=N3)OCCO)OC4=CC=CC=C4OC',
 'AN10248': 'OBr1OCC2=CC=C(OC3=CN=C(C(O)=O)C=N3)C(C)=C21',
 'AN13762': 'B1(C2=C(CO1)C=CC(=C2C)OC3=NC=C(N=C3)C(=O)N4CC(C4)(C)O)O',
 'AN13956': 'OBr1OCC2=CC=C(OC3=CN=C(C(N4CC(C(O)CC)C4)=O)C=N3)C(C)=C21',
 'Antimycin A': 'CCCCCCC1C(C(OC(=O)C(C(OC1=O)C)NC(=O)C2=C(C(=CC=C2)NC=O)O)C)OC(=O)CC(C)C',
 'Artemisinin': 'CC1CCC2C(C(=O)OC3C24C1CCC(O3)(OO4)C)C',
 'Atovaquone': 'OC1=C([C@H]2CC[C@@H](CC2)C2=CC=C(Cl)C=C2)C(=O)C2=CC=CC=C2C1=O',
 'BCH070': 'C1=CC=C(C=C1)OC2=CC=C(C=C2)NC3=NC(=NC4=CC=CC=C43)C5=CC=C(C=C5)Cl',
 'BI-2536': 'CCC1C(=O)N(C2=CN=C(N=C2N1C3CCCC3)NC4=C(C=C(C=C4)C(=O)NC5CCN(CC5)C)OC)C',
 'BMS983970': 'C1CC1CC(C(CCC(F)(F)F)C(=O)NC2C(=O)NC3=C(C=CC=C3F)C(=N2)C4=CC=CC=C4)C(=O)N',
 'BRD1095': 'COC1=CC=C(NC(=O)N2CCCCN3[C@H](CN)[C@@H]([C@@H]3C2)C2=CC=C(C=C2)C#CC2=CC=CC=C2)C=C1',
 'BRD3444': 'COC1=CC=C(C=C1)NC(=O)N2CCCCN3C(C2)C(C3CO)C4=CC=C(C=C4)C#CC5=CC=CC=C5',
 'Bortezomib': 'B(C(CC(C)C)NC(=O)C(C

## RDKit functions

In [247]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import MACCSkeys
from rdkit.Chem import MCS
from rdkit.Chem import rdFMCS
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.rdFMCS import FindMCS

In [None]:
def getMWfromSMI(smi):
    """Return the exact molecular weight of the molecule described by a SMILES.

    Parameters
    ----------
    smi : str
        SMILES string to parse with ``Chem.MolFromSmiles``.

    Returns
    -------
    float
        Value computed by RDKit's ``Descriptors.ExactMolWt``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smi`` (``MolFromSmiles`` returned ``None``).
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; stop here
        # with a readable message instead of failing inside the descriptor call.
        raise ValueError("could not parse SMILES: %r" % (smi,))
    return Descriptors.ExactMolWt(mol)

In [None]:
def getCanonical(smiset):
    """Round-trip every SMILES in ``smiset`` through RDKit.

    Each entry is parsed with ``Chem.MolFromSmiles`` and written back with
    ``Chem.MolToSmiles(mol, True)``. The results are returned as a new list
    in the same order as the input.
    """
    return [
        Chem.MolToSmiles(Chem.MolFromSmiles(entry), True)
        for entry in smiset
    ]

In [237]:
def getTanimotoSameSet(smiset, min_score=0.95):
    """Average the pairwise similarity scores inside one set of SMILES.

    Every unordered pair ``(smiset[i], smiset[j])`` with ``i < j`` is scored
    with ``tanimoto_calc``. Scores of at least ``min_score`` are kept and
    their arithmetic mean is returned.

    Example input from the notebook:
    ``[cmps[smi] for smi in snps['PF3D7_0321900']]``.

    Parameters
    ----------
    smiset : sequence of str
        SMILES strings to compare with each other.
    min_score : float, optional
        Lowest score that takes part in the average. Defaults to 0.95, the
        cutoff that was hard-coded here. Pass ``0.0`` to average all pairs.

    Returns
    -------
    int or float
        ``1`` for a single-element set, the mean of the retained scores
        otherwise, and ``0.0`` when no pair is retained (including an empty
        ``smiset``).
    """
    # NOTE(review): tanimoto_calc is not defined in this cell; it must already
    # exist in the notebook namespace when this function is called.
    # NOTE(review): averaging only scores above a cutoff looks like a leftover
    # from a "collect similar pairs" routine -- confirm the intended default.
    if len(smiset) == 1:
        return(1)

    simils = []
    for i in range(len(smiset)):
        for j in range(i+1,len(smiset)):
            score = tanimoto_calc(smiset[i],smiset[j])
            if score >= min_score:
                # Store only the number so the mean below is well defined.
                simils.append(score)

    if not simils:
        # No retained pair: avoid dividing by zero.
        return(0.0)
    return(sum(simils)/len(simils))

In [None]:
## Fingerprint generation
# Morgan Fingerprint
# https://www.rdkit.org/docs/GettingStartedInPython.html
def getMorganFingerprint4(ms):
    """Return a hashed Morgan fingerprint (radius 4, 2048 bins) per molecule.

    ``GetMorganFingerprint`` does not accept ``nBits``; the fixed-length
    variant that does is ``GetHashedMorganFingerprint``, so that one is used.

    Parameters
    ----------
    ms : iterable
        Molecules to fingerprint (passed unchanged to RDKit).

    Returns
    -------
    list
        One fingerprint per input molecule, in input order, generated with
        ``useFeatures=True``.
    """
    return [
        rdMolDescriptors.GetHashedMorganFingerprint(x, 4, nBits=2048, useFeatures=True)
        for x in ms
    ]
	
# Morgan fingerprint with feature invariants, as a fixed-length bit vector
def getMorganBit(ms, nBits=1048):
    """Return a Morgan bit-vector fingerprint (radius 4) per molecule.

    Parameters
    ----------
    ms : iterable
        Molecules to fingerprint (passed unchanged to RDKit).
    nBits : int, optional
        Length of each bit vector. Defaults to 1048, the value that was
        hard-coded here.

    Returns
    -------
    list
        One result of ``AllChem.GetMorganFingerprintAsBitVect`` per input
        molecule, in input order, generated with ``useFeatures=True``.
    """
    # NOTE(review): 1048 is an unusual length (1024 or 2048 are typical) --
    # confirm it is intentional before comparing with other fingerprints.
    return [
        AllChem.GetMorganFingerprintAsBitVect(x, 4, nBits=nBits, useFeatures=True)
        for x in ms
    ]

# Morgan fingerprint with feature invariants, not folded to a bit vector
def getMorgan(ms):
    """Return a Morgan fingerprint (radius 4, feature invariants) per molecule.

    Parameters
    ----------
    ms : iterable
        Molecules to fingerprint (passed unchanged to RDKit).

    Returns
    -------
    list
        One result of ``AllChem.GetMorganFingerprint`` per input molecule,
        in input order.
    """
    # The list used to be built and then dropped, so callers received None.
    return [AllChem.GetMorganFingerprint(x, 4, useFeatures=True) for x in ms]

In [174]:
def similarity_calc(smi1, smi2):
    """Return the highest of several fingerprint similarities for two SMILES.

    The candidate scores are:

    * Tanimoto on RDKit topological fingerprints,
    * Dice on the same RDKit fingerprints,
    * Dice on MACCS keys (best effort),
    * Dice on Morgan fingerprints, radius 4 with feature invariants
      (best effort).

    Parameters
    ----------
    smi1, smi2 : str
        SMILES strings of the two molecules to compare.

    Returns
    -------
    float
        The maximum of the scores that could be computed.

    Raises
    ------
    ValueError
        If either SMILES cannot be parsed by RDKit.
    """
    # The RDKit parser is spelled MolFromSmiles; MolFromSMILES does not exist.
    mol1 = Chem.MolFromSmiles(smi1)
    mol2 = Chem.MolFromSmiles(smi2)
    for smi, mol in ((smi1, mol1), (smi2, mol2)):
        if mol is None:
            raise ValueError("could not parse SMILES: %r" % (smi,))

    fp1 = Chem.RDKFingerprint(mol1)
    fp2 = Chem.RDKFingerprint(mol2)

    # For one pair of fingerprints Dice = 2T / (1 + T) >= Tanimoto, so the
    # Tanimoto term can only win on a tie; it is kept so the result is the
    # same maximum as before.
    scores = [
        DataStructs.TanimotoSimilarity(fp1, fp2),
        DataStructs.FingerprintSimilarity(fp1, fp2, metric=DataStructs.DiceSimilarity),
    ]

    # MACCS keys are optional: if they cannot be generated, skip this score
    # and keep the ones already computed.
    try:
        scores.append(
            DataStructs.FingerprintSimilarity(
                MACCSkeys.GenMACCSKeys(mol1),
                MACCSkeys.GenMACCSKeys(mol2),
                metric=DataStructs.DiceSimilarity,
            )
        )
    except Exception:
        pass

    # Morgan fingerprints are optional in the same way.
    try:
        scores.append(
            DataStructs.DiceSimilarity(
                AllChem.GetMorganFingerprint(mol1, 4, useFeatures=True),
                AllChem.GetMorganFingerprint(mol2, 4, useFeatures=True),
            )
        )
    except Exception:
        pass

    return max(scores)

* Modificar los nombres de las variables para que sean consistentes entre funciones/entradas
* Agregar canonización de SMILES antes de ejecutar las funciones