# Preprocessing Function

In [1]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from scipy.constants import e

def calculate_amino_acid_center_of_mass(sequence):
    try:
        amino_acid_masses = []
        for aa in sequence:
            try:
                amino_acid_masses.append(Chem.Descriptors.MolWt(Chem.MolFromSequence(aa)))
            except:
                return 0
                break

        # Hitung pusat massa asam amino
        total_mass = sum(amino_acid_masses)
        center_of_mass = sum(i * mass for i, mass in enumerate(amino_acid_masses, start=1)) / total_mass

        return center_of_mass
    except:
        return 0

def calculate_amino_acid_center_of_mass_smiles(sequence):
    try:
        amino_acid_masses = []
        for aa in sequence:
            amino_acid_masses.append(Chem.Descriptors.MolWt(Chem.MolFromSmiles(aa)))

        # Hitung pusat massa asam amino
        total_mass = sum(amino_acid_masses)
        center_of_mass = sum(i * mass for i, mass in enumerate(amino_acid_masses, start=1)) / total_mass

        return center_of_mass
    except:
        return 0

def calculate_distance_between_amino_acids(aa1, aa2):
    # Menghitung jarak antara dua pusat massa asam amino
    distance = abs(aa1 - aa2)
    return distance

In [2]:
from rdkit import Chem
from collections import Counter

def get_bond_types(molecule_smiles):
    mol = Chem.MolFromSmiles(molecule_smiles)
    if mol is None:
        print("Gagal membaca molekul.")
        return None

    bond_types = []
    for bond in mol.GetBonds():
        bond_type = bond.GetBondTypeAsDouble()
        bond_types.append(bond_type)

    # Menghitung frekuensi tipe ikatan
    bond_type_counts = Counter(bond_types)

    return bond_type_counts

In [3]:
from rdkit import Chem
from collections import Counter

def count_atoms(molecule_smiles):
    mol = Chem.MolFromSmiles(molecule_smiles)
    if mol is None:
        print("Gagal membaca molekul.")
        return None

    # Menghitung jumlah atom-atom
    atom_counts = Counter([atom.GetSymbol() for atom in mol.GetAtoms()])

    return atom_counts

In [4]:
def seq_to_smiles(seq):
    try:
        mol = Chem.MolFromSequence(seq)
        smiles = Chem.MolToSmiles(mol,kekuleSmiles=True)
        return str(smiles)
    except:
        return None

def inchi_to_smiles(inchi):
    try:
        molecule = Chem.MolFromInchi(inchi)
        smiles = Chem.MolToSmiles(molecule,kekuleSmiles=True)
        return smiles
    except:
        return None

In [5]:
from rdkit import Chem
from rdkit.Chem import MolFromSmiles, MolToSequence

def smiles_to_protein_sequence(smiles):
    try:
        mol = MolFromSmiles(smiles)
        if mol:
            # Ubah molekul menjadi sequence
            protein_sequence = MolToSequence(mol)
            return protein_sequence
        else:
            return None
    except Exception as e:
        print("Error:", str(e))
        return None

# Contoh penggunaan
smiles = "CC(C)(C)OC(O)=N[C@@H](CC1=CC=C(OCCN2CCOCC2)C=C1)[C@@H](O)C[C@@H](CC1=CC=CC=C1)C(O)=N[C@H]1C2=CC=CC=C2C[C@H]1O"
protein_sequence_result = smiles_to_protein_sequence(smiles)
print("Protein Sequence Result:", protein_sequence_result)


Protein Sequence Result: 


In [2]:
from rdkit.Chem import rdDistGeom
import rdkit.DistanceGeometry as DG

mol = Chem.MolFromSmiles("C1CCC1C")
mol = Chem.AddHs(mol)
bm = rdDistGeom.GetMoleculeBoundsMatrix(mol)
bm[0,3] = 1.21
bm[3,0] = 1.20
bm[2,3] = 1.21
bm[3,2] = 1.20
bm[4,3] = 1.21
bm[3,4] = 1.20
DG.DoTriangleSmoothing(bm)

params.SetBoundsMat(bm)

In [20]:
df = pd.read_csv(batch_file[6777])

In [27]:
df['Ligand SMILES'][4]

'CC1(C)CCC(CN2CCN(C3=CC(OC4=CNC5=NC=CC5=C4)=C(C(O)=NS(=O)(=O)C4=CC([N+](=O)[O-])=C(NCC5CCOCC5)C=C4)C=C3)CC2)=C(C23CC(C)(C2)C3)C1'

In [28]:
from rdkit import Chem
from rdkit.Chem import AllChem

# Contoh molekul (ubah sesuai dengan kebutuhan)
smiles = 'CC1(C)CCC(CN2CCN(C3=CC(OC4=CNC5=NC=CC5=C4)=C(C(O)=NS(=O)(=O)C4=CC([N+](=O)[O-])=C(NCC5CCOCC5)C=C4)C=C3)CC2)=C(C23CC(C)(C2)C3)C1'

# Membuat objek Molecule dari smiles
mol = Chem.MolFromSmiles(smiles)

# Menambahkan hidrogen ke molekul
mol = Chem.AddHs(mol)

# Menggunakan ETKDGv3 untuk generasi conformer
params = AllChem.ETKDGv3()
params.useSmallRingTorsions = True  # Menggunakan small ring torsions
params.useRandomCoords = True
params.SetCPCI({ (0,3) : 0.9 } )

# Generasi conformer
AllChem.EmbedMultipleConfs(mol, numConfs=3, params=params)

# Menampilkan hasil generasi conformer
for i in range(3):
    print(f"Conformer {i + 1}:")
    for atom in mol.GetConformer(i).GetPositions():
        print(atom)
    print("\n")


Conformer 1:


ValueError: Bad Conformer Id

In [6]:
from rdkit.Chem import AllChem

def generate_conformer(molecule_smiles):
    mol = Chem.MolFromSmiles(molecule_smiles)
    mol = Chem.AddHs(mol)  # Add hydrogens for a more accurate 3D structure
    conformer = AllChem.EmbedMolecule(mol, useRandomCoords=True, randomSeed=42)  # Generate a conformer
    return mol


In [7]:
from rdkit import Chem
from rdkit.Chem import rdMolTransforms

def calculate_molecular_center_of_mass(smiles):
    molecule = generate_conformer(smiles)
    try:
        if molecule is None:
            return None

        # Hitung pusat massa molekul
        center_of_mass = rdMolTransforms.ComputeCentroid(molecule.GetConformer())
        total_mass = Descriptors.MolWt(molecule)
        # print(center_of_mass)
        # print(total_mass)
        center_of_mass = sum([center_of_mass[i] * total_mass for i in range(len(center_of_mass))]) / total_mass
        # print(total_mass)
        
        return center_of_mass
    except Exception as e:
        print("Error:", str(e))
        return None


In [8]:
from rdkit import Chem
from rdkit.Chem import Descriptors

def calculate_molecular_weight(molecule_smiles):
    try:
        mol = generate_conformer(molecule_smiles)
        if mol is None:
            print("Gagal membaca molekul.")
            return None

        # Menghitung massa molekul
        molecular_weight = Descriptors.MolWt(mol)

        return molecular_weight
    except:
        return None

In [9]:
import molecular_scoring

MS = molecular_scoring.ms()

In [10]:
import pandas as pd
import os
import warnings

# Menonaktifkan semua warnings
warnings.filterwarnings('ignore')

# Main Preprocessing

In [11]:
batch_dir = "data_batch/"
batch_file = [batch_dir+i for i in os.listdir(batch_dir)]

In [12]:
def preprocessing(name_file, j):
    df = pd.read_csv(name_file)
    df['Ligand SMILES'] = [inchi_to_smiles(df['Ligand InChI'][i]) for i in range(len(df))]
    # df['Center Of Mass Target'] = [calculate_amino_acid_center_of_mass(str(df['BindingDB Target Chain Sequence'][i])) for i in range(len(df))]
    df['Center Of Mass Ligand'] = [calculate_amino_acid_center_of_mass_smiles(str(df['Ligand SMILES'][i])) for i in range(len(df))]
    df['Distance'] = [calculate_distance_between_amino_acids(df['Center Of Mass Ligand'][i], df['Center Of Mass Target'][i]) for i in range(len(df))]
    print('4')
    df['Ligand Weight'] = [calculate_molecular_weight(str(df['Ligand SMILES'][i])) for i in range(len(df))]
    print('5')
    df['Target SMILES'] = [seq_to_smiles(df['BindingDB Target Chain Sequence'][i]) for i in range(len(df))]
    print('6')
    df['Target Weight'] = [calculate_molecular_weight(str(df['Target SMILES'][i])) for i in range(len(df))]
    print('7')
    df['Attractive'] = [MS.attractive_energy(df['Distance'][i]) for i in range(len(df))]
    print('8')
    df['Repulsive'] = [MS.repulsive_energy(df['Distance'][i]) for i in range(len(df))]
    print('9')
    df['LJ force(eV/Å)'] = [MS.lj_force(df['Distance'][i]) for i in range(len(df))]
    print('10')
    df['Coulomb Energy'] =[MS.coulomb_energy(e, e, df['Distance'][i]) for i in range(len(df))]
    print('11')

    output_file = f'data_batch_preprocessing/output_batch_{j}.csv'
    df.to_csv(output_file, index=False)
    print(output_file)

In [12]:
begin = 6778
for j in range(begin,len(batch_file)):
    df = pd.read_csv(batch_file[j])
    print('0')
    df['Ligand SMILES'] = [inchi_to_smiles(df['Ligand InChI'][i]) for i in range(len(df))]
    print('1')
#     df['Center Of Mass Target'] = [calculate_amino_acid_center_of_mass(str(df['BindingDB Target Chain Sequence'][i])) for i in range(len(df))]
#     print('2')
    df['Center Of Mass Ligand'] = [calculate_molecular_center_of_mass(str(df['Ligand SMILES'][i])) for i in range(len(df))]
#     print('3')
#     df['Distance'] = [calculate_distance_between_amino_acids(df['Center Of Mass Ligand'][i], df['Center Of Mass Target'][i]) for i in range(len(df))]
#     print('4')
    df['Ligand Weight'] = [calculate_molecular_weight(str(df['Ligand SMILES'][i])) for i in range(len(df))]
    print('5')
    # df['Target SMILES'] = [seq_to_smiles(df['BindingDB Target Chain Sequence'][i]) for i in range(len(df))]
    # print('6')
    # df['Target Weight'] = [calculate_molecular_weight(str(df['Target SMILES'][i])) for i in range(len(df))]
    # print('7')
    # df['Attractive'] = [MS.attractive_energy(df['Distance'][i]) for i in range(len(df))]
    # print('8')
    # df['Repulsive'] = [MS.repulsive_energy(df['Distance'][i]) for i in range(len(df))]
    # print('9')
    # df['LJ force(eV/Å)'] = [MS.lj_force(df['Distance'][i]) for i in range(len(df))]
    # print('10')
    # df['Coulomb Energy'] =[MS.coulomb_energy(e, e, df['Distance'][i]) for i in range(len(df))]
    # print('11')

    output_file = f'data_batch_preprocessing/output_batch_{j}.csv'
    df.to_csv(output_file, index=False)
    print(output_file)

0


1
Error: Bad Conformer Id
Error: Bad Conformer Id
Error: Bad Conformer Id
Error: Bad Conformer Id
Error: Bad Conformer Id
Error: Bad Conformer Id
Error: Bad Conformer Id
Error: Bad Conformer Id
5
data_batch_preprocessing/output_batch_6777.csv
0
1
Error: Bad Conformer Id
Error: Bad Conformer Id
Error: Bad Conformer Id
Error: Bad Conformer Id
Error: Bad Conformer Id
Error: Bad Conformer Id
Error: Bad Conformer Id


In [17]:
import pandas as pd
df = pd.read_csv(batch_file[2])

In [22]:
df['Ligand InChI'][0]

'InChI=1S/C39H51N3O7/c1-39(2,3)49-38(46)40-33(24-28-13-15-31(16-14-28)48-22-19-42-17-20-47-21-18-42)34(43)26-30(23-27-9-5-4-6-10-27)37(45)41-36-32-12-8-7-11-29(32)25-35(36)44/h4-16,30,33-36,43-44H,17-26H2,1-3H3,(H,40,46)(H,41,45)/t30-,33+,34+,35-,36+/m1/s1'

In [5]:
tes_max = max([len(i) for i in df['BindingDB Target Chain Sequence']])
tes_max

99

In [7]:
for i in range(len(df)):
    if len(df['BindingDB Target Chain Sequence'][i]) == tes_max:
        print(df['BindingDB Target Chain Sequence'][i])

PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF
PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF


In [8]:
# DF = pd.read_csv('BindingDB_ALL_LigandTarget_Ki_fixed.csv')

# Target Preprocessing

In [8]:
df_target = pd.read_csv('BindingDB_Target.csv')

In [13]:
def flatten_list(nested_list):
    flattened_list = []
    for item in nested_list:
        if isinstance(item, list):
            flattened_list.extend(flatten_list(item))
        else:
            flattened_list.append(item)
    return flattened_list

In [14]:
all_seq = set(flatten_list([list(set(df_target['Target Sequence'][i])) for i in range(len(df_target))]))

In [17]:
len(all_seq)

56

In [18]:
from rdkit import Chem
from rdkit.Chem import MolFromSequence, MolToSmiles

def convert_protein_to_smiles(protein_sequence):
    # Pisahkan protein sequence menjadi bagian-bagian (misalnya, setiap 100 karakter)
    chunk_size = 100
    chunks = [protein_sequence[i:i+chunk_size] for i in range(0, len(protein_sequence), chunk_size)]
    
    # Ubah setiap bagian menjadi SMILES
    smiles_chunks = []
    for chunk in chunks:
        mol = MolFromSequence(chunk)
        if mol:
            smiles_chunk = MolToSmiles(mol)
            smiles_chunks.append(smiles_chunk)
    
    # Gabungkan hasil SMILES
    combined_smiles = ''.join(smiles_chunks)
    return combined_smiles

In [19]:
smiles_tes = convert_protein_to_smiles(df_target['Target Sequence'][42])

20072

In [10]:
# all_target = list(DF['BindingDB Target Chain Sequence'].unique())

In [11]:
# df_target = pd.DataFrame({'Target Sequence' : all_target})

In [12]:
# df_target.to_csv('BindingDB_Target.csv',index=False)

In [9]:
df_target.head()

Unnamed: 0,Target Sequence,Target SMILES
0,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,CC[C@H](C)[C@H](NC(=O)CNC(=O)CNC(=O)[C@@H](NC(...
1,PQITLWKRPIVTVKIGGQLREALLDTGADDTVLEDINLPGKWKPKM...,CC[C@H](C)[C@H](NC(=O)CNC(=O)[C@@H](NC(=O)[C@@...
2,MALIPDLAMETWLLLAVSLVLLYLYGTHSHGLFKKLGIPGPTPLPF...,CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@H](CC(C...
3,MAALRQPQVAELLAEARRAFREEFGAEPELAVSAPGRVNLIGEHTD...,CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H]...
4,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,CC[C@H](C)[C@H](NC(=O)CNC(=O)CNC(=O)[C@@H](NC(...


In [22]:
# begin = 0
# for i in range(begin, len(df_target)):
#     df_target['Target SMILES'][i] = convert_protein_to_smiles(df_target['Target Sequence'][i])
#     df_target.to_csv('BindingDB_Target.csv',index=False)

In [10]:
df_target['Target Weight'] = [calculate_molecular_weight(str(df_target['Target SMILES'][i])) for i in range(len(df_target))]
df_target.to_csv('BindingDB_Target.csv',index=False)

[09:41:35] SMILES Parse Error: syntax error while parsing: nan
[09:41:35] SMILES Parse Error: Failed parsing SMILES 'nan' for input: 'nan'


Gagal membaca molekul.


[09:54:03] SMILES Parse Error: syntax error while parsing: nan
[09:54:03] SMILES Parse Error: Failed parsing SMILES 'nan' for input: 'nan'


Gagal membaca molekul.


[09:54:45] Explicit valence for atom # 3151 O, 3, is greater than permitted


Gagal membaca molekul.


[10:05:16] Explicit valence for atom # 16641 O, 3, is greater than permitted


Gagal membaca molekul.


[10:07:21] Explicit valence for atom # 2454 O, 3, is greater than permitted


Gagal membaca molekul.


[10:09:01] Explicit valence for atom # 3148 O, 3, is greater than permitted


Gagal membaca molekul.


[10:22:59] SMILES Parse Error: syntax error while parsing: nan
[10:22:59] SMILES Parse Error: Failed parsing SMILES 'nan' for input: 'nan'


Gagal membaca molekul.


[10:28:11] Explicit valence for atom # 852 O, 3, is greater than permitted


Gagal membaca molekul.


[10:31:11] Explicit valence for atom # 2337 O, 3, is greater than permitted


Gagal membaca molekul.


[10:33:50] Explicit valence for atom # 2339 O, 3, is greater than permitted


Gagal membaca molekul.


[10:35:51] Explicit valence for atom # 2298 O, 3, is greater than permitted


Gagal membaca molekul.


[10:36:40] Explicit valence for atom # 2343 O, 3, is greater than permitted


Gagal membaca molekul.


In [11]:
df_target['Center Of Mass Target'] = [calculate_amino_acid_center_of_mass(str(df_target['Target Sequence'][i])) for i in range(len(df_target))]
df_target.to_csv('BindingDB_Target.csv',index=False)

In [13]:
df_target = df_target.dropna(subset="Target SMILES")

In [15]:
df_target.to_csv('BindingDB_Target.csv',index=False)