# Preprocessing Function

In [1]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from scipy.constants import e

def calculate_amino_acid_center_of_mass(sequence):
    try:
        amino_acid_masses = []
        for aa in sequence:
            try:
                amino_acid_masses.append(Chem.Descriptors.MolWt(Chem.MolFromSequence(aa)))
            except:
                return 0
                break

        # Hitung pusat massa asam amino
        total_mass = sum(amino_acid_masses)
        center_of_mass = sum(i * mass for i, mass in enumerate(amino_acid_masses, start=1)) / total_mass

        return center_of_mass
    except:
        return 0

def calculate_amino_acid_center_of_mass_smiles(sequence):
    try:
        amino_acid_masses = []
        for aa in sequence:
            amino_acid_masses.append(Chem.Descriptors.MolWt(Chem.MolFromSmiles(aa)))

        # Hitung pusat massa asam amino
        total_mass = sum(amino_acid_masses)
        center_of_mass = sum(i * mass for i, mass in enumerate(amino_acid_masses, start=1)) / total_mass

        return center_of_mass
    except:
        return 0

def calculate_distance_between_amino_acids(aa1, aa2):
    # Menghitung jarak antara dua pusat massa asam amino
    distance = abs(aa1 - aa2)
    return distance

In [2]:
from rdkit import Chem
from collections import Counter

def get_bond_types(molecule_smiles):
    mol = Chem.MolFromSmiles(molecule_smiles)
    if mol is None:
        print("Gagal membaca molekul.")
        return None

    bond_types = []
    for bond in mol.GetBonds():
        bond_type = bond.GetBondTypeAsDouble()
        bond_types.append(bond_type)

    # Menghitung frekuensi tipe ikatan
    bond_type_counts = Counter(bond_types)

    return bond_type_counts

In [3]:
from rdkit import Chem
from collections import Counter

def count_atoms(molecule_smiles):
    mol = Chem.MolFromSmiles(molecule_smiles)
    if mol is None:
        print("Gagal membaca molekul.")
        return None

    # Menghitung jumlah atom-atom
    atom_counts = Counter([atom.GetSymbol() for atom in mol.GetAtoms()])

    return atom_counts

In [4]:
def seq_to_smiles(seq):
    try:
        mol = Chem.MolFromSequence(seq)
        smiles = Chem.MolToSmiles(mol,kekuleSmiles=True)
        return str(smiles)
    except:
        return None

def inchi_to_smiles(inchi):
    try:
        molecule = Chem.MolFromInchi(inchi)
        smiles = Chem.MolToSmiles(molecule,kekuleSmiles=True)
        return smiles
    except:
        return None

In [5]:
from rdkit import Chem
from rdkit.Chem import MolFromSmiles, MolToSequence

def smiles_to_protein_sequence(smiles):
    try:
        mol = MolFromSmiles(smiles)
        if mol:
            # Ubah molekul menjadi sequence
            protein_sequence = MolToSequence(mol)
            return protein_sequence
        else:
            return None
    except Exception as e:
        print("Error:", str(e))
        return None

# Contoh penggunaan
smiles = "CC(C)(C)OC(O)=N[C@@H](CC1=CC=C(OCCN2CCOCC2)C=C1)[C@@H](O)C[C@@H](CC1=CC=CC=C1)C(O)=N[C@H]1C2=CC=CC=C2C[C@H]1O"
protein_sequence_result = smiles_to_protein_sequence(smiles)
print("Protein Sequence Result:", protein_sequence_result)


Protein Sequence Result: 


In [28]:
from rdkit import Chem
from rdkit.Chem import AllChem

# Contoh molekul (ubah sesuai dengan kebutuhan)
smiles = 'CC1(C)CCC(CN2CCN(C3=CC(OC4=CNC5=NC=CC5=C4)=C(C(O)=NS(=O)(=O)C4=CC([N+](=O)[O-])=C(NCC5CCOCC5)C=C4)C=C3)CC2)=C(C23CC(C)(C2)C3)C1'

# Membuat objek Molecule dari smiles
mol = Chem.MolFromSmiles(smiles)

# Menambahkan hidrogen ke molekul
mol = Chem.AddHs(mol)

# Menggunakan ETKDGv3 untuk generasi conformer
params = AllChem.ETKDGv3()
params.useSmallRingTorsions = True  # Menggunakan small ring torsions
params.useRandomCoords = True
params.SetCPCI({ (0,3) : 0.9 } )

# Generasi conformer
AllChem.EmbedMultipleConfs(mol, numConfs=3, params=params)

# Menampilkan hasil generasi conformer
for i in range(3):
    print(f"Conformer {i + 1}:")
    for atom in mol.GetConformer(i).GetPositions():
        print(atom)
    print("\n")


Conformer 1:


ValueError: Bad Conformer Id

In [7]:
from rdkit.Chem import AllChem

def generate_conformer(molecule_smiles):
    mol = Chem.MolFromSmiles(molecule_smiles)
    mol = Chem.AddHs(mol)  # Add hydrogens for a more accurate 3D structure
    conformer = AllChem.EmbedMolecule(mol, useRandomCoords=True, randomSeed=42)  # Generate a conformer
    return mol


In [8]:
from rdkit import Chem
from rdkit.Chem import rdMolTransforms

def calculate_molecular_center_of_mass(smiles):
    molecule = generate_conformer(smiles)
    try:
        if molecule is None:
            return None

        # Hitung pusat massa molekul
        center_of_mass = rdMolTransforms.ComputeCentroid(molecule.GetConformer())
        total_mass = Descriptors.MolWt(molecule)
        # print(center_of_mass)
        # print(total_mass)
        center_of_mass = sum([center_of_mass[i] * total_mass for i in range(len(center_of_mass))]) / total_mass
        # print(total_mass)
        
        return center_of_mass
    except Exception as e:
        print("Error:", str(e))
        return None


In [9]:
from rdkit import Chem
from rdkit.Chem import Descriptors

def calculate_molecular_weight(molecule_smiles):
    try:
        mol = generate_conformer(molecule_smiles)
        if mol is None:
            print("Gagal membaca molekul.")
            return None

        # Menghitung massa molekul
        molecular_weight = Descriptors.MolWt(mol)

        return molecular_weight
    except:
        return None

In [10]:
import molecular_scoring

MS = molecular_scoring.ms()

In [11]:
import pandas as pd
import os
import warnings

# Menonaktifkan semua warnings
warnings.filterwarnings('ignore')

# Main Preprocessing

In [41]:
batch_dir = "data_batch_preprocessing2/"
batch_file = [batch_dir+i for i in os.listdir(batch_dir)]

In [13]:
target = pd.read_csv('BindingDB_Target.csv')

In [14]:
target.head()

Unnamed: 0,Target Sequence,Target SMILES,Target Weight,Center Of Mass Target
0,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,CC[C@H](C)[C@H](NC(=O)CNC(=O)CNC(=O)[C@@H](NC(...,10792.854,49.5321
1,PQITLWKRPIVTVKIGGQLREALLDTGADDTVLEDINLPGKWKPKM...,CC[C@H](C)[C@H](NC(=O)CNC(=O)[C@@H](NC(=O)[C@@...,10738.846,49.458338
2,MALIPDLAMETWLLLAVSLVLLYLYGTHSHGLFKKLGIPGPTPLPF...,CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@H](CC(C...,57423.509,252.298335
3,MAALRQPQVAELLAEARRAFREEFGAEPELAVSAPGRVNLIGEHTD...,CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H]...,42320.463,196.526027
4,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,CC[C@H](C)[C@H](NC(=O)CNC(=O)CNC(=O)[C@@H](NC(...,10819.88,49.505189


In [None]:
begin = 0
for j in range(begin,len(batch_file)):
    df = pd.read_csv(batch_file[j])
    for i in range(len(df)):
        target_seq = df['BindingDB Target Chain Sequence'][i]
        target_info = target[target['Target Sequence'] == target_seq].iloc[0] if not target.empty else None

        if target_info is not None:
            df.at[i, 'Target SMILES'] = target_info['Target SMILES']
            df.at[i, 'Target Weight'] = target_info['Target Weight']
            df.at[i, 'Center Of Mass Target'] = target_info['Center Of Mass Target']

    # Pastikan kolom 'Target SMILES', 'Target Weight', dan 'Center Of Mass Target' diisi dengan nilai default jika tidak ditemukan
    df['Target SMILES'].fillna('N/A', inplace=True)
    df['Target Weight'].fillna(0.0, inplace=True)
    df['Center Of Mass Target'].fillna('N/A', inplace=True)

    output_file = f'data_batch_preprocessing2/output2_batch_{j}.csv'
    df.to_csv(output_file, index=False)
    print(output_file)

In [42]:
df = pd.read_csv(batch_file[0])

In [43]:
df.head()

Unnamed: 0,index,Ligand SMILES,Ligand InChI,Ki (nM),IC50 (nM),Kd (nM),EC50 (nM),kon (M-1-s-1),koff (s-1),pH,Temp (C),BindingDB Target Chain Sequence,Center Of Mass Ligand,Ligand Weight,Target SMILES,Target Weight,Center Of Mass Target
0,0,O=C1N(C/C=C/C2=CNN=C2)[C@H](CC2=CC=CC=C2)[C@H]...,InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...,0.25,,,,,,5.5,37.00 C,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,-0.230112,538.652,CC[C@H](C)[C@H](NC(=O)CNC(=O)CNC(=O)[C@@H](NC(...,10792.854,49.5321
1,1,O=C1N(C/C=C/C2=CNN=C2)[C@H](CC2=CC=CC=C2)[C@H]...,InChI=1S/C29H34N4O3/c34-27-25(16-21-8-3-1-4-9-...,0.41,,,,,,5.5,37.00 C,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,-0.200315,486.616,CC[C@H](C)[C@H](NC(=O)CNC(=O)CNC(=O)[C@@H](NC(...,10792.854,49.5321
2,2,O=C1N(CCCCCCO)[C@H](CC2=CC=CC=C2)[C@H](O)[C@@H...,InChI=1S/C29H40N2O4/c32-18-10-2-1-9-17-30-25(1...,0.8,,,,,,5.5,37.00 C,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,0.388664,480.649,CC[C@H](C)[C@H](NC(=O)CNC(=O)CNC(=O)[C@@H](NC(...,10792.854,49.5321
3,3,O=C1N(CCCCCO)[C@H](CC2=CC=CC=C2)[C@H](O)[C@@H]...,InChI=1S/C28H38N2O4/c31-17-9-3-8-16-29-24(18-2...,0.99,,,,,,5.5,37.00 C,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,-0.584324,466.622,CC[C@H](C)[C@H](NC(=O)CNC(=O)CNC(=O)[C@@H](NC(...,10792.854,49.5321
4,4,CCCCN1C(=O)N(CC2CC2)[C@H](CC2=CC=CC=C2)[C@H](O...,InChI=1S/C27H36N2O3/c1-2-3-16-28-23(17-20-10-6...,1.1,,,,,,5.5,37.00 C,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,-0.591985,436.596,CC[C@H](C)[C@H](NC(=O)CNC(=O)CNC(=O)[C@@H](NC(...,10792.854,49.5321


In [51]:
import string

def remove_symbols(input_string, symbols_to_remove):
    """
    Menghapus simbol-simbol yang ditentukan dari input string.

    Parameters:
    input_string (str): String asal.
    symbols_to_remove (str): Simbol-simbol yang ingin dihapus.

    Returns:
    str: String hasil setelah simbol-simbol dihapus.
    """
    # Menggunakan str.maketrans untuk membuat tabel untuk translate
    translation_table = str.maketrans("", "", symbols_to_remove)
    
    # Menghapus simbol-simbol yang diinginkan
    cleaned_string = input_string.translate(translation_table)
    
    return cleaned_string

In [68]:
begin = 0
for j in range(begin,len(batch_file)):
    df = pd.read_csv(batch_file[j])
    df['Ki (nM)'] = [float(remove_symbols(str(df['Ki (nM)'][i]), string.punctuation)) for i in range(len(df))]
    df['Distance'] = [calculate_distance_between_amino_acids(df['Center Of Mass Ligand'][i], df['Center Of Mass Target'][i]) for i in range(len(df))]
    df['Attractive'] = [MS.attractive_energy(df['Distance'][i]) for i in range(len(df))]
    print('8')
    df['Repulsive'] = [MS.repulsive_energy(df['Distance'][i]) for i in range(len(df))]
    print('9')
    df['LJ force(eV/Ã…)'] = [MS.lj_force(df['Distance'][i]) for i in range(len(df))]
    print('10')
    df['Coulomb Energy'] =[MS.coulomb_energy(e, e, df['Distance'][i]) for i in range(len(df))]
    print('11')
    
    output_file = f'data_batch_preprocessing3/output3_batch_{j}.csv'
    df.to_csv(output_file, index=False)
    print(output_file)

8
9
10
11
data_batch_preprocessing3/output3_batch_0.csv
8
9
10
11
data_batch_preprocessing3/output3_batch_1.csv
8
9
10
11
data_batch_preprocessing3/output3_batch_2.csv
8
9
10
11
data_batch_preprocessing3/output3_batch_3.csv
8
9
10
11
data_batch_preprocessing3/output3_batch_4.csv
8
9
10
11
data_batch_preprocessing3/output3_batch_5.csv
8
9
10
11
data_batch_preprocessing3/output3_batch_6.csv
8
9
10
11
data_batch_preprocessing3/output3_batch_7.csv
8
9
10
11
data_batch_preprocessing3/output3_batch_8.csv
8
9
10
11
data_batch_preprocessing3/output3_batch_9.csv
8
9
10
11
data_batch_preprocessing3/output3_batch_10.csv
8
9
10
11
data_batch_preprocessing3/output3_batch_11.csv
8
9
10
11
data_batch_preprocessing3/output3_batch_12.csv
8
9
10
11
data_batch_preprocessing3/output3_batch_13.csv
8
9
10
11
data_batch_preprocessing3/output3_batch_14.csv
8
9
10
11
data_batch_preprocessing3/output3_batch_15.csv
8
9
10
11
data_batch_preprocessing3/output3_batch_16.csv
8
9
10
11
data_batch_preprocessing3/outpu

In [69]:
begin = 0
DF = []
for j in range(begin,len(batch_file)):
    df = pd.read_csv(batch_file[j])
    DF.append(df)

In [70]:
df_res = pd.concat(DF)

In [71]:
df_res = df_res.reset_index()

In [72]:
df_res['Distance'] = [calculate_distance_between_amino_acids(df_res['Center Of Mass Ligand'][i], df_res['Center Of Mass Target'][i]) for i in range(len(df_res))]

In [73]:
df_res.to_csv('BindingDBSmall.csv', index=False)

In [61]:
len(df_res)

67790

In [74]:
df_res.head()

Unnamed: 0,level_0,index,Ligand SMILES,Ligand InChI,Ki (nM),IC50 (nM),Kd (nM),EC50 (nM),kon (M-1-s-1),koff (s-1),pH,Temp (C),BindingDB Target Chain Sequence,Center Of Mass Ligand,Ligand Weight,Target SMILES,Target Weight,Center Of Mass Target,Distance
0,0,0,O=C1N(C/C=C/C2=CNN=C2)[C@H](CC2=CC=CC=C2)[C@H]...,InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...,0.25,,,,,,5.5,37.00 C,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,-0.230112,538.652,CC[C@H](C)[C@H](NC(=O)CNC(=O)CNC(=O)[C@@H](NC(...,10792.854,49.5321,49.762213
1,1,1,O=C1N(C/C=C/C2=CNN=C2)[C@H](CC2=CC=CC=C2)[C@H]...,InChI=1S/C29H34N4O3/c34-27-25(16-21-8-3-1-4-9-...,0.41,,,,,,5.5,37.00 C,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,-0.200315,486.616,CC[C@H](C)[C@H](NC(=O)CNC(=O)CNC(=O)[C@@H](NC(...,10792.854,49.5321,49.732416
2,2,2,O=C1N(CCCCCCO)[C@H](CC2=CC=CC=C2)[C@H](O)[C@@H...,InChI=1S/C29H40N2O4/c32-18-10-2-1-9-17-30-25(1...,0.8,,,,,,5.5,37.00 C,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,0.388664,480.649,CC[C@H](C)[C@H](NC(=O)CNC(=O)CNC(=O)[C@@H](NC(...,10792.854,49.5321,49.143437
3,3,3,O=C1N(CCCCCO)[C@H](CC2=CC=CC=C2)[C@H](O)[C@@H]...,InChI=1S/C28H38N2O4/c31-17-9-3-8-16-29-24(18-2...,0.99,,,,,,5.5,37.00 C,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,-0.584324,466.622,CC[C@H](C)[C@H](NC(=O)CNC(=O)CNC(=O)[C@@H](NC(...,10792.854,49.5321,50.116425
4,4,4,CCCCN1C(=O)N(CC2CC2)[C@H](CC2=CC=CC=C2)[C@H](O...,InChI=1S/C27H36N2O3/c1-2-3-16-28-23(17-20-10-6...,1.1,,,,,,5.5,37.00 C,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,-0.591985,436.596,CC[C@H](C)[C@H](NC(=O)CNC(=O)CNC(=O)[C@@H](NC(...,10792.854,49.5321,50.124086
