# Preprocessing Functions

In [1]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from scipy.constants import e

def calculate_amino_acid_center_of_mass(sequence):
    """Return the mass-weighted mean residue position of a one-letter sequence.

    Each character of ``sequence`` is converted to a single-residue molecule
    with ``Chem.MolFromSequence`` and weighed with ``Descriptors.MolWt``.
    Residues are numbered from 1 and the result is
    ``sum(position * mass) / sum(mass)``.

    Args:
        sequence: Iterable of one-letter residue codes (normally a ``str``).

    Returns:
        The weighted mean position, or ``0`` when the sequence is empty, the
        total mass is zero, or any residue cannot be converted.
    """
    # A given letter always yields the same single-residue molecule, so each
    # distinct letter is parsed and weighed only once per call.
    mass_by_residue = {}
    residue_masses = []
    try:
        for aa in sequence:
            if aa not in mass_by_residue:
                residue_mol = Chem.MolFromSequence(aa)
                if residue_mol is None:
                    return 0
                mass_by_residue[aa] = Descriptors.MolWt(residue_mol)
            residue_masses.append(mass_by_residue[aa])
    except Exception:
        # Best-effort contract: a bad input yields 0.  Catching Exception (not a
        # bare except) lets KeyboardInterrupt/SystemExit stop a long batch run
        # instead of being recorded as a 0 result.
        return 0

    total_mass = sum(residue_masses)
    if not total_mass:
        # Empty sequence: nothing to average.
        return 0

    # Compute the centre of mass over 1-based residue positions.
    weighted_positions = sum(
        position * mass for position, mass in enumerate(residue_masses, start=1)
    )
    return weighted_positions / total_mass

def calculate_amino_acid_center_of_mass_smiles(sequence):
    """Return the mass-weighted mean atom position of a SMILES string.

    The SMILES is parsed once as a whole molecule.  Splitting it into
    characters and parsing each one separately cannot work for real ligands,
    because bond, branch, ring-closure and bracket symbols are not molecules on
    their own, so almost every input fell through to the ``0`` fallback.

    Atoms are numbered from 1 in the molecule's atom order.  Each atom's mass
    is its own atomic mass plus the mass of the hydrogens attached to it, and
    the result is ``sum(position * mass) / sum(mass)``.

    NOTE(review): using the atom index as the "position" mirrors what
    ``calculate_amino_acid_center_of_mass`` does with residue indices; confirm
    this is the intended ligand feature.

    Args:
        sequence: SMILES string of the molecule.

    Returns:
        The weighted mean atom position, or ``0`` when the SMILES cannot be
        parsed or the molecule has no atoms.
    """
    try:
        mol = Chem.MolFromSmiles(sequence)
        if mol is None:
            return 0
        hydrogen_mass = Chem.GetPeriodicTable().GetAtomicWeight(1)
        atom_masses = [
            atom.GetMass() + atom.GetTotalNumHs() * hydrogen_mass
            for atom in mol.GetAtoms()
        ]
    except Exception:
        # Best-effort contract: a bad input yields 0.  Catching Exception (not a
        # bare except) keeps KeyboardInterrupt/SystemExit able to stop a batch.
        return 0

    total_mass = sum(atom_masses)
    if not total_mass:
        # Empty molecule: nothing to average.
        return 0

    # Compute the centre of mass over 1-based atom positions.
    weighted_positions = sum(
        position * mass for position, mass in enumerate(atom_masses, start=1)
    )
    return weighted_positions / total_mass

def calculate_distance_between_amino_acids(aa1, aa2):
    """Return the absolute difference between two centre-of-mass values.

    Args:
        aa1: First centre-of-mass value.
        aa2: Second centre-of-mass value.

    Returns:
        ``abs(aa1 - aa2)``.
    """
    return abs(aa1 - aa2)

In [2]:
from rdkit import Chem
from collections import Counter

def get_bond_types(molecule_smiles):
    """Count the bonds of a molecule grouped by bond order.

    Args:
        molecule_smiles: SMILES string of the molecule.

    Returns:
        A ``Counter`` keyed by the value of ``GetBondTypeAsDouble()`` for each
        bond, or ``None`` (after printing a message) when the SMILES cannot be
        parsed.
    """
    parsed = Chem.MolFromSmiles(molecule_smiles)
    if parsed is not None:
        # Tally how often each bond order occurs.
        return Counter(bond.GetBondTypeAsDouble() for bond in parsed.GetBonds())

    # Message text is Indonesian for "Failed to read molecule."
    print("Gagal membaca molekul.")
    return None

In [3]:
from rdkit import Chem
from collections import Counter

def count_atoms(molecule_smiles):
    """Count the atoms of a molecule grouped by element symbol.

    Args:
        molecule_smiles: SMILES string of the molecule.

    Returns:
        A ``Counter`` keyed by ``atom.GetSymbol()`` for every atom returned by
        ``mol.GetAtoms()``, or ``None`` (after printing a message) when the
        SMILES cannot be parsed.
    """
    parsed = Chem.MolFromSmiles(molecule_smiles)
    if parsed is None:
        # Message text is Indonesian for "Failed to read molecule."
        print("Gagal membaca molekul.")
        return None

    # Tally the atoms per element symbol.
    symbol_counts = Counter()
    for atom in parsed.GetAtoms():
        symbol_counts[atom.GetSymbol()] += 1
    return symbol_counts

In [4]:
from rdkit import Chem
from rdkit.Chem import Descriptors

def calculate_molecular_weight(molecule_smiles):
    """Return the molecular weight of a molecule given as SMILES.

    Args:
        molecule_smiles: SMILES string of the molecule.

    Returns:
        The value of ``Descriptors.MolWt`` for the parsed molecule, or ``None``
        (after printing a message) when the SMILES cannot be parsed.
    """
    parsed = Chem.MolFromSmiles(molecule_smiles)
    if parsed is not None:
        return Descriptors.MolWt(parsed)

    # Message text is Indonesian for "Failed to read molecule."
    print("Gagal membaca molekul.")
    return None

In [5]:
def seq_to_smiles(seq):
    """Convert a one-letter amino-acid sequence to a Kekulé SMILES string.

    Args:
        seq: Sequence string passed to ``Chem.MolFromSequence``.

    Returns:
        The SMILES as ``str``, or ``None`` when the sequence cannot be
        converted (including non-string input such as a missing value).
    """
    try:
        mol = Chem.MolFromSequence(seq)
        if mol is None:
            return None
        return str(Chem.MolToSmiles(mol, kekuleSmiles=True))
    except Exception:
        # Best-effort contract: failures map to None.  Catching Exception (not a
        # bare except) keeps KeyboardInterrupt/SystemExit able to stop a batch.
        return None

def inchi_to_smiles(inchi):
    """Convert an InChI string to a Kekulé SMILES string.

    Args:
        inchi: InChI string passed to ``Chem.MolFromInchi``.

    Returns:
        The SMILES produced by ``Chem.MolToSmiles``, or ``None`` when the InChI
        cannot be converted (including non-string input such as a missing
        value).
    """
    try:
        molecule = Chem.MolFromInchi(inchi)
        if molecule is None:
            return None
        return Chem.MolToSmiles(molecule, kekuleSmiles=True)
    except Exception:
        # Best-effort contract: failures map to None.  Catching Exception (not a
        # bare except) keeps KeyboardInterrupt/SystemExit able to stop a batch.
        return None

In [6]:
import molecular_scoring

# Scorer object from the project-local molecular_scoring module; later cells
# call its attractive_energy, repulsive_energy, coulomb_energy and lj_force.
MS = molecular_scoring.ms()

# Preprocessing

In [8]:
import pandas as pd

# Load the BindingDB ligand/target table.  The feature cells further down write
# their results back to this same file.
# NOTE(review): the output below looks like the tail of a pandas warning about
# this call (possibly a mixed-dtype warning) - confirm and set dtypes if so.
df = pd.read_csv('BindingDB_ALL_LigandTarget_Ki_fixed.csv')

  df = pd.read_csv('BindingDB_ALL_LigandTarget_Ki_fixed.csv')


In [9]:
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0,Ligand SMILES,Ligand InChI,Ki (nM),IC50 (nM),Kd (nM),EC50 (nM),kon (M-1-s-1),koff (s-1),pH,Temp (C),BindingDB Target Chain Sequence
0,O=C1N(C/C=C/C2=CNN=C2)[C@H](CC2=CC=CC=C2)[C@H]...,InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...,0.25,,,,,,5.5,37.00 C,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...
1,O=C1N(C/C=C/C2=CNN=C2)[C@H](CC2=CC=CC=C2)[C@H]...,InChI=1S/C29H34N4O3/c34-27-25(16-21-8-3-1-4-9-...,0.41,,,,,,5.5,37.00 C,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...
2,O=C1N(CCCCCCO)[C@H](CC2=CC=CC=C2)[C@H](O)[C@@H...,InChI=1S/C29H40N2O4/c32-18-10-2-1-9-17-30-25(1...,0.8,,,,,,5.5,37.00 C,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...
3,O=C1N(CCCCCO)[C@H](CC2=CC=CC=C2)[C@H](O)[C@@H]...,InChI=1S/C28H38N2O4/c31-17-9-3-8-16-29-24(18-2...,0.99,,,,,,5.5,37.00 C,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...
4,CCCCN1C(=O)N(CC2CC2)[C@H](CC2=CC=CC=C2)[C@H](O...,InChI=1S/C27H36N2O3/c1-2-3-16-28-23(17-20-10-6...,1.1,,,,,,5.5,37.00 C,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...


In [30]:
# Number of rows with no ligand SMILES.
df['Ligand SMILES'].isna().sum()

110299

In [31]:
# Total number of rows in the table.
len(df)

2772868

In [None]:
# df['Ligand SMILES'] = [inchi_to_smiles(df['Ligand InChI'][i]) for i in range(len(df))]
# df.to_csv('BindingDB_ALL_LigandTarget_Ki_fixed.csv', index=False)

In [10]:
# Keep only rows that have a ligand SMILES, then renumber the rows 0..n-1 so
# that the `df[col][i] for i in range(len(df))` loops in later cells find every
# label.
# drop=True discards the old index instead of inserting it as an 'index'
# column.  That column would be written to the CSV by the to_csv calls below,
# and on the next run reset_index() would raise
# "cannot insert index, already exists".
df = df.dropna(subset=['Ligand SMILES']).reset_index(drop=True)

In [27]:
# Spot check: molecular weight of the ligand stored under index label 0.
Chem.Descriptors.MolWt(Chem.MolFromSmiles(df['Ligand SMILES'][0]))

538.6520000000002

In [39]:
# center_of_mass_target = []
# for i in range(len(df)):
#     center_of_mass_target.append(calculate_amino_acid_center_of_mass(str(df['BindingDB Target Chain Sequence'][i])))

In [11]:
# Mass-weighted residue position of every target sequence (values are passed
# through str() first), then checkpoint the table to the CSV.
df['Center Of Mass Target'] = [
    calculate_amino_acid_center_of_mass(str(target_sequence))
    for target_sequence in df['BindingDB Target Chain Sequence']
]
df.to_csv('BindingDB_ALL_LigandTarget_Ki_fixed.csv', index=False)

In [None]:
# Centre-of-mass value of every ligand SMILES (values are passed through str()
# first), then checkpoint the table to the CSV.
df['Center Of Mass Ligand'] = [
    calculate_amino_acid_center_of_mass_smiles(str(ligand_smiles))
    for ligand_smiles in df['Ligand SMILES']
]
df.to_csv('BindingDB_ALL_LigandTarget_Ki_fixed.csv', index=False)

In [None]:
# Absolute difference between the ligand and target centre-of-mass values of
# each row, then checkpoint the table to the CSV.
# The lookup df[col][i] is by index label, so this relies on the index being
# 0..len(df)-1.
df['Distance'] = [calculate_distance_between_amino_acids(df['Center Of Mass Ligand'][i], df['Center Of Mass Target'][i]) for i in range(len(df))]
df.to_csv('BindingDB_ALL_LigandTarget_Ki_fixed.csv', index=False)

In [None]:
# Molecular weight of every ligand (None where the SMILES cannot be parsed),
# then checkpoint the table to the CSV.
df['Ligand Weight'] = [
    calculate_molecular_weight(str(ligand_smiles))
    for ligand_smiles in df['Ligand SMILES']
]
df.to_csv('BindingDB_ALL_LigandTarget_Ki_fixed.csv', index=False)

In [None]:
# SMILES for every target sequence (None where conversion fails), then
# checkpoint the table to the CSV.
df['Target SMILES'] = [
    seq_to_smiles(target_sequence)
    for target_sequence in df['BindingDB Target Chain Sequence']
]
df.to_csv('BindingDB_ALL_LigandTarget_Ki_fixed.csv', index=False)

In [None]:
# Molecular weight of every target from its SMILES.  Values are passed through
# str() first, so a missing SMILES reaches the helper as text and comes back as
# None when it cannot be parsed.  Then checkpoint the table to the CSV.
df['Target Weight'] = [
    calculate_molecular_weight(str(target_smiles))
    for target_smiles in df['Target SMILES']
]
df.to_csv('BindingDB_ALL_LigandTarget_Ki_fixed.csv', index=False)

In [None]:
# Apply MS.attractive_energy to each row's Distance value, then checkpoint the
# table to the CSV.
# NOTE(review): Distance can be 0 when both centre-of-mass helpers fall back to
# 0 - confirm molecular_scoring handles a zero distance.
df['Attractive'] = [MS.attractive_energy(df['Distance'][i]) for i in range(len(df))]
df.to_csv('BindingDB_ALL_LigandTarget_Ki_fixed.csv', index=False)

In [None]:
# Apply MS.repulsive_energy to each row's Distance value, then checkpoint the
# table to the CSV.
# NOTE(review): Distance can be 0 when both centre-of-mass helpers fall back to
# 0 - confirm molecular_scoring handles a zero distance.
df['Repulsive'] = [MS.repulsive_energy(df['Distance'][i]) for i in range(len(df))]
df.to_csv('BindingDB_ALL_LigandTarget_Ki_fixed.csv', index=False)

In [None]:
# Apply MS.coulomb_energy to each row's Distance value, then checkpoint the
# table to the CSV.  `e` is the constant imported from scipy.constants and is
# passed for both leading arguments.
# NOTE(review): presumably those two arguments are the charges of the pair, so
# every ligand/target is treated as carrying one elementary charge - confirm
# against molecular_scoring.
df['Coulomb Energy'] = [MS.coulomb_energy(e, e, df['Distance'][i]) for i in range(len(df))]
df.to_csv('BindingDB_ALL_LigandTarget_Ki_fixed.csv', index=False)

In [None]:
# Apply MS.lj_force to each row's Distance value, then checkpoint the table to
# the CSV.
# NOTE(review): the column name states eV/Å; the units actually returned by
# molecular_scoring are not visible here - confirm.
df['LJ force(eV/Å)'] = [MS.lj_force(df['Distance'][i]) for i in range(len(df))]
df.to_csv('BindingDB_ALL_LigandTarget_Ki_fixed.csv', index=False)