# Preprocessing Functions

In [9]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from scipy.constants import e

def calculate_amino_acid_center_of_mass(sequence):
    """Return the mass-weighted mean residue position of a protein sequence.

    Every residue letter is turned into a molecule with
    ``Chem.MolFromSequence`` and weighted by ``Chem.Descriptors.MolWt``.
    Positions are 1-based, so the result is ``sum(i * m_i) / sum(m_i)``.

    Args:
        sequence: Iterable of one-letter residue codes (normally a ``str``).

    Returns:
        The weighted mean position, or ``0`` when it cannot be computed:
        non-iterable input (e.g. ``None`` or NaN), an empty sequence, a
        residue RDKit cannot parse, or any other RDKit error.
    """
    try:
        # A sequence only contains a handful of distinct letters, so each one
        # is parsed and weighed once and then reused for every occurrence.
        mass_by_residue = {}
        weighted_positions = 0
        total_mass = 0

        for position, aa in enumerate(sequence, start=1):
            if aa not in mass_by_residue:
                residue = Chem.MolFromSequence(aa)
                if residue is None:
                    # Unknown residue letter: the whole sequence is unusable.
                    return 0
                mass_by_residue[aa] = Chem.Descriptors.MolWt(residue)

            mass = mass_by_residue[aa]
            weighted_positions += position * mass
            total_mass += mass

        if not total_mass:
            # Empty sequence: nothing to average.
            return 0

        return weighted_positions / total_mass
    except Exception:
        # Best-effort contract: callers apply this over a DataFrame column and
        # expect 0 for rows that cannot be processed. KeyboardInterrupt and
        # SystemExit are deliberately not swallowed.
        return 0

def calculate_amino_acid_center_of_mass_smiles(sequence):
    """Return the mass-weighted mean atom position of a molecule given as SMILES.

    The SMILES string is parsed once with ``Chem.MolFromSmiles``. Each atom
    returned by ``mol.GetAtoms()`` is weighted by ``atom.GetMass()`` and by
    its 1-based position in that iteration order, so the result is
    ``sum(i * m_i) / sum(m_i)``.

    Args:
        sequence: SMILES string of the molecule.

    Returns:
        The weighted mean atom position, or ``0`` when it cannot be computed:
        non-string input (e.g. ``None``), an unparsable SMILES, a molecule
        without atoms, or any other RDKit error.
    """
    try:
        # Parse the string as a whole. Single SMILES characters such as '(',
        # '=', ring-closure digits or the 'l' of 'Cl' are not molecules on
        # their own, so they cannot be parsed one character at a time.
        mol = Chem.MolFromSmiles(sequence)
        if mol is None:
            return 0

        # Only atoms present in the molecule graph are listed here.
        atom_masses = [atom.GetMass() for atom in mol.GetAtoms()]
        total_mass = sum(atom_masses)
        if not total_mass:
            # Empty molecule: nothing to average.
            return 0

        weighted_positions = sum(
            position * mass for position, mass in enumerate(atom_masses, start=1)
        )
        return weighted_positions / total_mass
    except Exception:
        # Best-effort contract: rows that cannot be processed yield 0.
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        return 0

def calculate_distance_between_amino_acids(aa1, aa2):
    """Return the absolute difference between two center-of-mass values."""
    # The sign is dropped, so the result is never negative.
    return abs(aa1 - aa2)

In [10]:
from rdkit import Chem
from collections import Counter

def get_bond_types(molecule_smiles):
    """Count the bonds of a molecule, grouped by bond type.

    Args:
        molecule_smiles: SMILES string of the molecule.

    Returns:
        A ``Counter`` keyed by the value of ``GetBondTypeAsDouble()`` for
        each bond, or ``None`` (after printing a message) when RDKit cannot
        parse the SMILES.
    """
    parsed = Chem.MolFromSmiles(molecule_smiles)
    if parsed is not None:
        # Tally the frequency of every bond type in one pass.
        return Counter(bond.GetBondTypeAsDouble() for bond in parsed.GetBonds())

    # Message text: "Failed to read molecule."
    print("Gagal membaca molekul.")
    return None

In [11]:
from rdkit import Chem
from collections import Counter

def count_atoms(molecule_smiles):
    """Count the atoms of a molecule, grouped by element symbol.

    Args:
        molecule_smiles: SMILES string of the molecule.

    Returns:
        A ``Counter`` mapping each atom symbol to its number of occurrences,
        or ``None`` (after printing a message) when RDKit cannot parse the
        SMILES.
    """
    parsed = Chem.MolFromSmiles(molecule_smiles)
    if parsed is None:
        # Message text: "Failed to read molecule."
        print("Gagal membaca molekul.")
        return None

    # Tally one occurrence per atom, keyed by its element symbol.
    symbol_tally = Counter()
    for atom in parsed.GetAtoms():
        symbol_tally[atom.GetSymbol()] += 1
    return symbol_tally

In [12]:
from rdkit import Chem
from rdkit.Chem import Descriptors

def calculate_molecular_weight(molecule_smiles):
    """Return the molecular weight of a molecule given as SMILES.

    Args:
        molecule_smiles: SMILES string of the molecule.

    Returns:
        The value of ``Descriptors.MolWt`` for the parsed molecule, or
        ``None`` when the SMILES cannot be parsed (a message is printed) or
        when RDKit raises, e.g. for non-string input such as ``None``.
    """
    try:
        mol = Chem.MolFromSmiles(molecule_smiles)
        if mol is None:
            # Message text: "Failed to read molecule."
            print("Gagal membaca molekul.")
            return None

        return Descriptors.MolWt(mol)
    except Exception:
        # Best-effort contract: rows that cannot be processed yield None.
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        return None

In [13]:
def seq_to_smiles(seq):
    """Convert a one-letter protein sequence to a Kekulé-form SMILES string.

    Args:
        seq: Protein sequence accepted by ``Chem.MolFromSequence``.

    Returns:
        The SMILES produced by ``Chem.MolToSmiles(..., kekuleSmiles=True)``,
        or ``None`` when the sequence cannot be parsed or RDKit raises
        (e.g. for non-string input such as ``None`` or NaN).
    """
    try:
        mol = Chem.MolFromSequence(seq)
        if mol is None:
            # The parser signals failure with None rather than an exception.
            return None
        return Chem.MolToSmiles(mol, kekuleSmiles=True)
    except Exception:
        # Best-effort contract: rows that cannot be converted yield None.
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        return None

def inchi_to_smiles(inchi):
    """Convert an InChI string to a Kekulé-form SMILES string.

    Args:
        inchi: InChI identifier accepted by ``Chem.MolFromInchi``.

    Returns:
        The SMILES produced by ``Chem.MolToSmiles(..., kekuleSmiles=True)``,
        or ``None`` when the InChI cannot be parsed or RDKit raises
        (e.g. for non-string input such as ``None`` or NaN).
    """
    try:
        molecule = Chem.MolFromInchi(inchi)
        if molecule is None:
            # The parser signals failure with None rather than an exception.
            return None
        return Chem.MolToSmiles(molecule, kekuleSmiles=True)
    except Exception:
        # Best-effort contract: rows that cannot be converted yield None.
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        return None

In [14]:
import molecular_scoring

# Shared scoring object from the project-local `molecular_scoring` module.
# The batch-processing loop calls its attractive_energy, repulsive_energy,
# lj_force and coulomb_energy methods on the computed distances.
# NOTE(review): units and formulas are defined in molecular_scoring — confirm there.
MS = molecular_scoring.ms()

# Preprocessing Data

In [15]:
import pandas as pd

In [16]:
# Load the data once and keep only the rows that have a ligand SMILES.
# reset_index() is called without drop=True, so the rows are renumbered from 0
# and the original row labels are kept in a new 'index' column.
df = pd.read_csv('BindingDB_ALL_LigandTarget_Ki_fixed.csv').dropna(subset=['Ligand SMILES']).reset_index()


In [None]:
from pathlib import Path

interval = 100  # number of rows per batch (one output CSV per batch)
start = 0       # first batch to process; raise it to resume an interrupted run

# to_csv does not create missing directories, so make sure the target exists.
output_dir = Path('batch_preprocessing')
output_dir.mkdir(parents=True, exist_ok=True)

# Calculate properties batch by batch and write each batch to its own CSV.
for i in range(start*interval, len(df), interval):
    end_idx = min(i + interval, len(df))
    batch_no = i // interval

    # Work on an independent copy: assigning columns to a slice of `df`
    # triggers SettingWithCopyWarning and may or may not write through.
    batch_df = df.iloc[i:end_idx].copy()

    # NOTE(review): this overwrites the 'Ligand SMILES' values read from the
    # CSV with SMILES regenerated from 'Ligand InChI' — confirm this is intended.
    batch_df['Ligand SMILES'] = batch_df['Ligand InChI'].apply(inchi_to_smiles)
    batch_df['Center Of Mass Target'] = batch_df['BindingDB Target Chain Sequence'].apply(calculate_amino_acid_center_of_mass)
    batch_df['Center Of Mass Ligand'] = batch_df['Ligand SMILES'].apply(calculate_amino_acid_center_of_mass_smiles)
    batch_df['Distance'] = batch_df.apply(
        lambda row: calculate_distance_between_amino_acids(row['Center Of Mass Ligand'], row['Center Of Mass Target']),
        axis=1,
    )
    batch_df['Ligand Weight'] = batch_df['Ligand SMILES'].apply(calculate_molecular_weight)

    # Computed here but stored last so that 'Target SMILES' stays the final
    # column of the CSV.
    target_smiles = batch_df['BindingDB Target Chain Sequence'].apply(seq_to_smiles)
    batch_df['Target Weight'] = target_smiles.apply(calculate_molecular_weight)

    batch_df['Attractive'] = batch_df['Distance'].apply(MS.attractive_energy)
    batch_df['Repulsive'] = batch_df['Distance'].apply(MS.repulsive_energy)
    # The LJ force is evaluated before the Coulomb energy but stored after it,
    # which keeps the 'Coulomb Energy' column ahead of 'LJ force(eV/Å)'.
    lj_force = batch_df['Distance'].apply(MS.lj_force)
    # `e` is the elementary charge imported from scipy.constants; it is passed
    # as both of the first two arguments (presumably the two charges — confirm
    # against molecular_scoring).
    batch_df['Coulomb Energy'] = batch_df['Distance'].apply(lambda dist: MS.coulomb_energy(e, e, dist))
    batch_df['LJ force(eV/Å)'] = lj_force
    batch_df['Target SMILES'] = target_smiles

    output_file = output_dir / f'output_batch_{batch_no}.csv'
    batch_df.to_csv(output_file, index=False)

    print(f'Processed batch {batch_no}')

print('Processing completed.')
