## Generating training dataset

In [10]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

# Cathepsin identifiers processed by every loop below; each letter is
# substituted into the per-cathepsin file names (e.g. cathepsin_B.tsv).
cathepsins = list("BSDK")

### Selecting last two columns

In [11]:
for cat in cathepsins:
    # Read the combined tab-separated table for this cathepsin.
    combined = pd.read_csv(f'./Combined_data/cathepsin_{cat}.tsv', sep='\t')

    # Keep only the two right-most columns (positional slice) and write them
    # straight to the intermediate directory, without the index.
    combined.iloc[:, -2:].to_csv(
        f'./Intermediate_data/last_two_columns_{cat}.tsv',
        sep='\t',
        index=False,
    )

    print(f"The last two columns have been saved to 'last_two_columns_{cat}.tsv'.")


The last two columns have been saved to 'last_two_columns_B.tsv'.
The last two columns have been saved to 'last_two_columns_S.tsv'.
The last two columns have been saved to 'last_two_columns_D.tsv'.
The last two columns have been saved to 'last_two_columns_K.tsv'.


### Removing rows with NaN values (if any)

In [12]:
for cat in cathepsins:
    # The intermediate file is cleaned in place: it is read, filtered, and
    # written back to the same path. Building the path once keeps the read,
    # the write, and the status message in agreement.
    intermediate_path = f'./Intermediate_data/last_two_columns_{cat}.tsv'
    df = pd.read_csv(intermediate_path, sep='\t')

    # Select the last two columns (positional slice)
    last_two_columns = df.iloc[:, -2:]

    # Remove rows with any empty (NaN) values
    cleaned_data = last_two_columns.dropna()

    cleaned_data.to_csv(intermediate_path, sep='\t', index=False)

    # Report the file that was actually written. The previous message named
    # 'cleaned_last_two_columns_<cat>.tsv', which this cell never creates.
    print(f"The cleaned data (last two columns without empty rows) has been saved to '{intermediate_path}'.")


The cleaned data (last two columns without empty rows) has been saved to 'cleaned_last_two_columns_B.tsv'.
The cleaned data (last two columns without empty rows) has been saved to 'cleaned_last_two_columns_S.tsv'.
The cleaned data (last two columns without empty rows) has been saved to 'cleaned_last_two_columns_D.tsv'.
The cleaned data (last two columns without empty rows) has been saved to 'cleaned_last_two_columns_K.tsv'.


### Generating descriptors

In [13]:
def get_descriptors(smiles):
    """Compute every descriptor listed in ``Descriptors._descList`` for one SMILES.

    Parameters
    ----------
    smiles : str
        SMILES string passed to ``Chem.MolFromSmiles``.

    Returns
    -------
    list
        One value per entry of ``Descriptors._descList``, in list order.
        When ``Chem.MolFromSmiles`` returns ``None`` (the SMILES could not be
        turned into a molecule), a list of ``None`` with the same length is
        returned instead, so callers always get a row of constant width.
    """
    molecule = Chem.MolFromSmiles(smiles)

    if molecule is not None:
        # Each entry is indexed as (name, function); call the function part.
        values = []
        for entry in Descriptors._descList:
            values.append(entry[1](molecule))
        return values

    # Unparseable SMILES: placeholder row of None values.
    return [None] * len(Descriptors._descList)


# Descriptor names do not depend on the cathepsin, so build the column list once.
descriptor_columns = [desc[0] for desc in Descriptors._descList]

for cat in cathepsins:
    # Load the cleaned dataset
    df = pd.read_csv(f'./Intermediate_data/last_two_columns_{cat}.tsv', sep='\t')

    # Generate descriptors for each SMILES string in the first column
    smiles_list = df.iloc[:, 0]  # Assuming the first column contains SMILES strings
    descriptor_data = [get_descriptors(smiles) for smiles in smiles_list]

    # Create a DataFrame with one row per input row and one column per descriptor
    descriptor_df = pd.DataFrame(descriptor_data, columns=descriptor_columns)

    # get_descriptors returns an all-None row when a SMILES cannot be parsed.
    # Those rows are kept so the file stays row-aligned with the intermediate
    # TSV, but they are reported here instead of slipping silently into the
    # training input as NaN rows.
    n_invalid = int(descriptor_df.isna().all(axis=1).sum())
    if n_invalid:
        print(f"Warning: {n_invalid} row(s) for cathepsin {cat} have no descriptors "
              f"(SMILES could not be parsed); they are saved as empty rows.")

    # Save the descriptors to a CSV file
    descriptor_df.to_csv(f'./Training_data/input_cathepsin_{cat}.csv', index=False)

    print(f"Molecular descriptors have been saved to './Training_data/input_cathepsin_{cat}.csv'.")


Molecular descriptors have been saved to './Training_data/input_cathepsin_B.csv'.
Molecular descriptors have been saved to './Training_data/input_cathepsin_S.csv'.
Molecular descriptors have been saved to './Training_data/input_cathepsin_D.csv'.


[17:25:15] Explicit valence for atom # 3 N, 4, is greater than permitted


Molecular descriptors have been saved to './Training_data/input_cathepsin_K.csv'.


### Output file (last column of the cleaned data)

In [14]:

for cat in cathepsins:
    # Re-read the cleaned two-column table for this cathepsin.
    cleaned = pd.read_csv(f'./Intermediate_data/last_two_columns_{cat}.tsv', sep='\t')

    # The right-most column is written on its own, keeping its header and
    # omitting the index.
    cleaned.iloc[:, -1].to_csv(
        f'./Training_data/output_cathepsin_{cat}.csv',
        index=False,
        header=True,
    )

    print(f"The last column has been saved to './Training_data/output_cathepsin_{cat}.csv'.")


The last column has been saved to './Training_data/output_cathepsin_B.csv'.
The last column has been saved to './Training_data/output_cathepsin_S.csv'.
The last column has been saved to './Training_data/output_cathepsin_D.csv'.
The last column has been saved to './Training_data/output_cathepsin_K.csv'.
