### Selecting last two columns

In [5]:
import pandas as pd

# Load the TSV file
# df = pd.read_csv('./Combined_data/cathepsin_B.tsv', sep='\t')
# df = pd.read_csv('./Combined_data/cathepsin_S.tsv', sep='\t')
# df = pd.read_csv('./Combined_data/cathepsin_D.tsv', sep='\t')
df = pd.read_csv('./Combined_data/cathepsin_K.tsv', sep='\t')


# Select the last two columns
last_two_columns = df.iloc[:, -2:]  # Select columns using index slicing

# Save the selected columns to a new TSV file
# last_two_columns.to_csv('./Intermediate_data/last_two_columns_B.tsv', sep='\t', index=False)
# last_two_columns.to_csv('./Intermediate_data/last_two_columns_S.tsv', sep='\t', index=False)
# last_two_columns.to_csv('./Intermediate_data/last_two_columns_D.tsv', sep='\t', index=False)
last_two_columns.to_csv('./Intermediate_data/last_two_columns_K.tsv', sep='\t', index=False)


print("The last two columns have been saved to 'last_two_columns.tsv'.")


The last two columns have been saved to 'last_two_columns.tsv'.


In [9]:
import pandas as pd

# Load the TSV file
# df = pd.read_csv('./Intermediate_data/last_two_columns_B.tsv', sep='\t')
# df = pd.read_csv('./Intermediate_data/last_two_columns_S.tsv', sep='\t')
# df = pd.read_csv('./Intermediate_data/last_two_columns_D.tsv', sep='\t')
df = pd.read_csv('./Intermediate_data/last_two_columns_K.tsv', sep='\t')


# Select the last two columns
last_two_columns = df.iloc[:, -2:]  # Select columns using index slicing

# Remove rows with any empty (NaN) values
cleaned_data = last_two_columns.dropna()

# Save the cleaned data to a new TSV file
# cleaned_data.to_csv('./Intermediate_data/last_two_columns_B.tsv', sep='\t', index=False)
# cleaned_data.to_csv('./Intermediate_data/last_two_columns_S.tsv', sep='\t', index=False)
# cleaned_data.to_csv('./Intermediate_data/last_two_columns_D.tsv', sep='\t', index=False)
cleaned_data.to_csv('./Intermediate_data/last_two_columns_K.tsv', sep='\t', index=False)


print("The cleaned data (last two columns without empty rows) has been saved to 'cleaned_last_two_columns.tsv'.")


The cleaned data (last two columns without empty rows) has been saved to 'cleaned_last_two_columns.tsv'.


### Generating descriptors

In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

# Load the cleaned dataset
# df = pd.read_csv('./Intermediate_data/last_two_columns_B.tsv', sep='\t')
# df = pd.read_csv('./Intermediate_data/last_two_columns_S.tsv', sep='\t')
# df = pd.read_csv('./Intermediate_data/last_two_columns_D.tsv', sep='\t')
df = pd.read_csv('./Intermediate_data/last_two_columns_K.tsv', sep='\t')


# Function to calculate descriptors for a SMILES string
def get_descriptors(smiles):
    # Convert SMILES to molecule object
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * len(Descriptors._descList)  # Return None if SMILES is invalid
    
    # Generate descriptors
    descriptors = [desc[1](mol) for desc in Descriptors._descList]
    return descriptors

# Generate descriptors for each SMILES string in the first column
smiles_list = df.iloc[:, 0]  # Assuming the first column contains SMILES strings
descriptor_data = [get_descriptors(smiles) for smiles in smiles_list]

# Create a DataFrame with descriptors
descriptor_columns = [desc[0] for desc in Descriptors._descList]
descriptor_df = pd.DataFrame(descriptor_data, columns=descriptor_columns)

# Save the descriptors to a CSV file
# descriptor_df.to_csv('./Training_data/input_cathepsin_B.csv', index=False)
# descriptor_df.to_csv('./Training_data/input_cathepsin_S.csv', index=False)
# descriptor_df.to_csv('./Training_data/input_cathepsin_D.csv', index=False)
descriptor_df.to_csv('./Training_data/input_cathepsin_K.csv', index=False)


print("Molecular descriptors have been saved to 'molecular_descriptors.csv'.")


[22:19:48] Explicit valence for atom # 3 N, 4, is greater than permitted


Molecular descriptors have been saved to 'molecular_descriptors.csv'.


### output file

In [17]:
import pandas as pd

# Load the file that has the two columns
# df = pd.read_csv('./Intermediate_data/last_two_columns_B.tsv', sep='\t')
# df = pd.read_csv('./Intermediate_data/last_two_columns_S.tsv', sep='\t')
# df = pd.read_csv('./Intermediate_data/last_two_columns_D.tsv', sep='\t')
df = pd.read_csv('./Intermediate_data/last_two_columns_K.tsv', sep='\t')


# Select the last column
last_column = df.iloc[:, -1]  # This selects the last column

# Save the last column to a new file
# last_column.to_csv('./Training_data/output_cathepsin_B.csv', index=False, header=True)
# last_column.to_csv('./Training_data/output_cathepsin_S.csv', index=False, header=True)
# last_column.to_csv('./Training_data/output_cathepsin_D.csv', index=False, header=True)
last_column.to_csv('./Training_data/output_cathepsin_K.csv', index=False, header=True)


print("The last column has been saved to 'last_column_output.csv'.")


The last column has been saved to 'last_column_output.csv'.
