### Project Python Code

In [10]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

# Load the raw annotation table (tab-separated file).
df = pd.read_csv("mtbs_tropical_annotations.tsv", sep="\t")

# Keep only the columns of interest: the SMILES string and the three
# NPClassifier taxonomy columns.
df_small = df[[
    "structure_smiles",
    "structure_taxonomy_npclassifier_01pathway",
    "structure_taxonomy_npclassifier_02superclass",
    "structure_taxonomy_npclassifier_03class"
]]

# Remove rows with a missing structure_smiles. This deliberately starts from
# the column subset (df_small) rather than the raw frame, so the column
# selection above is kept instead of being overwritten.
df_cleaned = df_small.dropna(subset=["structure_smiles"], axis=0)

# Check for duplicated SMILES. The count is stored and printed because a bare
# expression in the middle of a cell is evaluated but never displayed.
n_duplicated_smiles = int(df_cleaned.duplicated(subset=["structure_smiles"]).sum())
print(f"Rows with a duplicated structure_smiles: {n_duplicated_smiles}")

#collapse duplicate smiles by taking the most common class, subclass, my_class
def take_most_common(series: pd.Series):
    """Return the most frequent non-missing value in ``series``.

    Missing entries are dropped before counting. When nothing is left after
    dropping them (empty input or all values missing), ``np.nan`` is returned.
    """
    observed = series.dropna()
    if len(observed) == 0:
        return np.nan
    frequencies = observed.value_counts()
    return frequencies.idxmax()

# Shorter column names for easier handling downstream.
column_aliases = {
    "structure_smiles": "smiles",
    "structure_taxonomy_npclassifier_01pathway": "class",
    "structure_taxonomy_npclassifier_02superclass": "subclass",
    "structure_taxonomy_npclassifier_03class": "my_class",
}
df_cleaned = df_cleaned.rename(columns=column_aliases)

# Collapse duplicates: one row per SMILES, keeping the most common label in
# each of the three taxonomy columns.
taxonomy_columns = ["class", "subclass", "my_class"]
grouped_by_smiles = df_cleaned.groupby("smiles", as_index=False)
df_collapsed = grouped_by_smiles[taxonomy_columns].agg(take_most_common)

# Give each SMILES a sequential identifier (S1, S2, ...), as in the R code,
# and put the identifier first.
df_collapsed["SID"] = [f"S{position}" for position in range(1, len(df_collapsed) + 1)]
df_collapsed = df_collapsed[["SID", "smiles", *taxonomy_columns]]


#calculate chemical descriptors
def calc_desc(smiles):
    """Compute a fixed set of RDKit descriptors for one SMILES string.

    Parameters
    ----------
    smiles
        SMILES string passed to ``Chem.MolFromSmiles``.

    Returns
    -------
    dict or None
        ``None`` when ``Chem.MolFromSmiles`` returns ``None`` for the input;
        otherwise a dict with the keys ``MolWt``, ``LogP``, ``TPSA``, ``HBD``,
        ``HBA``, ``RingCount`` and ``FractionCSP3``, in that order.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    # Output label -> RDKit descriptor function; order fixes the key order.
    descriptor_functions = (
        ("MolWt", Descriptors.MolWt),
        ("LogP", Descriptors.MolLogP),
        ("TPSA", Descriptors.TPSA),
        ("HBD", Descriptors.NumHDonors),
        ("HBA", Descriptors.NumHAcceptors),
        ("RingCount", Descriptors.RingCount),
        ("FractionCSP3", Descriptors.FractionCSP3),
    )
    return {label: compute(molecule) for label, compute in descriptor_functions}

# One descriptor dict per SMILES; calc_desc returns None when the SMILES
# could not be parsed.
desc_list = df_collapsed["smiles"].apply(calc_desc)

# Report unparsable SMILES instead of letting them break the table build.
n_unparsed_smiles = int(desc_list.isna().sum())
if n_unparsed_smiles:
    print(f"SMILES that could not be parsed: {n_unparsed_smiles}")

# pd.DataFrame cannot build a frame from a list that mixes dicts and None, so
# each None is replaced by an empty record, which becomes a row of NaN.
# The index is pinned to df_collapsed so the column-wise concat below lines up
# row for row.
desc_df = pd.DataFrame(
    [desc if desc is not None else {} for desc in desc_list],
    index=df_collapsed.index,
)
df_final = pd.concat([df_collapsed, desc_df], axis=1)




In [11]:
# Display the collapsed table with descriptor columns (the rendered output
# below is truncated in the middle).
df_final

Unnamed: 0,SID,smiles,class,subclass,my_class,MolWt,LogP,TPSA,HBD,HBA,RingCount,FractionCSP3
0,S1,C#C/C=C\CCCC#C/C=C/CCCCCCC/C=C\C#C,Fatty acids,Fatty acyls,Fatty alcohols,292.466,5.8258,0.00,0,0,0,0.454545
1,S2,C#C/C=C\CCCC#CCCCCCCCCCCC#C,Fatty acids,Fatty acyls|Fatty Acids and Conjugates,Fatty alcohols|Unsaturated fatty acids,282.471,5.8837,0.00,0,0,0,0.619048
2,S3,C#C/C=C\CCCCC#CCCCCC#CCCCC#CCO,Fatty acids,Fatty acyls,Fatty alcohols,308.465,4.4694,20.23,1,1,0,0.545455
3,S4,C#CC#CC/C=C/CCCCC/C=C/C(=O)N1CCCCC1,Alkaloids,Lysine alkaloids,Piperidine alkaloids,297.442,4.0885,20.31,0,1,1,0.550000
4,S5,C#CC#CC=CC=CC=CCCO,Fatty acids,Fatty Acids and Conjugates,Unsaturated fatty acids,172.227,1.6740,20.23,1,1,0,0.166667
...,...,...,...,...,...,...,...,...,...,...,...,...
24041,S24042,c1ccc2c(c1)oc1ccccc12,Shikimates and Phenylpropanoids,Coumarins,Simple coumarins,168.195,3.5860,13.14,0,1,3,0.000000
24042,S24043,c1ccc2c3c([nH]c2c1)[C@@H]1CCN2CCC[C@@H]2N1CC3,Alkaloids,Tryptophan alkaloids,Carboline alkaloids,267.376,2.8927,22.27,1,2,5,0.529412
24043,S24044,c1cncc(-c2ncc(-c3ccc4c(c3)OCO4)o2)c1,Alkaloids,Tyrosine alkaloids|Nicotinic acid alkaloids,Oxazole alkaloids|Pyridine alkaloids,266.256,3.1323,57.38,0,5,4,0.066667
24044,S24045,c1cncc(C2=NCCC2)c1,Alkaloids,Nicotinic acid alkaloids,Pyridine alkaloids,146.193,1.6645,25.25,0,2,2,0.333333
