In [None]:
!pip install pandas numpy scikit-learn xgboost rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [None]:
pip install "numpy<2"

Collecting numpy<2
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,

# RDKit descriptor features for the training set

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
import numpy as np

# ---------------------------------------------------
# File locations -- change these two values to match your setup.
OUTPUT_CSV = "rdkit_train_features.csv"
INPUT_CSV = "/content/train.csv"          # or your local path
# ---------------------------------------------------

# Read the raw input table; later code in this cell uses its "SMILES" and
# "id" columns.
df = pd.read_csv(filepath_or_buffer=INPUT_CSV)

# (column name, descriptor function) pairs, in output-column order. Every
# function is the attribute of rdkit.Chem.Descriptors with the same name.
RDKit_features = [
    (descriptor_name, getattr(Descriptors, descriptor_name))
    for descriptor_name in (
        'MolWt',
        'HeavyAtomCount',
        'TPSA',
        'MolLogP',
        'NumRotatableBonds',
        'FractionCSP3',
        'RingCount',
        'NHOHCount',
        'NOCount',
        'MaxPartialCharge',
        'MinPartialCharge',
        'LabuteASA',
    )
]

# Define feature calculation function
def featurize_smiles(smiles):
    """Compute every descriptor listed in ``RDKit_features`` for one SMILES.

    Parameters
    ----------
    smiles : str
        SMILES string handed to ``Chem.MolFromSmiles``.

    Returns
    -------
    list
        One entry per ``RDKit_features`` item, in the same order. A
        descriptor that raises yields ``np.nan`` for that entry only. If the
        SMILES does not parse (``MolFromSmiles`` returns ``None``) or an
        unexpected error occurs, every entry is ``np.nan``.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return [np.nan] * len(RDKit_features)
        values = []
        for _name, func in RDKit_features:
            try:
                val = func(mol)
            except Exception:
                # Best effort: one failing descriptor must not discard the row.
                val = np.nan
            values.append(val)
        return values
    except Exception as e:
        # Report the cause as well, so the log explains why the row was blanked.
        print(f"Error processing SMILES: {smiles} ({type(e).__name__}: {e})")
        return [np.nan] * len(RDKit_features)

# Apply featurization: one list of descriptor values per input row.
feature_names = [name for name, _ in RDKit_features]
feature_matrix = df["SMILES"].apply(featurize_smiles)
feature_df = pd.DataFrame(feature_matrix.tolist(), columns=feature_names)

# Add ID for merging later in Kaggle
feature_df.insert(0, "id", df["id"])

# Count molecules for which every descriptor is missing. Only the descriptor
# columns are checked: "id" comes straight from the input file, so including
# it would make the all-NaN test false for any row with an id and the count
# would always be 0.
n_failed = feature_df[feature_names].isna().all(axis=1).sum()
print(f"Number of molecules with completely missing features: {n_failed}")

# Save to CSV
feature_df.to_csv(OUTPUT_CSV, index=False)
print("✅ RDKit features saved to:", OUTPUT_CSV)
print(feature_df.head())


Number of molecules with completely missing features: 0
✅ RDKit features saved to: rdkit_train_features.csv
       id     MolWt  HeavyAtomCount    TPSA   MolLogP  NumRotatableBonds  \
0   87817   232.323              17   26.30   3.98170                  8   
1  106919   598.919              45   24.06  12.35960                 16   
2  388772  1003.207              73  122.27  14.21700                 15   
3  519416   542.726              42   24.06  11.00768                  7   
4  539187   965.154              70  182.28  11.84500                 34   

   FractionCSP3  RingCount  NHOHCount  NOCount  MaxPartialCharge  \
0      0.533333          1          0        2               NaN   
1      0.441860          5          2        2               NaN   
2      0.145161         10          0        9               NaN   
3      0.100000          6          2        2               NaN   
4      0.518519          6          0       16               NaN   

   MinPartialCharge   Labu

In [None]:
# Read the saved feature file back and print a quick sanity summary:
# first rows, per-column NaN counts, then the number of rows.
rdkit_df = pd.read_csv(filepath_or_buffer="rdkit_train_features.csv")
print(rdkit_df.head(), rdkit_df.isna().sum(), len(rdkit_df), sep="\n")

       id     MolWt  HeavyAtomCount    TPSA   MolLogP  NumRotatableBonds  \
0   87817   232.323              17   26.30   3.98170                  8   
1  106919   598.919              45   24.06  12.35960                 16   
2  388772  1003.207              73  122.27  14.21700                 15   
3  519416   542.726              42   24.06  11.00768                  7   
4  539187   965.154              70  182.28  11.84500                 34   

   FractionCSP3  RingCount  NHOHCount  NOCount  MaxPartialCharge  \
0      0.533333          1          0        2               NaN   
1      0.441860          5          2        2               NaN   
2      0.145161         10          0        9               NaN   
3      0.100000          6          2        2               NaN   
4      0.518519          6          0       16               NaN   

   MinPartialCharge   LabuteASA  
0               NaN  103.990949  
1               NaN  273.210536  
2               NaN  426.096572 

In [None]:
import pandas as pd

# Reload the descriptor table written by the featurization cell.
rdkit_train_df = pd.read_csv(filepath_or_buffer="rdkit_train_features.csv")

# Report (rows, columns) before any column is removed.
print("Original shape:", rdkit_train_df.shape)

Original shape: (7973, 13)


In [None]:
# The two partial-charge descriptors are NaN almost everywhere, so they are
# removed; every other column is kept.
columns_to_drop = ["MaxPartialCharge", "MinPartialCharge"]
rdkit_train_df_clean = rdkit_train_df.drop(columns_to_drop, axis="columns")

# Report (rows, columns) of the reduced table.
print("New shape after dropping columns:", rdkit_train_df_clean.shape)

New shape after dropping columns: (7973, 11)


In [None]:
# Per-column count of missing values left after the drop.
missing_per_column = rdkit_train_df_clean.isna().sum()
print(missing_per_column)

id                   0
MolWt                0
HeavyAtomCount       0
TPSA                 0
MolLogP              0
NumRotatableBonds    0
FractionCSP3         0
RingCount            0
NHOHCount            0
NOCount              0
LabuteASA            0
dtype: int64


In [None]:
# Write the cleaned table (row index omitted), then confirm and preview it.
rdkit_train_df_clean.to_csv(path_or_buf="rdkit_train_features_clean.csv", index=False)
print(
    "✅ Clean RDKit training features saved as rdkit_train_features_clean.csv",
    rdkit_train_df_clean.head(),
    sep="\n",
)


✅ Clean RDKit training features saved as rdkit_train_features_clean.csv
       id     MolWt  HeavyAtomCount    TPSA   MolLogP  NumRotatableBonds  \
0   87817   232.323              17   26.30   3.98170                  8   
1  106919   598.919              45   24.06  12.35960                 16   
2  388772  1003.207              73  122.27  14.21700                 15   
3  519416   542.726              42   24.06  11.00768                  7   
4  539187   965.154              70  182.28  11.84500                 34   

   FractionCSP3  RingCount  NHOHCount  NOCount   LabuteASA  
0      0.533333          1          0        2  103.990949  
1      0.441860          5          2        2  273.210536  
2      0.145161         10          0        9  426.096572  
3      0.100000          6          2        2  248.856424  
4      0.518519          6          0       16  411.049910  


# RDKit descriptor features for the test set

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
import numpy as np

# ---------------------------------------------------
# File locations -- change these two values to match your setup.
OUTPUT_CSV = "rdkit_test_features.csv"
INPUT_CSV = "/content/test.csv"          # or your local path
# ---------------------------------------------------

# Read the raw input table; later code in this cell uses its "SMILES" and
# "id" columns.
df = pd.read_csv(filepath_or_buffer=INPUT_CSV)

# (column name, descriptor function) pairs, in output-column order. Every
# function is the attribute of rdkit.Chem.Descriptors with the same name.
RDKit_features = [
    (descriptor_name, getattr(Descriptors, descriptor_name))
    for descriptor_name in (
        'MolWt',
        'HeavyAtomCount',
        'TPSA',
        'MolLogP',
        'NumRotatableBonds',
        'FractionCSP3',
        'RingCount',
        'NHOHCount',
        'NOCount',
        'MaxPartialCharge',
        'MinPartialCharge',
        'LabuteASA',
    )
]

# Define feature calculation function
def featurize_smiles(smiles):
    """Compute every descriptor listed in ``RDKit_features`` for one SMILES.

    Parameters
    ----------
    smiles : str
        SMILES string handed to ``Chem.MolFromSmiles``.

    Returns
    -------
    list
        One entry per ``RDKit_features`` item, in the same order. A
        descriptor that raises yields ``np.nan`` for that entry only. If the
        SMILES does not parse (``MolFromSmiles`` returns ``None``) or an
        unexpected error occurs, every entry is ``np.nan``.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return [np.nan] * len(RDKit_features)
        values = []
        for _name, func in RDKit_features:
            try:
                val = func(mol)
            except Exception:
                # Best effort: one failing descriptor must not discard the row.
                val = np.nan
            values.append(val)
        return values
    except Exception as e:
        # Report the cause as well, so the log explains why the row was blanked.
        print(f"Error processing SMILES: {smiles} ({type(e).__name__}: {e})")
        return [np.nan] * len(RDKit_features)

# Apply featurization: one list of descriptor values per input row.
feature_names = [name for name, _ in RDKit_features]
feature_matrix = df["SMILES"].apply(featurize_smiles)
feature_df = pd.DataFrame(feature_matrix.tolist(), columns=feature_names)

# Add ID for merging later in Kaggle
feature_df.insert(0, "id", df["id"])

# Count molecules for which every descriptor is missing. Only the descriptor
# columns are checked: "id" comes straight from the input file, so including
# it would make the all-NaN test false for any row with an id and the count
# would always be 0.
n_failed = feature_df[feature_names].isna().all(axis=1).sum()
print(f"Number of molecules with completely missing features: {n_failed}")

# Save to CSV
feature_df.to_csv(OUTPUT_CSV, index=False)
print("✅ RDKit features saved to:", OUTPUT_CSV)
print(feature_df.head())


Number of molecules with completely missing features: 0
✅ RDKit features saved to: rdkit_test_features.csv
           id    MolWt  HeavyAtomCount   TPSA  MolLogP  NumRotatableBonds  \
0  1109053969  540.463              39  43.18   7.3603                  8   
1  1422188626  510.589              39  52.60   7.2845                  9   
2  2032016830  586.644              44  93.22   6.1875                 13   

   FractionCSP3  RingCount  NHOHCount  NOCount  MaxPartialCharge  \
0      0.103448          4          0        4               NaN   
1      0.085714          5          0        4               NaN   
2      0.222222          6          0        8               NaN   

   MinPartialCharge   LabuteASA  
0               NaN  219.089169  
1               NaN  227.715314  
2               NaN  255.153204  


In [None]:
# Read the saved feature file back and print a quick sanity summary:
# first rows, per-column NaN counts, then the number of rows.
rdkit_df = pd.read_csv(filepath_or_buffer="rdkit_test_features.csv")
print(rdkit_df.head(), rdkit_df.isna().sum(), len(rdkit_df), sep="\n")

           id    MolWt  HeavyAtomCount   TPSA  MolLogP  NumRotatableBonds  \
0  1109053969  540.463              39  43.18   7.3603                  8   
1  1422188626  510.589              39  52.60   7.2845                  9   
2  2032016830  586.644              44  93.22   6.1875                 13   

   FractionCSP3  RingCount  NHOHCount  NOCount  MaxPartialCharge  \
0      0.103448          4          0        4               NaN   
1      0.085714          5          0        4               NaN   
2      0.222222          6          0        8               NaN   

   MinPartialCharge   LabuteASA  
0               NaN  219.089169  
1               NaN  227.715314  
2               NaN  255.153204  
id                   0
MolWt                0
HeavyAtomCount       0
TPSA                 0
MolLogP              0
NumRotatableBonds    0
FractionCSP3         0
RingCount            0
NHOHCount            0
NOCount              0
MaxPartialCharge     3
MinPartialCharge     3
Labute

In [None]:
import pandas as pd

# Load the previously saved RDKit *test* features under a name that says so.
rdkit_test_df = pd.read_csv("rdkit_test_features.csv")

# NOTE: the following cells still read `rdkit_train_df`, so that name is kept
# bound to the same frame. Be aware that this replaces the training frame
# loaded under that name earlier in the notebook.
rdkit_train_df = rdkit_test_df

print("Original shape:", rdkit_test_df.shape)

Original shape: (3, 13)


In [None]:
# The two partial-charge descriptors are NaN almost everywhere, so they are
# removed; every other column is kept.
# Note: in this section `rdkit_train_df` is the frame read from
# rdkit_test_features.csv by the previous cell, despite its name.
columns_to_drop = ["MaxPartialCharge", "MinPartialCharge"]
rdkit_train_df_clean = rdkit_train_df.drop(columns_to_drop, axis="columns")

# Report (rows, columns) of the reduced table.
print("New shape after dropping columns:", rdkit_train_df_clean.shape)

New shape after dropping columns: (3, 11)


In [None]:
# Per-column count of missing values left after the drop.
missing_per_column = rdkit_train_df_clean.isna().sum()
print(missing_per_column)

id                   0
MolWt                0
HeavyAtomCount       0
TPSA                 0
MolLogP              0
NumRotatableBonds    0
FractionCSP3         0
RingCount            0
NHOHCount            0
NOCount              0
LabuteASA            0
dtype: int64


In [None]:
# Save cleaned file
rdkit_train_df_clean.to_csv("rdkit_test_features_clean.csv", index=False)

# The message names the file actually written above: the test features, not
# the training ones.
print("✅ Clean RDKit test features saved as rdkit_test_features_clean.csv")
print(rdkit_train_df_clean.head())


✅ Clean RDKit training features saved as rdkit_train_features_clean.csv
           id    MolWt  HeavyAtomCount   TPSA  MolLogP  NumRotatableBonds  \
0  1109053969  540.463              39  43.18   7.3603                  8   
1  1422188626  510.589              39  52.60   7.2845                  9   
2  2032016830  586.644              44  93.22   6.1875                 13   

   FractionCSP3  RingCount  NHOHCount  NOCount   LabuteASA  
0      0.103448          4          0        4  219.089169  
1      0.085714          5          0        4  227.715314  
2      0.222222          6          0        8  255.153204  
