# Feature Engineering

## 1. Imports and Setup

In [1]:
import pandas as pd
import numpy as np
import sys
import os

# Make the project root (the parent of the notebook directory) importable so
# that `src.*` packages resolve. The membership check keeps this cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.features.molecular import MolecularFeaturizer

## 2. Load Data

In [2]:
# Raw input splits, addressed relative to the notebook's working directory.
TRAIN_PATH = '../data/raw/train.csv'
TEST_PATH = '../data/raw/test.csv'

# Read both CSVs in a fixed order (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH)
)

# Quick sanity check on the loaded dimensions.
print(f'Train shape: {train_df.shape}')
print(f'Test shape: {test_df.shape}')

Train shape: (2662, 427)
Test shape: (666, 426)


## 3. Generate Molecular Descriptors
Generating standard physicochemical descriptors from RDKit.

In [3]:
# One featurizer instance is created here and reused by the later cells.
# It is configured with smiles_col='SMILES' (presumably the column holding
# each molecule's SMILES string; confirm in src.features.molecular).
featurizer = MolecularFeaturizer(smiles_col='SMILES')

# Train split first. The result is bound to a new name (train_desc) rather
# than overwriting train_df.
# NOTE(review): whether generate_descriptors mutates its input frame cannot
# be determined from this notebook; check the featurizer implementation.
print("Generating descriptors for Train set...")
train_desc = featurizer.generate_descriptors(train_df)

# Same transformation for the test split, using the same featurizer.
print("Generating descriptors for Test set...")
test_desc = featurizer.generate_descriptors(test_df)

# Report the resulting dimensions. In the recorded run each split gained
# 217 columns (427 -> 644 and 426 -> 643) with row counts unchanged.
print(f'Train shape after descriptors: {train_desc.shape}')
print(f'Test shape after descriptors: {test_desc.shape}')

2026-01-12 06:46:39,016 - src.features.molecular - INFO - Generating 217 RDKit descriptors for 2662 molecules...


Generating descriptors for Train set...


2026-01-12 06:46:48,293 - src.features.molecular - INFO - Generating 217 RDKit descriptors for 666 molecules...


Generating descriptors for Test set...
Train shape after descriptors: (2662, 644)
Test shape after descriptors: (666, 643)


## 4. Generate Fingerprints
Generating Morgan fingerprints (radius=2, bits=2048) and MACCS keys.

In [4]:
# Stage 1: Morgan fingerprints, layered on top of the descriptor frames.
# No radius or bit-size arguments are passed, so the featurizer's own
# defaults apply. The generator is consumed in order: train, then test.
print("Generating Morgan fingerprints...")
train_morgan, test_morgan = (
    featurizer.generate_morgan_fingerprints(split_frame)
    for split_frame in (train_desc, test_desc)
)

# Stage 2: MACCS keys, layered on top of the Morgan-augmented frames,
# again train first and test second.
print("Generating MACCS keys...")
train_full, test_full = (
    featurizer.generate_maccs_keys(split_frame)
    for split_frame in (train_morgan, test_morgan)
)

# Final dimensions of the fully featurized splits.
print(f'Final Train shape: {train_full.shape}')
print(f'Final Test shape: {test_full.shape}')

2026-01-12 06:46:50,595 - src.features.molecular - INFO - Generating Morgan fingerprints (r=2, bits=2048)...


Generating Morgan fingerprints...


2026-01-12 06:46:51,115 - src.features.molecular - INFO - Generating Morgan fingerprints (r=2, bits=2048)...
2026-01-12 06:46:51,236 - src.features.molecular - INFO - Generating MACCS keys...


Generating MACCS keys...


2026-01-12 06:46:52,162 - src.features.molecular - INFO - Generating MACCS keys...


Final Train shape: (2662, 2859)
Final Test shape: (666, 2858)


## 5. Clean and Save
Check for NaNs introduced during feature generation, then save the featurized train and test sets to `data/processed/`.

In [5]:
# Output locations for the featurized splits.
TRAIN_OUT_PATH = '../data/processed/train_featurized.csv'
TEST_OUT_PATH = '../data/processed/test_featurized.csv'

# Count missing values across every column of each featurized frame.
# This is a report only; nothing is dropped or imputed here.
print("NaNs in Train:", train_full.isna().sum().sum())
print("NaNs in Test:", test_full.isna().sum().sum())

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first. exist_ok=True makes this a no-op when the
# directory is already there, keeping the cell safe to re-run.
for out_path in (TRAIN_OUT_PATH, TEST_OUT_PATH):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Save without the pandas index so the files contain feature columns only.
train_full.to_csv(TRAIN_OUT_PATH, index=False)
test_full.to_csv(TEST_OUT_PATH, index=False)

print("Saved processed datasets to data/processed/")

NaNs in Train: 0
NaNs in Test: 0
Saved processed datasets to data/processed/
