In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, r2_score

plt.rcParams["figure.figsize"] = (6, 4)

print("Imports OK")


Imports OK


In [35]:
co2_smiles = pd.read_csv("../data/processed/co2_xyz_smiles_obabelonly.csv")
print("Total CO2 rows:", len(co2_smiles))
co2_smiles.head()

energy_df = pd.read_csv("../data/raw/graphs_csv/CO2.csv")
energy_df.head()

Total CO2 rows: 132


Unnamed: 0,FLPID,OLD ID,LA-LB distance (A),CO2,E_CO2,H_CO2,G_CO2,E_CO2_sol,H_CO2_sol,G_CO2_sol
0,0BP06011,1,6.247,CO2,-4.9,-4.0,4.9,-4.8,-3.9,5.1
1,0BP07011,2,2.217,CO2,-5.6,-4.3,8.1,-9.7,-8.4,4.0
2,0BN08011,3,3.518,CO2,-7.3,-5.1,9.0,-11.3,-9.1,4.9
3,0BP08011,4,1.807,CO2,-12.7,-11.6,0.9,-15.5,-14.4,-1.9
4,0BP08012,5,1.79,CO2,-14.4,-13.5,0.0,-16.9,-16.0,-2.5


In [41]:
energy_df["FLPID"] = energy_df["FLPID"].astype(str).str.strip()
flpid_list = energy_df["FLPID"].unique().tolist()

print("Example FLPIDs:", flpid_list)

Example FLPIDs: ['0BP06011', '0BP07011', '0BN08011', '0BP08011', '0BP08012', '0BN03011', '0BN12011', '0BN17011', '0BN17012', '0BN17013', '0BN17014', '0BN17015', '0BN17016', '0BN17017', '0BP13011', '0BP12011', '0BP16011', '0BN10011', '0BP15031', '0BP15032', '0BP16031', '0BP16032', '0BN16011', '0BN16012', '0BP09011', '0BP09012', '0BP09013', '0BP09014', '0BP16021', '0BP16022', '0BP16023', '0BN11031', '0BN14011', '0BN14012', '0BN14013', '0BN14014', '0BN14015', '0BN14016', '0BN15021', '0PN12011', '0AN13011', '0BN18021', '0BP14011', '0BP13021', '0BP13022', '0BP17011', '0BP20031', '0BP20041', '0BP16051', '0BP19011', '0BN18011', '0BN16021', '0BP17021', '0BN15011', '0BN10021', '0BN19011', '0BP09021', '0BN09011', '0BN11021', '0BN11022', '0BP19021', '0BP18011', '0BP20051', '0BP20061', '0BP20011', '0AP20011', '0?P15011', '0?P19011', '0?P19012', '0BP19031', '0BP16061', '0BN11011', '0BN11012', '0BN11013', '0BN11014', '0BN11015', '0BP15011', '0BP15021', '0BP15022', '0BP15023', '0BP19041', '0BP19051',

In [42]:
def extract_old_id_from_xyz(path_str: str) -> int | None:
    """
    Example: 'data/raw/xyz/17/17CO2.xyz' -> 17
    Takes the filename and pulls the first integer.
    """
    name = Path(path_str).name  # e.g. '17CO2.xyz'
    m = re.search(r"\d+", name)
    return int(m.group()) if m else None

co2_smiles["OLD_ID_num"] = co2_smiles["xyz_path"].astype(str).apply(extract_old_id_from_xyz)
co2_smiles[["xyz_path", "OLD_ID_num"]].head()
print("Unique OLD_ID_num:", sorted(co2_smiles["OLD_ID_num"].unique()))

Unique OLD_ID_num: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 83, 84, 85, 86, 87, 88, 89, 90, 91, 94, 95, 96, 97, 98, 99, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114, 115, 120, 122, 123, 124, 125, 126, 127, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157]


In [43]:
def extract_old_id_from_energy(x) -> int | None:
    """
    Example: '1', '01', '001' -> 1
    """
    m = re.search(r"\d+", str(x))
    return int(m.group()) if m else None

energy_df["OLD_ID_num"] = energy_df["OLD ID"].apply(extract_old_id_from_energy)
energy_df[["OLD ID", "OLD_ID_num"]].head()
print("Unique OLD_ID_num in energy_df:", sorted(energy_df["OLD_ID_num"].unique()))

Unique OLD_ID_num in energy_df: [1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 83, 84, 85, 86, 87, 88, 89, 90, 91, 94, 95, 96, 97, 98, 99, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114, 115, 120, 122, 123, 124, 125, 126, 127, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157]


In [45]:
co2_master = co2_smiles.merge(
    energy_df,
    on="OLD_ID_num",   # same column name in both
    how="inner",
)

print("co2_master shape:", co2_master.shape)
co2_master[["FLPID", "xyz_path", "OLD_ID_num", "OLD ID"]].head()


co2_master shape: (131, 15)


Unnamed: 0,FLPID,xyz_path,OLD_ID_num,OLD ID
0,0BP06011,data\raw\xyz\1\1CO2.xyz,1,1
1,0BN17012,data\raw\xyz\10\10CO2.xyz,10,10
2,0BP11011,data\raw\xyz\103\103CO2.xyz,103,103
3,0BP11012,data\raw\xyz\104\104CO2.xyz,104,104
4,0BP13031,data\raw\xyz\105\105CO2.xyz,105,105


In [53]:
# Dataset building 

TARGET_COL = "E_CO2"  # target energy

dataset = co2_master[["FLPID", "xyz_path", "smiles", TARGET_COL]].copy()
dataset = dataset.dropna(subset=["smiles", TARGET_COL])

print("Final dataset shape:", dataset.shape)
dataset.head()



Final dataset shape: (131, 4)


Unnamed: 0,FLPID,xyz_path,smiles,E_CO2
0,0BP06011,data\raw\xyz\1\1CO2.xyz,c1(c(c(c(c(c1P(c1c(cc(cc1C)C)C)c1c(cc(cc1C)C)C...,-4.9
1,0BN17012,data\raw\xyz\10\10CO2.xyz,c12ccccc1[N]1(C(CCCC1(C)C)(C)C)C(=O)O[BH2]2,8.7
2,0BP11011,data\raw\xyz\103\103CO2.xyz,P1(c2cccc3cccc(B(c4c(cc(cc4C)C)C)OC1=O)c23)c1c...,-9.4
3,0BP11012,data\raw\xyz\104\104CO2.xyz,P1(c2cccc3cccc(B(c4c(cccc4C)C)OC1=O)c23)c1c(cc...,-8.9
4,0BP13031,data\raw\xyz\105\105CO2.xyz,[P]1(c2c3c([B](c4c(cc(cc4C)C)C)(c4c(cc(cc4C)C)...,12.1


In [54]:
# SMILES to Morgan fingerprints

from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, r2_score

def smiles_to_morgan(smiles: str, radius: int = 2, n_bits: int = 2048) -> np.ndarray:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(n_bits, dtype=int)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    arr = np.zeros((n_bits,), dtype=int)
    for i in range(n_bits):
        arr[i] = fp.GetBit(i)
    return arr

print("dataset shape before fingerprints:", dataset.shape)

X = np.vstack(dataset["smiles"].apply(smiles_to_morgan).to_list())
y = dataset[TARGET_COL].to_numpy()

print("X shape:", X.shape)
print("y shape:", y.shape)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

y_pred = ridge.predict(X_test)
print("Ridge MAE:", mean_absolute_error(y_test, y_pred))
print("Ridge R² :", r2_score(y_test, y_pred))



dataset shape before fingerprints: (131, 4)
X shape: (131, 2048)
y shape: (131,)
Ridge MAE: 10.465342662657955
Ridge R² : 0.011664112106616553
