In [1]:
from pathlib import Path, PureWindowsPath

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Default size (width, height) applied to every figure created afterwards.
plt.rc("figure", figsize=(6, 4))

# Visible confirmation that the import cell ran to completion.
print("Imports OK")

Imports OK


In [7]:
# Read the xyz-path -> SMILES mapping table from the processed-data folder.
smiles_df = pd.read_csv("../data/processed/xyz_smiles_map.csv")

# Report (row count, column count), then render the frame itself.
print((len(smiles_df), len(smiles_df.columns)))
smiles_df


(927, 3)


Unnamed: 0,xyz_path,smiles,error
0,data\raw\xyz\1\1.xyz,Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(F)c(F)c(B(c3c(...,
1,data\raw\xyz\1\1CH3OH.xyz,CO.Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(F)c(F)c(B(c...,
2,data\raw\xyz\1\1CO2.xyz,Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(F)c(F)c(B(c3c(...,
3,data\raw\xyz\1\1H2.xyz,,RDKit could not parse SMILES: c1(c(c(c(c(c1[PH...
4,data\raw\xyz\1\1H2CO.xyz,,RDKit could not parse SMILES: c1(c(c(c(c(c1P(c...
...,...,...,...
922,data\raw\xyz\99\99CO2.xyz,C[Si]1(C)(Cl)OC(=O)N2CCCN3CCCN1[C]32,
923,data\raw\xyz\99\99H2.xyz,C[SiH](C)(Cl)N1CCCN2CCCN[C]21,
924,data\raw\xyz\99\99H2CO.xyz,C[Si]1(C)(Cl)OCN2CCCN3CCCN1[C]32,
925,data\raw\xyz\99\99H2O.xyz,C[Si](C)(O)(Cl)N1CCCN2CCCN[C]21,


In [3]:
# Throw away every row whose "smiles" entry is missing; the result replaces
# smiles_df, so later cells see only rows that carry a SMILES string.
smiles_df = smiles_df[~smiles_df["smiles"].isna()].copy()
print("Rows with valid SMILES:", smiles_df.shape[0])
smiles_df.head(5)


Rows with valid SMILES: 327


Unnamed: 0,xyz_path,smiles,error
0,data\raw\xyz\1\1.xyz,Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(F)c(F)c(B(c3c(...,
1,data\raw\xyz\1\1CH3OH.xyz,CO.Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(F)c(F)c(B(c...,
2,data\raw\xyz\1\1CO2.xyz,Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(F)c(F)c(B(c3c(...,
5,data\raw\xyz\1\1H2O.xyz,Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(F)c(F)c(B(c3c(...,
7,data\raw\xyz\10\10.xyz,Bc1ccccc1N1C(C)(C)CCCC1(C)C,


In [4]:
# Load CO2.csv and list its column names.
# A bare `energy_df.head()` used to sit between these two lines; its result
# was discarded because only a cell's last expression is rendered, so the
# dead statement has been dropped.
energy_df = pd.read_csv("../data/raw/graphs_csv/CO2.csv")
energy_df.columns


Index(['FLPID', 'OLD ID', 'LA-LB distance (A)', 'CO2', 'E_CO2', 'H_CO2',
       'G_CO2', 'E_CO2_sol', 'H_CO2_sol', 'G_CO2_sol'],
      dtype='object')

In [6]:
def is_co2_xyz(path_str: str) -> bool:
    """Return True when the file name of ``path_str`` contains ``"CO2"``.

    Only the final path component is inspected; directory names are ignored.
    The match is case-sensitive.

    Parameters
    ----------
    path_str : str
        Path to an xyz file. Both backslash- and forward-slash-separated
        paths are accepted.

    Returns
    -------
    bool
        True if ``"CO2"`` occurs in the file name, False otherwise.
    """
    # The CSV stores backslash-separated paths. On POSIX, Path(...).name would
    # return the whole string (backslash is not a separator there), so parent
    # directories would leak into the check. PureWindowsPath splits on both
    # separators regardless of the operating system.
    name = PureWindowsPath(path_str).name  # e.g. "1CO2.xyz"
    return "CO2" in name

# 1) Keep only the rows whose xyz file is a CO2 file.
# smiles_df is whatever an earlier cell left under that name. If the
# "valid SMILES" filter cell ran first, it no longer holds every row of the
# CSV, so the first count is labelled as the filter input rather than as the
# size of the full CSV.
co2_df = smiles_df[smiles_df["xyz_path"].apply(is_co2_xyz)].copy()
print("Rows in smiles_df before CO2 filter:", len(smiles_df))
print("Rows with CO2 xyz:", len(co2_df))
co2_df

Total rows in full CSV: 327
Rows with CO2 xyz: 24


Unnamed: 0,xyz_path,smiles,error
2,data\raw\xyz\1\1CO2.xyz,Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(F)c(F)c(B(c3c(...,
16,data\raw\xyz\103\103CO2.xyz,Cc1cc(C)c(B2OC(=O)P(c3c(C)cc(C)cc3C)c3cccc4ccc...,
23,data\raw\xyz\104\104CO2.xyz,Cc1cccc(C)c1B1OC(=O)P(c2c(C)cccc2C)c2cccc3cccc...,
114,data\raw\xyz\120\120CO2.xyz,CC1(C)CCCC(C)(C)N1CCCB(c1c(F)c(F)c(F)c(F)c1F)c...,
128,data\raw\xyz\123\123CO2.xyz,CC1(C)OB(c2ccccc2P(c2ccccc2)c2ccccc2)OC1(C)C.O...,
135,data\raw\xyz\124\124CO2.xyz,CC1(C)COB(c2ccccc2P(c2ccccc2)c2ccccc2)OC1.O=C=O,
149,data\raw\xyz\126\126CO2.xyz,CC(C)P(c1ccccc1B1Oc2ccccc2O1)C(C)C.O=C=O,
177,data\raw\xyz\130\130CO2.xyz,Cc1cc(C)c(B(c2cc(C)c(C)cc2C)c2ccccc2N(C)C)cc1C...,
254,data\raw\xyz\140\140CO2.xyz,CC(C)(C)P(c1ccc([C@H]2[C@H](B(c3c(F)c(F)c(F)c(...,
261,data\raw\xyz\141\141CO2.xyz,CC1(C)[C@@H]2CC[C@@]1(C)[C@H](N1CCCCC1)[C@H]2B...,


In [8]:
import pandas as pd
from pathlib import Path

# Re-read the full mapping table from disk; this replaces any filtered
# version of smiles_df left behind by earlier cells.
smiles_df = pd.read_csv("../data/processed/xyz_smiles_map.csv")
print("Total rows in CSV:", smiles_df.shape[0])
smiles_df.head(5)


Total rows in CSV: 927


Unnamed: 0,xyz_path,smiles,error
0,data\raw\xyz\1\1.xyz,Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(F)c(F)c(B(c3c(...,
1,data\raw\xyz\1\1CH3OH.xyz,CO.Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(F)c(F)c(B(c...,
2,data\raw\xyz\1\1CO2.xyz,Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(F)c(F)c(B(c3c(...,
3,data\raw\xyz\1\1H2.xyz,,RDKit could not parse SMILES: c1(c(c(c(c(c1[PH...
4,data\raw\xyz\1\1H2CO.xyz,,RDKit could not parse SMILES: c1(c(c(c(c(c1P(c...


In [11]:
import pandas as pd
from pathlib import Path

# Reload the full mapping so the counts below cover every row of the CSV,
# including rows without a SMILES string.
smiles_df = pd.read_csv("../data/processed/xyz_smiles_map.csv")
print("Total rows in CSV:", len(smiles_df))

# xyz_path holds backslash-separated paths. PureWindowsPath splits on both
# "\" and "/" on every OS, whereas Path(p).name would return the entire
# string on POSIX and leave "filename" equal to the full path.
smiles_df["filename"] = smiles_df["xyz_path"].apply(lambda p: PureWindowsPath(p).name)

# Case-insensitive match on the bare file name.
co2_mask = smiles_df["filename"].str.lower().str.contains("co2")
co2_all = smiles_df[co2_mask].copy()

# How many CO2 files exist, and how many of them have / lack a SMILES string.
print("Total CO2 xyz rows (no SMILES filter):", len(co2_all))
print("CO2 rows with non-null SMILES:", co2_all["smiles"].notna().sum())
print("CO2 rows with NULL SMILES:", co2_all["smiles"].isna().sum())

co2_all[["xyz_path", "filename", "smiles"]]



Total rows in CSV: 927
Total CO2 xyz rows (no SMILES filter): 132
CO2 rows with non-null SMILES: 24
CO2 rows with NULL SMILES: 108


Unnamed: 0,xyz_path,filename,smiles
2,data\raw\xyz\1\1CO2.xyz,1CO2.xyz,Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(F)c(F)c(B(c3c(...
9,data\raw\xyz\10\10CO2.xyz,10CO2.xyz,
16,data\raw\xyz\103\103CO2.xyz,103CO2.xyz,Cc1cc(C)c(B2OC(=O)P(c3c(C)cc(C)cc3C)c3cccc4ccc...
23,data\raw\xyz\104\104CO2.xyz,104CO2.xyz,Cc1cccc(C)c1B1OC(=O)P(c2c(C)cccc2C)c2cccc3cccc...
30,data\raw\xyz\105\105CO2.xyz,105CO2.xyz,
...,...,...,...
894,data\raw\xyz\95\95CO2.xyz,95CO2.xyz,
901,data\raw\xyz\96\96CO2.xyz,96CO2.xyz,
908,data\raw\xyz\97\97CO2.xyz,97CO2.xyz,
915,data\raw\xyz\98\98CO2.xyz,98CO2.xyz,


In [20]:
from pathlib import Path
import py3Dmol

# Point this at an actual CO2 xyz file in your repo.
xyz_path = Path("../data/raw/xyz/99/99CO2.xyz")  # change to e.g. 1/1CO2.xyz if needed

# Read the whole file into one string so it can be handed to the viewer.
xyz_str = xyz_path.read_text()

# Build a 500 x 400 viewer, load the structure, style it and fit it to view.
view = py3Dmol.view(width=500, height=400)
view.addModel(xyz_str, "xyz")  # tell py3Dmol the text is in XYZ format
view.setStyle({"stick": {}})   # stick representation
view.zoomTo()

# Render the viewer in the cell output.
view.show()


In [18]:
import pandas as pd

# Load the CO2-only SMILES table (presumably produced with Open Babel alone,
# judging by the file name; confirm against the script that wrote it).
co2_smiles = pd.read_csv("../data/processed/co2_xyz_smiles_obabelonly.csv")

# Row total, then how many rows do / do not carry a SMILES string.
# Series.count() counts the non-missing entries of the column.
print("Total CO2 rows:", co2_smiles.shape[0])
print("Non-null SMILES:", co2_smiles["smiles"].count())
print("Null SMILES:", co2_smiles.shape[0] - co2_smiles["smiles"].count())

co2_smiles


Total CO2 rows: 132
Non-null SMILES: 132
Null SMILES: 0


Unnamed: 0,xyz_path,filename,smiles,error
0,data\raw\xyz\1\1CO2.xyz,1CO2.xyz,c1(c(c(c(c(c1P(c1c(cc(cc1C)C)C)c1c(cc(cc1C)C)C...,
1,data\raw\xyz\10\10CO2.xyz,10CO2.xyz,c12ccccc1[N]1(C(CCCC1(C)C)(C)C)C(=O)O[BH2]2,
2,data\raw\xyz\103\103CO2.xyz,103CO2.xyz,P1(c2cccc3cccc(B(c4c(cc(cc4C)C)C)OC1=O)c23)c1c...,
3,data\raw\xyz\104\104CO2.xyz,104CO2.xyz,P1(c2cccc3cccc(B(c4c(cccc4C)C)OC1=O)c23)c1c(cc...,
4,data\raw\xyz\105\105CO2.xyz,105CO2.xyz,[P]1(c2c3c([B](c4c(cc(cc4C)C)C)(c4c(cc(cc4C)C)...,
...,...,...,...,...
127,data\raw\xyz\95\95CO2.xyz,95CO2.xyz,[P@@H]1(/C(=C\C(C)(C)C)/[B](c2c(c(c(c(c2F)F)F)...,
128,data\raw\xyz\96\96CO2.xyz,96CO2.xyz,[P]1(c2c(c(c(c(c2F)F)F)F)F)(c2c(c(c(c(c2F)F)F)...,
129,data\raw\xyz\97\97CO2.xyz,97CO2.xyz,[N@@]12[C]3[N@](CCCN3C(=O)O[B@]32[C@H]2CCC[C@@...,
130,data\raw\xyz\98\98CO2.xyz,98CO2.xyz,[N@@]12[C]3[N@@](CCCN3C(=O)O[B]2(C2CCCCC2)C2CC...,
