In [1]:
!pip install rdkit pandas numpy scikit-learn requests

Collecting rdkit
  Downloading rdkit-2024.3.3-cp38-cp38-macosx_10_13_x86_64.whl.metadata (3.9 kB)
Collecting numpy
  Downloading numpy-1.22.4-cp38-cp38-macosx_10_15_x86_64.whl.metadata (2.0 kB)
Downloading rdkit-2024.3.3-cp38-cp38-macosx_10_13_x86_64.whl (29.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.5/29.5 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hDownloading numpy-1.22.4-cp38-cp38-macosx_10_15_x86_64.whl (17.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[?25hInstalling collected packages: numpy, rdkit
  Attempting uninstall: numpy
    Found existing installation: numpy 1.24.3
    Uninstalling numpy-1.24.3:
      Successfully uninstalled numpy-1.24.3
Successfully installed numpy-1.22.4 rdkit-2024.3.3


In [10]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import requests
import tarfile
import os

In [11]:
# Step 1: Download the QM9 dataset
url = 'https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb9.tar.gz'

# Stream the archive to disk in fixed-size chunks instead of reading the whole
# body into memory at once. The context manager releases the connection, the
# timeout keeps a stalled server from hanging the cell, and raise_for_status()
# turns an HTTP error into an exception *before* anything is written, so an
# error page is never saved under the archive's name.
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open('gdb9.tar.gz', 'wb') as file:
        for chunk in response.iter_content(chunk_size=1 << 20):
            file.write(chunk)

In [12]:
# Step 2: Extract the dataset
with tarfile.open('gdb9.tar.gz') as tar:
    # The archive comes from the network, so restrict extraction to plain
    # files/directories inside the current directory when the interpreter
    # supports extraction filters (feature-detected as the tarfile docs
    # recommend). Interpreters without filter support keep the old behaviour.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(filter='data')
    else:
        tar.extractall()

In [14]:
# Step 3: Load the dataset
# Iterate the SD file once and keep only the records RDKit managed to parse;
# records it could not parse are yielded as None and are dropped here.
file_path = 'gdb9.sdf'
supplier = Chem.SDMolSupplier(file_path)
molecules = list(filter(lambda mol: mol is not None, supplier))

[17:40:27] Explicit valence for atom # 1 C, 5, is greater than permitted
[17:40:27] ERROR: Could not sanitize molecule ending on line 9097
[17:40:27] ERROR: Explicit valence for atom # 1 C, 5, is greater than permitted
[17:40:27] Explicit valence for atom # 1 C, 5, is greater than permitted
[17:40:27] ERROR: Could not sanitize molecule ending on line 35785
[17:40:27] ERROR: Explicit valence for atom # 1 C, 5, is greater than permitted
[17:40:28] Explicit valence for atom # 4 C, 5, is greater than permitted
[17:40:28] ERROR: Could not sanitize molecule ending on line 62866
[17:40:28] ERROR: Explicit valence for atom # 4 C, 5, is greater than permitted
[17:40:28] Explicit valence for atom # 2 C, 5, is greater than permitted
[17:40:28] ERROR: Could not sanitize molecule ending on line 66832
[17:40:28] ERROR: Explicit valence for atom # 2 C, 5, is greater than permitted
[17:40:28] Explicit valence for atom # 2 C, 5, is greater than permitted
[17:40:28] ERROR: Could not sanitize molecule en

In [23]:
# Show the property names RDKit reports for the first parsed molecule.
print([name for name in molecules[0].GetPropNames()])

[]


In [None]:
# Extracting features and dipole moments
features = []
dipole_moments = []

# Inspecting a parsed molecule shows it exposes no SD data fields, so
# mol.GetProp('dipole_moment') raises KeyError. When the field is absent the
# label is looked up in the companion table instead.
# NOTE(review): assumes the archive also extracted 'gdb9.sdf.csv', that its
# 'mol_id' column matches each SD record's title line ('_Name'), and that the
# dipole moment is stored in column 'mu' -- confirm against the extracted files.
dipole_by_id = None  # loaded lazily, only if a molecule lacks the SD field

for mol in molecules:
    # Resolve the label first so a failure never leaves a feature row behind
    # without a matching target.
    if mol.HasProp('dipole_moment'):
        dipole_moment = float(mol.GetProp('dipole_moment'))
    else:
        if dipole_by_id is None:
            label_table = pd.read_csv('gdb9.sdf.csv')
            dipole_by_id = dict(
                zip(label_table['mol_id'].astype(str),
                    label_table['mu'].astype(float))
            )
        mol_id = mol.GetProp('_Name').strip() if mol.HasProp('_Name') else ''
        if mol_id not in dipole_by_id:
            # Fail loudly rather than silently dropping or mislabelling a row.
            raise KeyError(
                f"no dipole moment found for molecule {mol_id!r} "
                "(neither as an SD field nor in 'gdb9.sdf.csv')"
            )
        dipole_moment = dipole_by_id[mol_id]

    # Morgan fingerprints are computed from the molecular graph only, so no
    # coordinates need to be generated first; the earlier Compute2DCoords call
    # was dropped because it only overwrote the conformers read from the file.
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    features.append(np.array(fingerprint))
    dipole_moments.append(dipole_moment)

# One row per molecule (1024 fingerprint bits) and the matching target vector.
X = np.array(features)
y = np.array(dipole_moments)

In [None]:
# Step 4: Split the dataset
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the data
# The scaler's statistics come from the training split only and are then
# applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)