In [2]:
!pip install pandas numpy scikit-learn requests rdkit



In [24]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import requests
import tarfile
import os

In [3]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import requests
import tarfile
import os

In [4]:
# Step 1: Download the QM9 dataset
url = 'https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb9.tar.gz'
archive_path = 'gdb9.tar.gz'

# Skip the download when the archive is already present so re-running the notebook is cheap.
if not os.path.exists(archive_path):
    partial_path = archive_path + '.part'
    # The context manager releases the connection even if the download fails midway.
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail here on an HTTP error instead of saving an error page as the archive.
        response.raise_for_status()
        with open(partial_path, 'wb') as file:
            # Stream in 1 MiB chunks rather than holding the whole body in memory.
            for chunk in response.iter_content(chunk_size=1 << 20):
                file.write(chunk)
    # Rename only after a complete download so an interrupted run never leaves
    # a truncated file under the final name.
    os.replace(partial_path, archive_path)

In [5]:
# Step 2: Extract the dataset
with tarfile.open('gdb9.tar.gz') as tar:
    # The archive was fetched over the network, so prefer the 'data' extraction filter,
    # which refuses members that would be written outside the destination directory.
    # Python versions without extraction filters do not define `tarfile.data_filter`;
    # fall back to the plain call there.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(filter='data')
    else:
        tar.extractall()

In [6]:
# Load the extracted property table into a DataFrame and preview its first rows.
csv_path = "gdb9.sdf.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,mol_id,A,B,C,mu,alpha,homo,lumo,gap,r2,zpve,u0,u298,h298,g298,cv,u0_atom,u298_atom,h298_atom,g298_atom
0,gdb_1,157.7118,157.70997,157.70699,0.0,13.21,-0.3877,0.1171,0.5048,35.3641,0.044749,-40.47893,-40.476062,-40.475117,-40.498597,6.469,-395.999595,-398.64329,-401.014647,-372.471772
1,gdb_2,293.60975,293.54111,191.39397,1.6256,9.46,-0.257,0.0829,0.3399,26.1563,0.034358,-56.525887,-56.523026,-56.522082,-56.544961,6.316,-276.861363,-278.620271,-280.399259,-259.338802
2,gdb_3,799.58812,437.90386,282.94545,1.8511,6.31,-0.2928,0.0687,0.3615,19.0002,0.021375,-76.404702,-76.401867,-76.400922,-76.422349,6.002,-213.087624,-213.974294,-215.159658,-201.407171
3,gdb_4,0.0,35.610036,35.610036,0.0,16.28,-0.2845,0.0506,0.3351,59.5248,0.026841,-77.308427,-77.305527,-77.304583,-77.327429,8.574,-385.501997,-387.237686,-389.016047,-365.800724
4,gdb_5,0.0,44.593883,44.593883,2.8937,12.99,-0.3604,0.0191,0.3796,48.7476,0.016601,-93.411888,-93.40937,-93.408425,-93.431246,6.278,-301.820534,-302.906752,-304.091489,-288.720028


In [11]:
# Step 3: Build the target vector (column 'mu', the dipole moment) and the feature
# matrix (every remaining column except the 'mol_id' identifier).
non_feature_columns = ['mol_id', 'mu']
y = df['mu'].values
X = df.drop(columns=non_feature_columns).values

In [12]:
# Step 4: Hold out 20% of the rows as a test set, with a fixed random_state
split_arrays = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_arrays

In [13]:
# Step 5: Standardize the data, fitting the scaler on the training split only
# NOTE: X_train / X_test are overwritten below, so re-run the split cell before
# re-running this one; otherwise the scaler is refit on already-standardized data.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [14]:
# Gradient-boosting regressor configuration (fixed random_state for repeatable fits).
model = GradientBoostingRegressor(
    random_state=42,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=500,
)


In [15]:
# Fit the regressor on the training split; the fitted estimator is the cell's output.
model.fit(X=X_train, y=y_train)

GradientBoostingRegressor(n_estimators=500, random_state=42)

In [16]:
# Predict on both splits first, then score each with mean absolute error.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print(f'Train MAE: {train_mae}')
print(f'Test MAE: {test_mae}')

Train MAE: 0.8080036510766141
Test MAE: 0.8187040859943444


In [20]:
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors

In [27]:
def predict_dipole_moment(smiles):
    """Run the trained model for a molecule given as a SMILES string.

    WARNING: the model was trained on the 18 property columns of the training
    table (A, B, C, alpha, homo, lumo, gap, r2, zpve, u0, u298, h298, g298, cv,
    u0_atom, u298_atom, h298_atom, g298_atom). None of those values is computed
    here: every feature is filled with the molecular weight as a placeholder.
    The returned number is therefore not a meaningful dipole-moment prediction
    (it can even come out negative) until real feature values are supplied.

    Relies on the module-level ``scaler`` and ``model`` fitted in earlier cells.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    float
        The model output for the placeholder feature vector.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles``.
    """
    import warnings  # local import keeps this cell independent of the import cells

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail with a clear
        # message instead of an obscure error further down inside RDKit.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    # Order must match the training columns (the table minus 'mol_id' and 'mu').
    feature_names = (
        'A', 'B', 'C', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve',
        'u0', 'u298', 'h298', 'g298', 'cv',
        'u0_atom', 'u298_atom', 'h298_atom', 'g298_atom',
    )

    warnings.warn(
        "predict_dipole_moment fills all model features with the molecular weight "
        "as a placeholder; the result is not a meaningful prediction.",
        RuntimeWarning,
        stacklevel=2,
    )

    # Placeholder: the same molecular weight stands in for every feature.
    # No coordinates are needed for this, so no 2D coordinate generation is done.
    mol_wt = Descriptors.MolWt(mol)
    features = np.array([mol_wt] * len(feature_names)).reshape(1, -1)

    standardized_features = scaler.transform(features)
    prediction = model.predict(standardized_features)
    return prediction[0]


In [28]:
# Example call: "CCO" is the SMILES string for ethanol.
ethanol_smiles = "CCO"
predict_dipole_moment(ethanol_smiles)

-0.2650618363373659