In [1]:
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from collections import defaultdict

df = pd.read_csv("./smiles_train_set.csv")
df.head()

Unnamed: 0,blend_id,smiles,oil_property_param_value
0,49743a76-a614-11ee-9529-005056921581,CCCCC,103300.0
1,49743a76-a614-11ee-9529-005056921581,CCCC(C)CCC,103300.0
2,49743a76-a614-11ee-9529-005056921581,CCC(C(OC)=O)CC,103300.0
3,49743a76-a614-11ee-9529-005056921581,CCCCC(C)C,103300.0
4,49743a76-a614-11ee-9529-005056921581,CC(C)(C)CC(C)(C)C,103300.0


In [75]:
import numpy as np

dct_oil = defaultdict(list)
dct_param = defaultdict(float)
for idx in df.index:
    s = df["smiles"][idx].strip()
    mol = Chem.MolFromSmiles(s)
    if mol != None: # ошибка для 'O=S(C1=CC=C([C18H21])C=C1)(O)=O'
        fp = list(AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=4096))
        k = np.array(fp, dtype=int)
        blend_id = df["blend_id"][idx]
        dct_oil[blend_id].append(k)
        dct_param[blend_id] = df["oil_property_param_value"][idx]

data = pd.DataFrame()
for blend_id,mols in dct_oil.items():
    row = pd.DataFrame({"Molecules": [""], "oil_property_param_value": [dct_param[blend_id]]})
    row.at[0, "Molecules"] = dct_oil[blend_id]
    data = pd.concat([data, row], ignore_index=True)

[20:49:19] Explicit valence for atom # 5 C, 5, is greater than permitted
[20:49:19] Explicit valence for atom # 6 C, 5, is greater than permitted
[20:49:19] SMILES Parse Error: syntax error while parsing: O=S(C1=CC=C([C18H21])C=C1)(O)=O
[20:49:19] SMILES Parse Error: Failed parsing SMILES 'O=S(C1=CC=C([C18H21])C=C1)(O)=O' for input: 'O=S(C1=CC=C([C18H21])C=C1)(O)=O'
[20:49:19] Explicit valence for atom # 6 C, 5, is greater than permitted
[20:49:19] Explicit valence for atom # 6 C, 5, is greater than permitted
[20:49:19] Explicit valence for atom # 6 C, 5, is greater than permitted
[20:49:20] Explicit valence for atom # 6 C, 5, is greater than permitted
[20:49:20] Explicit valence for atom # 6 C, 5, is greater than permitted
[20:49:20] SMILES Parse Error: syntax error while parsing: O=S(C1=CC=C([C18H21])C=C1)(O)=O
[20:49:20] SMILES Parse Error: Failed parsing SMILES 'O=S(C1=CC=C([C18H21])C=C1)(O)=O' for input: 'O=S(C1=CC=C([C18H21])C=C1)(O)=O'
[20:49:20] Explicit valence for atom # 6 C,

In [76]:
Xd = data["Molecules"].apply(lambda x: np.sum(x, axis=0))
Xd = pd.DataFrame(Xd.to_list(), columns=list(range(len(X[0]))))
yd = data["oil_property_param_value"]

In [77]:
yd.to_string()

'0      103300.0\n1      103100.0\n2      161250.0\n3        6806.0\n4      116600.0\n5       90075.0\n6       26595.0\n7        7507.0\n8       12530.0\n9       86845.0\n10      14565.0\n11     125000.0\n12     131900.0\n13      34500.0\n14      12515.0\n15     131300.0\n16     385200.0\n17     111850.0\n18     853000.0\n19     107750.0\n20     173050.0\n21     129200.0\n22      72465.0\n23     147550.0\n24      46330.0\n25      21580.0\n26      13360.0\n27      10650.0\n28     189900.0\n29     127350.0\n30      38060.0\n31     131000.0\n32      33065.0\n33      83760.0\n34     143800.0\n35       8222.0\n36      16225.0\n37      61500.0\n38     119200.0\n39     198300.0\n40      17035.0\n41      22610.0\n42      25650.0\n43       7678.0\n44     130450.0\n45      13565.0\n46      10010.0\n47     145200.0\n48     131700.0\n49       6738.0\n50      31560.0\n51      12890.0\n52       7369.0\n53      28325.0\n54      20325.0\n55       8607.0\n56     174300.0\n57     121950.0\n58          N

In [81]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB

def print_metrics(X_test, y_test, clf):
    y_pred = clf.predict(X_test)
    evs = explained_variance_score(y_test, y_pred)
    print("EVS:", evs)

In [83]:
X = Xd.to_numpy()
y = yd.to_numpy()
y[np.isnan(y)] = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
clf = GaussianNB()
clf.fit(X_train, y_train)
print_metrics(X_test, y_test, clf)

EVS: 0.6699758193356073


In [94]:
X = Xd.to_numpy()
y = yd.to_numpy()
y[np.isnan(y)] = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
clf = SVR(kernel="poly", C=100.0, epsilon=0.1)
clf.fit(X_train, y_train)
print_metrics(X_test, y_test, clf)

EVS: 0.03942798381198087
