In [23]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import math
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from rdkit import Chem 


# Read the tab-separated file and keep only the two columns used below:
# the SMILES string and the AC50 value.
wanted_columns = ['SMILES', 'AC50']
data = pd.read_table('cardiotoxity-regression.txt', sep='\t')[wanted_columns]
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33354 entries, 0 to 33353
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   SMILES  32928 non-null  object 
 1   AC50    14493 non-null  float64
dtypes: float64(1), object(1)
memory usage: 521.3+ KB


In [24]:
# Cleaning pipeline, applied in this order:
#   1. drop rows that have no SMILES string,
#   2. keep only the last row for each distinct SMILES,
#   3. replace every remaining missing value with 0.
# NOTE(review): after step 1 the only column that can still hold NaN is AC50,
# so step 3 assigns AC50 = 0 to rows without a measurement — confirm that this
# is the intended encoding for those rows.
data = (
    data
    .dropna(subset=['SMILES'])
    .drop_duplicates(subset=['SMILES'], keep='last')
    .fillna(0)
)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7690 entries, 1209 to 33353
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   SMILES  7690 non-null   object 
 1   AC50    7690 non-null   float64
dtypes: float64(1), object(1)
memory usage: 180.2+ KB


In [None]:
def get_mol(x):
    """Convert a SMILES value into an RDKit molecule, tolerating failures.

    Args:
        x: The value handed to ``Chem.MolFromSmiles``.

    Returns:
        The result of ``Chem.MolFromSmiles(x)``, or ``None`` when that call
        raises an ordinary exception.
    """
    try:
        return Chem.MolFromSmiles(x)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must propagate so a long-running ``.apply`` can still be interrupted.
        return None
# Build one molecule (or None) per row, in row order, from the SMILES column.
data['molecula'] = [get_mol(smiles) for smiles in data['SMILES']]

In [30]:
# Remove the rows for which no molecule object was produced (None / NaN in
# the 'molecula' column), then print the frame summary.
required_columns = ['molecula']
data = data.dropna(axis=0, subset=required_columns)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7524 entries, 1209 to 33353
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SMILES    7524 non-null   object 
 1   AC50      7524 non-null   float64
 2   molecula  7524 non-null   object 
dtypes: float64(1), object(2)
memory usage: 235.1+ KB


In [36]:
from rdkit.Chem import Fragments

# Names of the RDKit ``Fragments`` functions to evaluate on every molecule.
# Each name is used both to look up the function on the ``Fragments`` module
# and as the name of the resulting column, so adding a descriptor only
# requires adding its name here. Order determines column order.
FRAGMENT_DESCRIPTORS = (
    'fr_Al_COO',
    'fr_Al_OH',
    'fr_ArN',
    'fr_Ar_COO',
    'fr_Ar_N',
    'fr_Ar_NH',
    'fr_Ar_OH',
    'fr_COO',
    'fr_COO2',
    'fr_C_O',
    'fr_aldehyde',
    'fr_amide',
    'fr_amidine',
    'fr_aniline',
    'fr_aryl_methyl',
    'fr_azide',
    'fr_barbitur',
    'fr_benzene',
)

# One column per descriptor: apply the matching Fragments function to each
# molecule in the 'molecula' column.
for descriptor_name in FRAGMENT_DESCRIPTORS:
    data[descriptor_name] = data['molecula'].apply(getattr(Fragments, descriptor_name))


In [37]:
# Correlate the numeric columns only. `data` still contains the SMILES strings
# and the molecule objects (object dtype); pandas >= 2.0 raises on such columns
# instead of silently skipping them, so they are excluded explicitly. On older
# pandas the resulting matrix is the same as a plain `data.corr()`.
corr = data.select_dtypes(include='number').corr()

# Annotated heatmap of the correlation matrix, colour scale fixed to [-1, 1].
plt.figure(figsize = (16,5))
sns.heatmap(corr, vmax=1, vmin=-1,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            annot=True, fmt=".3f")

Unnamed: 0,AC50,fr_Al_COO,fr_Al_OH,fr_ArN,fr_Ar_COO,fr_Ar_N,fr_Ar_NH,fr_Ar_OH,fr_COO,fr_COO2,fr_C_O,fr_aldehyde,fr_amide,fr_amidine,fr_aniline,fr_aryl_methyl,fr_azide,fr_barbitur,fr_benzene
AC50,1.0,-0.047673,-0.038397,0.00204,-0.043564,-0.016063,-0.026427,0.035366,-0.063145,-0.063983,-0.025534,0.001873,-0.028964,0.010404,0.020098,-0.010935,-0.008158,-0.011498,0.11731
fr_Al_COO,-0.047673,1.0,0.031895,-0.027981,-0.044009,0.006869,-0.005832,-0.025908,0.89295,0.88992,0.419492,-0.034133,0.047706,0.018993,-0.004838,-0.015277,-0.004424,-0.009897,-0.002742
fr_Al_OH,-0.038397,0.031895,1.0,-0.040551,-0.035772,-0.007417,-0.002743,0.019788,0.012995,0.012599,0.030208,-0.024281,-0.004803,0.006282,-0.043195,-0.068699,0.003607,-0.010623,-0.101989
fr_ArN,0.00204,-0.027981,-0.040551,1.0,-0.015033,0.149124,0.018171,-0.035962,-0.032314,-0.032773,-0.086221,-0.033111,-0.024067,0.019641,0.540595,0.038891,-0.004034,-0.009024,0.106227
fr_Ar_COO,-0.043564,-0.044009,-0.035772,-0.015033,1.0,0.031675,-0.016095,0.011644,0.410423,0.408924,0.147934,-0.018678,-0.011644,0.013785,0.051554,-0.002954,-0.002814,-0.006295,0.083987
fr_Ar_N,-0.016063,0.006869,-0.007417,0.149124,0.031675,1.0,0.294756,-0.059997,0.020542,0.019615,-0.061758,-0.050517,0.023111,-0.012496,0.221042,0.252172,0.043001,-0.0144,-0.016747
fr_Ar_NH,-0.026427,-0.005832,-0.002743,0.018171,-0.016095,0.294756,1.0,-0.023905,-0.012576,-0.012799,-0.051214,-0.015604,-0.026985,-0.010516,-0.006943,0.025821,0.051465,-0.004253,-0.016046
fr_Ar_OH,0.035366,-0.025908,0.019788,-0.035962,0.011644,-0.059997,-0.023905,1.0,-0.018402,-0.018891,0.030388,0.007507,-0.04945,-0.015358,-0.055763,0.02892,-0.003983,-0.008912,0.290255
fr_COO,-0.063145,0.89295,0.012995,-0.032314,0.410423,0.020542,-0.012576,-0.018402,1.0,0.99656,0.449561,-0.039572,0.038298,0.023547,0.018814,-0.015276,-0.005306,-0.01187,0.035341
fr_COO2,-0.063983,0.88992,0.012599,-0.032773,0.408924,0.019615,-0.012799,-0.018891,0.99656,1.0,0.448927,-0.03976,0.037796,0.023284,0.017848,-0.016075,-0.005327,-0.011918,0.033299
