In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('tetrahymena_padel_withDep.csv')

In [3]:
df.head()

Unnamed: 0,Name,Dependent,nAcid,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,nB,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,AUTOGEN_tetrahymena_smiles_1,-0.16,0,10.48,0,0,5,5,0,0,...,30.375082,8.5,1.7,9.0,0.0,0.0,16,0,2.864,20
1,AUTOGEN_tetrahymena_smiles_2,1.64,0,8.847586,0,0,7,5,2,0,...,19.846699,8.696802,1.73936,9.806844,4.626105,2.851624,18,2,0.927,16
2,AUTOGEN_tetrahymena_smiles_3,-2.72,0,5.229172,0,0,6,2,4,0,...,5.337702,3.0,1.5,2.0,2.0,0.0,1,0,-0.499,2
3,AUTOGEN_tetrahymena_smiles_4,-0.87,0,7.429172,0,0,8,4,4,0,...,7.504045,6.87132,1.71783,7.181981,2.31066,4.87132,10,1,-1.524,10
4,AUTOGEN_tetrahymena_smiles_5,-1.32,0,9.527172,0,0,8,4,4,0,...,9.50119,6.732051,1.683013,6.732051,0.0,4.488034,9,0,-0.801,12


In [4]:
# DataFrame of just the descriptors

X = df.drop(['Name', 'Dependent'], axis = 1)

In [5]:
# Series of just the dependent

y = df['Dependent']

In [6]:
# Removal of all columns (descriptors) where there are any missing values

X.dropna(axis = 1, inplace = True)

In [7]:
# Checking to see how many columns removed

X.shape

(1995, 1341)

In [8]:
from sklearn.feature_selection import VarianceThreshold

In [9]:
# Selecting variance threshold, i.e. remove descriptors with a variance less than 0.01

selector = VarianceThreshold(0.01)

In [10]:
# X[X.columns[selector.get_support(indices=True)]]
# selector.get_support(indices=True) returns a list of indexes for columns that weren't dropped
# X.columns[selector.get_support(indices=True) returns a list of all the column names
# Lastly, X[X.columns[selector.get_support(indices=True)]] similar to just saying X[['col1','col2']] 
# returns the DataFrame but with just those selected columns (i.e. the ones not dropped)

selector.fit(X)
X_var = X[X.columns[selector.get_support(indices=True)]]

In [11]:
# Creating a new DataFrame containing the dependent and descriptors that are retained

df_clean1 = pd.concat([y, X_var], axis=1)

In [12]:
df_clean1.head()

Unnamed: 0,Dependent,nAcid,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,nC,nN,...,MW,AMW,WTPT-1,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,-0.16,0,10.48,0,0,5,5,0,1,0,...,151.875411,30.375082,8.5,9.0,0.0,0.0,16,0,2.864,20
1,1.64,0,8.847586,0,0,7,5,2,1,1,...,138.92689,19.846699,8.696802,9.806844,4.626105,2.851624,18,2,0.927,16
2,-2.72,0,5.229172,0,0,6,2,4,1,0,...,32.026215,5.337702,3.0,2.0,2.0,0.0,1,0,-0.499,2
3,-0.87,0,7.429172,0,0,8,4,4,1,2,...,60.032363,7.504045,6.87132,7.181981,2.31066,4.87132,10,1,-1.524,10
4,-1.32,0,9.527172,0,0,8,4,4,1,2,...,76.009519,9.50119,6.732051,6.732051,0.0,4.488034,9,0,-0.801,12


In [13]:
# Creating two correlation matrices
# X_corr - correlation of all the descriptors
# y_corr - correlation of all the descriptors to the dependent

X_corr = X_var.corr().abs()
y_corr = df_clean1.corr()['Dependent'].drop('Dependent').abs()

In [14]:
X_corr.head()

Unnamed: 0,nAcid,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,nC,nN,nO,...,MW,AMW,WTPT-1,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
nAcid,1.0,0.01855,0.08903,0.087435,0.003694,0.008798,0.000395,0.038357,0.119958,0.203195,...,0.022056,0.009613,0.003417,0.060113,0.177695,0.115848,0.028133,0.03444,0.056004,0.017256
apol,0.01855,1.0,0.27754,0.281962,0.94145,0.837675,0.742879,0.920689,0.150397,0.15684,...,0.622084,0.21815,0.82286,0.015571,0.182425,0.137359,0.845034,0.640791,0.72624,0.713496
naAromAtom,0.08903,0.27754,1.0,0.997745,0.035838,0.563983,0.291338,0.524382,0.23556,0.034346,...,0.400042,0.234822,0.62746,0.172816,0.02934,0.257973,0.371444,0.688519,0.477441,0.730745
nAromBond,0.087435,0.281962,0.997745,1.0,0.04178,0.561656,0.282075,0.530639,0.234,0.04476,...,0.393094,0.22355,0.626464,0.157807,0.039744,0.257127,0.370452,0.687392,0.483077,0.729255
nAtom,0.003694,0.94145,0.035838,0.04178,1.0,0.699716,0.90324,0.826523,0.180767,0.199242,...,0.3951,0.460715,0.670315,0.113683,0.223619,0.172685,0.76014,0.45366,0.575468,0.527367


In [15]:
y_corr.head()

nAcid         0.109151
apol          0.471877
naAromAtom    0.326745
nAromBond     0.325695
nAtom         0.289271
Name: Dependent, dtype: float64

In [28]:
# Removal of correlated descriptors
# If two descriptors have a correlation greater than or equal to 0.9, and the columns hasn't appeared before
# Add the column, whom has the lowest correlation with the dependent, to a list.

col_corr_90 = set()

for i in range(len(X_corr.columns)):
    
    for j in range(i):
        
        if (X_corr.iloc[i, j] >= 0.9) and (X_corr.columns[j] not in col_corr_90):
            
            if (y_corr.iloc[i] >= y_corr.iloc[j]):
                colname = X_corr.columns[j]
                col_corr_90.add(colname)
            else:
                colname = X_corr.columns[i]
                col_corr_90.add(colname)

In [29]:
# Remove the columns from the DataFrame that are correlated

df_clean_90 = df_clean1.drop(col_corr_90, axis = 1)

In [30]:
# Checking to see how many columns removed

df_clean_90.shape

(1995, 448)

In [21]:
df_clean_90.head()

Unnamed: 0,Dependent,nN,nX,ATS0m,ATS2m,ATS3m,ATS4m,ATS5m,ATS6m,ATS7m,...,VR3_D,TopoPSA,SRW9,MW,AMW,WTPT-3,WTPT-4,WPATH,WPOL,XLogP
0,-0.16,0,4,5171.074121,7540.215,0.0,0.0,0.0,0.0,0.0,...,7.600902,0.0,0.0,151.875411,30.375082,9.0,0.0,16,0,2.864
1,1.64,1,1,7239.077516,1949.851947,2621.27616,0.0,0.0,0.0,0.0,...,1.257878,43.14,0.0,138.92689,19.846699,9.806844,4.626105,18,2,0.927
2,-2.72,0,0,404.296378,63.536256,3.048192,0.0,0.0,0.0,0.0,...,0.069315,20.23,0.0,32.026215,5.337702,2.0,2.0,1,0,-0.499
3,-0.87,2,0,796.688476,478.062438,281.606409,34.286112,0.0,0.0,0.0,...,0.853778,55.12,0.0,60.032363,7.504045,7.181981,2.31066,10,1,-1.524
4,-1.32,2,0,1568.564075,1144.785369,185.742144,4.064256,0.0,0.0,0.0,...,5.965649,84.13,0.0,76.009519,9.50119,6.732051,0.0,9,0,-0.801


In [31]:
df_clean_90.to_csv('Tetrahymena_SJB_90')

In [32]:
# 0.8

col_corr_80 = set()

for i in range(len(X_corr.columns)):
    
    for j in range(i):
        
        if (X_corr.iloc[i, j] >= 0.8) and (X_corr.columns[j] not in col_corr_80):
            
            if (y_corr.iloc[i] >= y_corr.iloc[j]):
                colname = X_corr.columns[j]
                col_corr_80.add(colname)
            else:
                colname = X_corr.columns[i]
                col_corr_80.add(colname)

In [33]:
df_clean_80 = df_clean1.drop(col_corr_80, axis = 1)

In [34]:
df_clean_80.shape

(1995, 257)

In [35]:
df_clean_80.head()

Unnamed: 0,Dependent,nN,nS,ATS4m,AATS5m,AATS6m,AATS7m,AATS8m,AATS1v,AATS4v,...,topoShape,GGI5,JGI1,VE3_D,VR3_D,TopoPSA,AMW,WTPT-3,WTPT-4,XLogP
0,-0.16,0,0,0.0,0.0,0.0,0.0,0.0,461.995908,0.0,...,1.0,0.0,0.75,-18.3684,7.600902,0.0,30.375082,9.0,0.0,2.864
1,1.64,1,0,0.0,0.0,0.0,0.0,0.0,259.201094,0.0,...,0.5,0.0,0.375,-1.399999,1.257878,43.14,19.846699,9.806844,4.626105,0.927
2,-2.72,0,0,0.0,0.0,0.0,0.0,0.0,145.790596,0.0,...,0.0,0.0,0.0,0.0,0.069315,20.23,5.337702,2.0,2.0,-0.499
3,-0.87,2,0,34.286112,0.0,0.0,0.0,0.0,177.527028,56.548688,...,0.5,0.0,0.166667,0.0,0.853778,55.12,7.504045,7.181981,2.31066,-1.524
4,-1.32,2,1,4.064256,0.0,0.0,0.0,0.0,213.231888,31.083744,...,1.0,0.0,0.5,-14.69472,5.965649,84.13,9.50119,6.732051,0.0,-0.801


In [36]:
df_clean_80.to_csv('Tetrahymena_SJB_80')

In [37]:
# 0.7

col_corr_70 = set()

for i in range(len(X_corr.columns)):
    
    for j in range(i):
        
        if (X_corr.iloc[i, j] >= 0.7) and (X_corr.columns[j] not in col_corr_70):
            
            if (y_corr.iloc[i] >= y_corr.iloc[j]):
                colname = X_corr.columns[j]
                col_corr_70.add(colname)
            else:
                colname = X_corr.columns[i]
                col_corr_70.add(colname)

In [38]:
df_clean_70 = df_clean1.drop(col_corr_70, axis = 1)

In [39]:
df_clean_70.shape

(1995, 151)

In [40]:
df_clean_70.head()

Unnamed: 0,Dependent,nN,nS,AATS5m,AATS6m,AATS7m,AATS6v,AATS4e,AATS5e,AATS1p,...,nTG12Ring,nHeteroRing,nT5HeteroRing,nT10HeteroRing,LipinskiFailures,topoShape,JGI1,VE3_D,WTPT-3,WTPT-4
0,-0.16,0,0,0.0,0.0,0.0,0.0,0.0,0.0,3.6406,...,0,0,0,0,0,1.0,0.75,-18.3684,9.0,0.0
1,1.64,1,0,0.0,0.0,0.0,0.0,0.0,0.0,1.820331,...,0,0,0,0,0,0.5,0.375,-1.399999,9.806844,4.626105
2,-2.72,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.042948,...,0,0,0,0,0,0.0,0.0,0.0,2.0,2.0
3,-0.87,2,0,0.0,0.0,0.0,0.0,8.094816,0.0,1.100043,...,0,0,0,0,0,0.5,0.166667,0.0,7.181981,2.31066
4,-1.32,2,1,0.0,0.0,0.0,0.0,6.718464,0.0,1.635841,...,0,0,0,0,0,1.0,0.5,-14.69472,6.732051,0.0


In [41]:
df_clean_70.to_csv('Tetrahymena_SJB_70')

In [42]:
col_corr_95 = set()

for i in range(len(X_corr.columns)):
    
    for j in range(i):
        
        if (X_corr.iloc[i, j] >= 0.95) and (X_corr.columns[j] not in col_corr_95):
            
            if (y_corr.iloc[i] >= y_corr.iloc[j]):
                colname = X_corr.columns[j]
                col_corr_95.add(colname)
            else:
                colname = X_corr.columns[i]
                col_corr_95.add(colname)

In [35]:
df_clean_95 = df_clean1.drop(col_corr_95, axis = 1)

In [36]:
df_clean_95.shape

(1995, 437)

In [37]:
df_clean_95.head()

Unnamed: 0,Dependent,nN,nX,ATS0m,ATS2m,ATS3m,ATS4m,ATS5m,ATS6m,ATS7m,...,TopoPSA,VAdjMat,SRW6,SRW9,MW,AMW,WTPT-3,WTPT-4,WPOL,XLogP
0,-0.16,0,4,5171.074121,7540.215,0.0,0.0,0.0,0.0,0.0,...,0.0,3.321928,4.859812,0.0,151.875411,30.375082,9.0,0.0,0,2.864
1,1.64,1,1,7239.077516,1949.851947,2621.27616,0.0,0.0,0.0,0.0,...,43.14,3.321928,4.394449,0.0,138.92689,19.846699,9.806844,4.626105,2,0.927
2,-2.72,0,0,404.296378,63.536256,3.048192,0.0,0.0,0.0,0.0,...,20.23,2.0,1.098612,0.0,32.026215,5.337702,2.0,2.0,0,-0.499
3,-0.87,2,0,796.688476,478.062438,281.606409,34.286112,0.0,0.0,0.0,...,55.12,3.0,3.610918,0.0,60.032363,7.504045,7.181981,2.31066,1,-1.524
4,-1.32,2,0,1568.564075,1144.785369,185.742144,4.064256,0.0,0.0,0.0,...,84.13,3.0,4.007333,0.0,76.009519,9.50119,6.732051,0.0,0,-0.801


In [38]:
df_clean_95.to_csv('Tetrahymena_SJB_95')

In [43]:
df_clean1.shape

(1995, 937)

In [44]:
df_clean1.head()

Unnamed: 0,Dependent,nAcid,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,nC,nN,...,MW,AMW,WTPT-1,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,-0.16,0,10.48,0,0,5,5,0,1,0,...,151.875411,30.375082,8.5,9.0,0.0,0.0,16,0,2.864,20
1,1.64,0,8.847586,0,0,7,5,2,1,1,...,138.92689,19.846699,8.696802,9.806844,4.626105,2.851624,18,2,0.927,16
2,-2.72,0,5.229172,0,0,6,2,4,1,0,...,32.026215,5.337702,3.0,2.0,2.0,0.0,1,0,-0.499,2
3,-0.87,0,7.429172,0,0,8,4,4,1,2,...,60.032363,7.504045,6.87132,7.181981,2.31066,4.87132,10,1,-1.524,10
4,-1.32,0,9.527172,0,0,8,4,4,1,2,...,76.009519,9.50119,6.732051,6.732051,0.0,4.488034,9,0,-0.801,12


In [45]:
df_clean1.to_csv('Tetrahymena_SJB_100')

In [46]:
col_corr_60 = set()

for i in range(len(X_corr.columns)):
    
    for j in range(i):
        
        if (X_corr.iloc[i, j] >= 0.60) and (X_corr.columns[j] not in col_corr_60):
            
            if (y_corr.iloc[i] >= y_corr.iloc[j]):
                colname = X_corr.columns[j]
                col_corr_60.add(colname)
            else:
                colname = X_corr.columns[i]
                col_corr_60.add(colname)

In [47]:
df_clean_60 = df_clean1.drop(col_corr_60, axis = 1)

In [48]:
df_clean_60.head()

Unnamed: 0,Dependent,nN,AATS4i,AATS7s,ATSC1m,ATSC2m,ATSC6m,ATSC7m,ATSC8m,ATSC7e,...,nT10Ring,nHeteroRing,nT5HeteroRing,nT10HeteroRing,LipinskiFailures,topoShape,JGI1,VR3_D,AMW,WTPT-3
0,-0.16,0,0.0,0.0,-351.607501,131.852813,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1.0,0.75,7.600902,30.375082,9.0
1,1.64,1,0.0,0.0,-79.613614,-1965.982853,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0.5,0.375,1.257878,19.846699,9.806844
2,-2.72,0,0.0,0.0,-61.775139,-111.122906,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.069315,5.337702,2.0
3,-0.87,2,185.050964,0.0,-46.228028,-126.724003,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0.5,0.166667,0.853778,7.504045,7.181981
4,-1.32,2,184.917652,0.0,-74.148782,282.528425,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1.0,0.5,5.965649,9.50119,6.732051


In [49]:
df_clean_60.to_csv('Tetrahymena_SJB_60')

In [50]:
col_corr_50 = set()

for i in range(len(X_corr.columns)):
    
    for j in range(i):
        
        if (X_corr.iloc[i, j] >= 0.50) and (X_corr.columns[j] not in col_corr_50):
            
            if (y_corr.iloc[i] >= y_corr.iloc[j]):
                colname = X_corr.columns[j]
                col_corr_50.add(colname)
            else:
                colname = X_corr.columns[i]
                col_corr_50.add(colname)

In [51]:
df_clean_50 = df_clean1.drop(col_corr_50, axis = 1)

In [52]:
df_clean_50.head()

Unnamed: 0,Dependent,AATS7s,ATSC2m,ATSC6m,ATSC8m,ATSC7e,ATSC7i,ATSC4s,ATSC5s,ATSC6s,...,MDEN-12,MLFER_A,nT9Ring,nT10Ring,nHeteroRing,topoShape,JGT,VR3_D,AMW,WTPT-5
0,-0.16,0.0,131.852813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.003,0,0,0,1.0,0.75,7.600902,30.375082,0.0
1,1.64,0.0,-1965.982853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.003,0,0,0,0.5,0.486111,1.257878,19.846699,2.851624
2,-2.72,0.0,-111.122906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.348,0,0,0,0.0,0.0,0.069315,5.337702,0.0
3,-0.87,0.0,-126.724003,0.0,0.0,0.0,0.0,-8.984375,0.0,0.0,...,1.0,0.003,0,0,0,0.5,0.277778,0.853778,7.504045,4.87132
4,-1.32,0.0,282.528425,0.0,0.0,0.0,0.0,3.030178,0.0,0.0,...,0.0,0.357,0,0,0,1.0,0.5,5.965649,9.50119,4.488034


In [53]:
df_clean_50.to_csv('Tetrahymena_SJB_50')

In [14]:
col_corr_40 = set()

for i in range(len(X_corr.columns)):
    
    for j in range(i):
        
        if (X_corr.iloc[i, j] >= 0.40) and (X_corr.columns[j] not in col_corr_40):
            
            if (y_corr.iloc[i] >= y_corr.iloc[j]):
                colname = X_corr.columns[j]
                col_corr_40.add(colname)
            else:
                colname = X_corr.columns[i]
                col_corr_40.add(colname)

In [15]:
df_clean_40 = df_clean1.drop(col_corr_40, axis = 1)

In [16]:
df_clean_40.shape

(1995, 36)

In [18]:
df_clean_40.to_csv('Tetrahymena_SJB_40')

In [16]:
col_corr_30 = set()

for i in range(len(X_corr.columns)):
    
    for j in range(i):
        
        if (X_corr.iloc[i, j] >= 0.30) and (X_corr.columns[j] not in col_corr_30):
            
            if (y_corr.iloc[i] >= y_corr.iloc[j]):
                colname = X_corr.columns[j]
                col_corr_30.add(colname)
            else:
                colname = X_corr.columns[i]
                col_corr_30.add(colname)

In [17]:
df_clean_30 = df_clean1.drop(col_corr_30, axis = 1)

In [18]:
df_clean_30.shape

(1995, 19)

In [19]:
df_clean_30.to_csv('Tetrahymena_SJB_30')