In [1]:
import os
import pandas as pd
import numpy as np
import copy
import seaborn as sns

from collections import Counter
from imblearn.over_sampling import ADASYN

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

from rdkit import RDLogger  
RDLogger.DisableLog('rdApp.*') 

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.metrics import pairwise_distances
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics

from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

print(rdkit.__version__)

2021.03.2


# Loading the CSV files

In [2]:
# Read the three challenge tables. The first CSV column is consumed as the
# index and then discarded, so every frame ends up with a fresh 0..n-1 index.
data_train, smiles_test, sample_sub = (
    pd.read_csv(csv_name, index_col=0).reset_index(drop=True)
    for csv_name in ("data_train.csv", "smiles_test.csv", "sample_submission.csv")
)

# Data Inspection

In [3]:
data_train

Unnamed: 0,smiles,task1,task2,task3,task4,task5,task6,task7,task8,task9,task10,task11
0,CC(=O)N(C)c1cccc(-c2ccnc3c(C(=O)c4cccs4)cnn23)c1,0,0,0,0,0,0,0,-1,0,0,0
1,COc1cc(N)c(Cl)cc1C(=O)OCCCN1CCCCC1.Cl,0,0,0,0,0,0,0,-1,0,0,0
2,CCCCNc1c(C(=O)OCC)cnc2c1cnn2CC,0,0,0,0,0,0,0,0,0,1,0
3,C#Cc1cccc(Nc2ncnc3cc(OCCOC)c(OCCOC)cc23)c1.Cl,0,0,0,0,0,0,0,-1,0,0,1
4,CC1OC2(CCCCC2Oc2cccc(Cl)c2)N=C1O,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
11995,CC(C)(C)NC[C@@H](O)COc1nsnc1N1CCOCC1,0,0,0,0,0,0,0,-1,0,0,0
11996,CCC[C@@]1(CCc2ccccc2)CC(O)=C([C@H](CC)c2cccc(N...,0,0,0,0,0,0,0,-1,0,0,0
11997,N=C(O)c1cnc(C2CC2)[nH]1,0,0,0,-1,0,0,0,0,0,0,0
11998,CN=C=O,0,0,0,0,0,0,-1,0,0,0,0


In [4]:
smiles_test

Unnamed: 0,smiles
0,OC(COc1ccc(Cl)cc1)=N[C@H]1CC[C@H](N=C(O)COc2cc...
1,CCCO/N=C(/C)c1cc(C(O)=NC(Cc2cc(F)cc(F)c2)[C@@H...
2,COc1cc(Cl)ccc1Cl
3,COc1cc(C(O)=NCc2ccc(OCCN(C)C)cc2)cc(OC)c1OC
4,CCC(=O)O[C@@]1(C(=O)CCl)[C@@H](C)C[C@H]2[C@@H]...
...,...
5891,N#Cc1cc(NC(=O)C(=O)O)c(Cl)c(NC(=O)C(=O)O)c1.NC...
5892,O=c1cccc2n1C[C@@H]1CNC[C@H]2C1
5893,CSCC[C@H](N=C(O)[C@H](Cc1ccccc1)N=C(O)CN=C(O)C...
5894,CCn1cc2c3c(cc(C(O)=NC(Cc4ccccc4)[C@H](O)C[NH2+...


In [5]:
sample_sub

Unnamed: 0,task1,task2,task3,task4,task5,task6,task7,task8,task9,task10,task11
0,0.965388,0.669021,0.364129,0.248534,0.082723,0.101662,0.894853,0.099291,0.931158,0.132221,0.617906
1,0.972610,0.986971,0.060073,0.286885,0.865854,0.805776,0.481583,0.715330,0.388927,0.998184,0.378946
2,0.366591,0.275695,0.063553,0.966171,0.442205,0.969089,0.509688,0.540241,0.441256,0.164225,0.070570
3,0.475604,0.490168,0.755998,0.477857,0.371955,0.947405,0.280805,0.872361,0.513712,0.570384,0.990165
4,0.034529,0.669413,0.480047,0.011377,0.747641,0.272674,0.322530,0.330088,0.929216,0.492997,0.496907
...,...,...,...,...,...,...,...,...,...,...,...
5891,0.841416,0.832933,0.144299,0.092632,0.860756,0.797975,0.407141,0.819184,0.808753,0.693338,0.253581
5892,0.634844,0.643848,0.698586,0.211566,0.791034,0.462967,0.498234,0.265715,0.171268,0.524664,0.046151
5893,0.161446,0.419693,0.310739,0.977375,0.632457,0.645635,0.952371,0.000913,0.391865,0.986964,0.953342
5894,0.630445,0.798230,0.842443,0.188696,0.407885,0.308575,0.523217,0.240382,0.564827,0.343042,0.005972


In [6]:
data_train.shape

(12000, 12)

In [7]:
# Is there any missing (NaN) cell anywhere in the training table?
data_train.isna().to_numpy().any()

False

# Calculate fingerprints of data_train and smiles_test

In [8]:
# Morgan fingerprints (radius 3, folded to fp_length bits) for data_train:
# one row of `fps` per training molecule.

fp_length = 420
fps = np.zeros((len(data_train), fp_length))

for i, smiles in enumerate(tqdm(data_train['smiles'])):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail with a
        # message that names the offending row instead of an opaque RDKit error.
        raise ValueError(f"data_train row {i}: RDKit could not parse SMILES {smiles!r}")
    fp_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=fp_length)
    # Placeholder array; ConvertToNumpyArray resizes it to the bit-vector length.
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp_vec, arr)
    fps[i] = arr

100%|██████████████████████████████████████████████████████████████████████████| 12000/12000 [00:04<00:00, 2649.07it/s]


In [9]:
# Morgan fingerprints (radius 3, folded to fp_length bits) for smiles_test:
# one row of `fps_test` per test molecule.

fp_length = 420
fps_test = np.zeros((len(smiles_test), fp_length))

for i, smiles in enumerate(tqdm(smiles_test['smiles'])):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail with a
        # message that names the offending row instead of an opaque RDKit error.
        raise ValueError(f"smiles_test row {i}: RDKit could not parse SMILES {smiles!r}")
    fp_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=fp_length)
    # Placeholder array; ConvertToNumpyArray resizes it to the bit-vector length.
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp_vec, arr)
    fps_test[i] = arr

100%|████████████████████████████████████████████████████████████████████████████| 5896/5896 [00:02<00:00, 2688.59it/s]


# looking into resulting arrays

In [10]:
pd.DataFrame(fps)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,410,411,412,413,414,415,416,417,418,419
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11996,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
11997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
11998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
fps.shape

(12000, 420)

In [12]:
pd.DataFrame(fps_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,410,411,412,413,414,415,416,417,418,419
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5891,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5892,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
fps_test.shape

(5896, 420)

# Extract the training labels

In [14]:
# Label matrix: every column after the first one of data_train, as a NumPy array.
y = data_train.iloc[:, 1:].to_numpy()
y.shape

(12000, 11)

In [15]:
pd.DataFrame(y)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0,0,0,0,0,0,0,-1,0,0,0
1,0,0,0,0,0,0,0,-1,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,-1,0,0,1
4,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
11995,0,0,0,0,0,0,0,-1,0,0,0
11996,0,0,0,0,0,0,0,-1,0,0,0
11997,0,0,0,-1,0,0,0,0,0,0,0
11998,0,0,0,0,0,0,-1,0,0,0,0


# how balanced is the dataset

In [16]:
# Tally how often each label value occurs across the whole label matrix.
unique, counts = np.unique(y, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{-1: 15179, 0: 113123, 1: 3698}

In [17]:
# could need some oversampling or similar technique

# Train-Test-Split

In [18]:
def split_data(X, y, test_size=0.2, random_state=120):
    """Split features and labels into a training part and a hold-out part.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Labels, row-aligned with ``X``.
    test_size : float or int, default=0.2
        Size of the hold-out part, forwarded to ``train_test_split``.
    random_state : int or None, default=120
        Seed forwarded to ``train_test_split``. The fixed default makes the
        split identical on every call and every notebook re-run; pass ``None``
        to get a fresh random split each time.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

# Random Forest

In [19]:
def train_rf(X_train, y_train, X_test, n_estimators=100, seed=120):
    """Fit one random forest per task and predict every task for ``X_test``.

    Parameters
    ----------
    X_train : ndarray, shape (n_samples, n_features)
        Training features.
    y_train : ndarray, shape (n_samples, n_tasks)
        Training labels, one column per task. Rows whose label is 0 for a task
        are treated as unknown and left out when fitting that task's model.
    X_test : ndarray, shape (n_test, n_features)
        Samples to predict.
    n_estimators : int, default=100
        Number of trees per forest.
    seed : int, default=120
        ``random_state`` of every forest, so repeated runs give identical models.

    Returns
    -------
    y_hats_proba : ndarray, shape (n_test, n_tasks)
        Predicted probability of class ``1`` per task.
    y_hats_class : ndarray, shape (n_test, n_tasks)
        Predicted class label per task.
    """
    n_tasks = y_train.shape[1]
    y_hats_proba = np.empty((X_test.shape[0], n_tasks))
    y_hats_class = np.empty_like(y_hats_proba)

    # Train one RF per task
    for j in tqdm(range(n_tasks)):
        rf_model = RandomForestClassifier(n_estimators=n_estimators, random_state=seed)
        # Mask out unknown samples (label 0) for this task
        idx = (y_train[:, j] != 0)
        # Train model on the labelled rows only
        rf_model.fit(X_train[idx], y_train[idx, j])
        # Find the positive-class column through classes_ rather than assuming it
        # sits at index 1: if only one class is left after masking,
        # predict_proba has a single column and a hard-coded [:, 1] would fail.
        proba = rf_model.predict_proba(X_test)
        positive_col = np.flatnonzero(rf_model.classes_ == 1)
        if positive_col.size:
            y_hats_proba[:, j] = proba[:, positive_col[0]]
        else:
            # The model never saw a positive example, so it cannot predict class 1.
            y_hats_proba[:, j] = 0.0
        # Predict class
        y_hats_class[:, j] = rf_model.predict(X_test)
    return y_hats_proba, y_hats_class

In [20]:
# Split the training data, fit the per-task random forests on the training
# part and predict the challenge test fingerprints (fps_test).
# NOTE(review): X_test / y_test are not read by any visible cell, so the
# hold-out part is excluded from training but never scored locally — confirm
# whether a local validation was intended.
X_train, X_test, y_train, y_test = split_data(fps,y)

y_hats_proba_rf, y_hats_class_rf = train_rf(X_train, y_train, fps_test)

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:06<00:00,  1.58it/s]


# look into predictions 

In [21]:
# Hard class predictions of the random forest as a DataFrame, one column per task.
y_hats_class_rf_df = pd.DataFrame(y_hats_class_rf)
y_hats_class_rf_df 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
1,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
3,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
4,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...
5891,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
5892,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
5893,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
5894,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0


In [22]:
# Random-forest positive-class probabilities as a DataFrame, one column per task.
# NOTE(review): presumably named "second best" after the challenge-server
# ranking of this submission — confirm.
second_best_score = pd.DataFrame(y_hats_proba_rf)
second_best_score 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.140000,0.5500,0.61,0.06,0.180000,0.010,0.06,0.00,0.19,0.960,0.026667
1,0.523333,0.5000,0.50,0.25,0.320000,0.140,0.18,0.07,0.50,0.615,0.190000
2,0.140000,0.4300,0.47,0.02,0.120000,0.030,0.00,0.01,0.34,0.930,0.010000
3,0.290000,0.5400,0.78,0.13,0.200000,0.010,0.09,0.00,0.22,0.810,0.060000
4,0.332500,0.5900,0.42,0.16,0.280000,0.060,0.10,0.00,0.51,0.970,0.073000
...,...,...,...,...,...,...,...,...,...,...,...
5891,0.427000,0.4000,0.66,0.15,0.203333,0.035,0.18,0.01,0.36,0.610,0.080000
5892,0.290000,0.5500,0.73,0.08,0.190000,0.030,0.10,0.00,0.24,0.810,0.030000
5893,0.310000,0.5200,0.64,0.14,0.180000,0.080,0.16,0.02,0.32,0.610,0.140000
5894,0.940000,0.4800,0.64,0.20,0.300000,0.135,0.21,0.00,0.44,0.575,0.130000


In [23]:
# Tally how often each class was predicted by the random forest over all tasks.
unique, counts = np.unique(y_hats_class_rf, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{-1.0: 50718, 1.0: 14138}

# AUC from challenge server

In [24]:
# Hard-coded AUC values for the random-forest submission: the mean plus one
# entry per task, each wrapped in a list so the frame gets exactly one row.
info = {
    'mean': [0.726],
    'Task1': [0.896],
    'Task2': [0.581],
    'Task3': [0.635],
    'Task4': [0.682],
    'Task5': [0.596],
    'Task6': [0.517],
    'Task7': [0.803],
    'Task8': [0.705],
    'Task9': [0.878],
    'Task10': [0.905],
    'Task11': [0.791],
}
AUC = pd.DataFrame(info)
AUC


Unnamed: 0,mean,Task1,Task2,Task3,Task4,Task5,Task6,Task7,Task8,Task9,Task10,Task11
0,0.726,0.896,0.581,0.635,0.682,0.596,0.517,0.803,0.705,0.878,0.905,0.791


# get csv-file of proba

In [25]:
# Write the RF probabilities (row index included) as rf_proba.csv inside rf_proba.zip.
compression_opts = {'method': 'zip', 'archive_name': 'rf_proba.csv'}
second_best_score.to_csv('rf_proba.zip', index=True, compression=compression_opts)

In [26]:
# Up to here I could be pleased with the results,
# but let's try logistic regression to see whether it outperforms the random forest on at least some of the tasks.

# Logistic Regression

In [27]:
def train_logreg(X_train, y_train, X_test, cv=10, max_iter=1000):
    """Fit one cross-validated logistic regression per task and predict ``X_test``.

    Parameters
    ----------
    X_train : ndarray, shape (n_samples, n_features)
        Training features.
    y_train : ndarray, shape (n_samples, n_tasks)
        Training labels, one column per task. Rows whose label is 0 for a task
        are treated as unknown and left out when fitting that task's model.
    X_test : ndarray, shape (n_test, n_features)
        Samples to predict.
    cv : int, default=10
        Number of cross-validation folds used by ``LogisticRegressionCV``.
    max_iter : int, default=1000
        Iteration cap of the ``lbfgs`` solver.

    Returns
    -------
    y_hats_proba : ndarray, shape (n_test, n_tasks)
        Predicted probability of class ``1`` per task.
    y_hats_class : ndarray, shape (n_test, n_tasks)
        Predicted class label per task.
    """
    n_tasks = y_train.shape[1]
    y_hats_proba = np.empty((X_test.shape[0], n_tasks))
    y_hats_class = np.empty_like(y_hats_proba)

    # Train one logistic regression per task
    for j in tqdm(range(n_tasks)):
        logreg = LogisticRegressionCV(cv=cv, solver='lbfgs', max_iter=max_iter)
        # Mask out unknown samples (label 0) for this task
        idx = (y_train[:, j] != 0)
        # Train model on the labelled rows only
        logreg.fit(X_train[idx], y_train[idx, j])
        # Find the positive-class column through classes_ rather than assuming
        # it sits at index 1, so the lookup stays correct whatever the class order.
        proba = logreg.predict_proba(X_test)
        positive_col = np.flatnonzero(logreg.classes_ == 1)
        if positive_col.size:
            y_hats_proba[:, j] = proba[:, positive_col[0]]
        else:
            # The model never saw a positive example, so it cannot predict class 1.
            y_hats_proba[:, j] = 0.0
        # Predict class
        y_hats_class[:, j] = logreg.predict(X_test)
    return y_hats_proba, y_hats_class

In [28]:
# Split the training data again, fit the per-task logistic regressions on the
# training part and predict the challenge test fingerprints (fps_test).
# NOTE(review): this overwrites the X_train / y_train used for the random
# forest; unless split_data is seeded, the two models are trained on different
# random subsets. X_test / y_test are again not read by any visible cell.
X_train, X_test, y_train, y_test = split_data(fps,y)

y_hats_proba_lr, y_hats_class_lr = train_logreg(X_train, y_train, fps_test)

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:51<00:00,  4.64s/it]


# look into predictions

In [29]:
# Hard class predictions of the logistic regression as a DataFrame, one column per task.
y_hats_class_lr_df = pd.DataFrame(y_hats_class_lr)
y_hats_class_lr_df 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
1,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
2,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
3,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
4,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...
5891,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5892,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
5893,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5894,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0


In [30]:
# Logistic-regression positive-class probabilities as a DataFrame, one column per task.
# NOTE(review): presumably named "best" after the challenge-server ranking of
# this submission — confirm.
best_score = pd.DataFrame(y_hats_proba_lr)
best_score

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.065264,0.408691,0.665734,0.121830,0.186422,0.060957,0.036447,0.013194,0.191059,0.851552,0.074435
1,0.618457,0.267257,0.669047,0.061802,0.187341,0.061044,0.055853,0.013259,0.493655,0.652518,0.075079
2,0.093052,0.093584,0.664795,0.010973,0.186783,0.061163,0.055384,0.013173,0.211630,0.959022,0.074167
3,0.199352,0.834065,0.669326,0.003915,0.186049,0.061083,0.027471,0.013180,0.137831,0.769160,0.074852
4,0.441117,0.924269,0.665680,0.030335,0.186129,0.061224,0.112634,0.013177,0.273222,0.984535,0.074818
...,...,...,...,...,...,...,...,...,...,...,...
5891,0.184585,0.770091,0.666737,0.100762,0.186766,0.061090,0.109496,0.013128,0.475795,0.370841,0.074491
5892,0.086861,0.598885,0.667486,0.025080,0.186530,0.061058,0.063837,0.013129,0.138365,0.814694,0.074392
5893,0.166643,0.033559,0.668321,0.014232,0.185899,0.060982,0.063631,0.013173,0.111477,0.313672,0.074610
5894,0.920875,0.055271,0.670759,0.035000,0.186249,0.060939,0.026464,0.013179,0.449643,0.504185,0.075070


In [31]:
# Tally how often each class was predicted by the logistic regression over all tasks.
unique, counts = np.unique(y_hats_class_lr, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{-1.0: 49525, 1.0: 15331}

# AUC Score from challenge server

In [32]:
# Hard-coded AUC values for the logistic-regression submission: the mean plus
# one entry per task, each wrapped in a list so the frame gets exactly one row.
info_lr = {
    'mean': [0.73],
    'Task1': [0.889],
    'Task2': [0.587],
    'Task3': [0.638],
    'Task4': [0.693],
    'Task5': [0.69],
    'Task6': [0.663],
    'Task7': [0.796],
    'Task8': [0.613],
    'Task9': [0.872],
    'Task10': [0.901],
    'Task11': [0.684],
}
AUC_lr = pd.DataFrame(info_lr)
AUC_lr


Unnamed: 0,mean,Task1,Task2,Task3,Task4,Task5,Task6,Task7,Task8,Task9,Task10,Task11
0,0.73,0.889,0.587,0.638,0.693,0.69,0.663,0.796,0.613,0.872,0.901,0.684


# get csv-file of proba

In [33]:
# Write the LogReg probabilities (row index included) as LogReg_proba.csv inside LogReg_proba.zip.
compression_opts = {'method': 'zip', 'archive_name': 'LogReg_proba.csv'}
best_score.to_csv('LogReg_proba.zip', index=True, compression=compression_opts)

# Result

In [34]:
# it seems that LogReg performs better for Tasks 2,3,4,5 and 6
# while RF performs better for Tasks 1,7,8,9,10,11

#so let's combine the corresponding probas to a new solution array

In [35]:
# Tasks 2-6 (where LogReg scored better) sit at zero-based column positions
# 1-5; those are the columns replaced in the next cell.

# Start from independent copies so the original RF / LogReg frames stay untouched.
final_result = second_best_score.copy()
Log_reg_proba = best_score.copy()

In [36]:
# Overwrite the columns at positions 1..5 of the RF-based frame with the
# logistic-regression probabilities; every other column keeps its RF values.
for col_pos in range(1, 6):
    final_result[final_result.columns[col_pos]] = Log_reg_proba[Log_reg_proba.columns[col_pos]]

In [37]:
# Element-wise check: True where the blended frame still holds the RF value.
final_result == second_best_score


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,True,False,False,False,False,False,True,True,True,True,True
1,True,False,False,False,False,False,True,True,True,True,True
2,True,False,False,False,False,False,True,True,True,True,True
3,True,False,False,False,False,False,True,True,True,True,True
4,True,False,False,False,False,False,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...
5891,True,False,False,False,False,False,True,True,True,True,True
5892,True,False,False,False,False,False,True,True,True,True,True
5893,True,False,False,False,False,False,True,True,True,True,True
5894,True,False,False,False,False,False,True,True,True,True,True


In [38]:
# After evaluation on the challenge server, final_result deserves a more fitting name
# (plain assignment: sad_try is a second name for the same DataFrame, not a copy):

sad_try = final_result 

# Its scores aren't as expected:
# it's not the best of both worlds.

# get csv file of final proba

In [39]:
# Write the blended probabilities (row index included) as final_proba.csv inside final_proba.zip.
compression_opts = {'method': 'zip', 'archive_name': 'final_proba.csv'}
final_result.to_csv('final_proba.zip', index=True, compression=compression_opts)