In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

### Loading the datasets

In [2]:
x = (
pd.read_csv("../Data/offsides&sider_descriptors.csv")
)
y = (
pd.read_csv("../Data/offsides&sider_labels.csv")
)

In [3]:
x.head()

Unnamed: 0,drug_rxnorn_id,CanonicalSMILES,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,1000082.0,CS(=O)(=O)N(CC(=O)NC1=CC=C(C=C1)OC2=CC=CC=C2)C...,1.68,1087.179,20.855,15.579,16.396,13.826,8.678,10.527,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1000492.0,CC1=CC=C(C=C1)N2C(=NNC2=S)C(C)N3C(=C(C(=N3)C)[...,2.124,998.799,18.301,14.348,15.164,11.807,7.754,8.162,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1001.0,C1=CC=C(C=C1)CCN,2.615,157.478,6.527,5.378,5.378,4.432,3.172,3.172,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100213.0,CCC1(C(=O)NC(=NC1=O)S)C2=CC=C(C=C2)C,2.576,536.427,13.336,10.227,11.122,8.502,5.711,6.158,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10030.0,C#CCCCCCCCF,2.723,91.392,7.657,6.405,6.405,4.914,3.909,3.909,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0


In [4]:
y.head()

Unnamed: 0,drug_rxnorn_id,CanonicalSMILES,Blood and lymphatic system disorders,Cardiac disorders,"Congenital, familial and genetic disorders",Ear and labyrinth disorders,Endocrine disorders,Eye disorders,Gastrointestinal disorders,General disorders and administration site conditions,...,"Pregnancy, puerperium and perinatal conditions",Product issues,Psychiatric disorders,Renal and urinary disorders,Reproductive system and breast disorders,"Respiratory, thoracic and mediastinal disorders",Skin and subcutaneous tissue disorders,Social circumstances,Surgical and medical procedures,Vascular disorders
0,1000082.0,CS(=O)(=O)N(CC(=O)NC1=CC=C(C=C1)OC2=CC=CC=C2)C...,1,1,0,1,0,1,1,1,...,1,1,1,0,0,1,1,1,1,1
1,1000492.0,CC1=CC=C(C=C1)N2C(=NNC2=S)C(C)N3C(=C(C(=N3)C)[...,1,1,0,0,1,1,1,1,...,0,1,1,1,1,1,1,1,1,1
2,1001.0,C1=CC=C(C=C1)CCN,1,1,1,1,1,1,1,1,...,0,1,1,1,0,1,1,1,1,1
3,100213.0,CCC1(C(=O)NC(=NC1=O)S)C2=CC=C(C=C2)C,1,1,0,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,10030.0,C#CCCCCCCCF,1,1,0,1,0,0,1,1,...,0,1,1,1,0,1,1,0,1,1


### Data pre-processing

In [5]:
# Obsolete classes
obsolete_Classes = [
    'Investigations',
    'Social circumstances',
    'Product issues'
]

In [6]:
# Removing rows with null values, and obsolete classes
nuls = list(x[x.isna().T.any()]['drug_rxnorn_id'])
x = x.loc[~x['drug_rxnorn_id'].isin(nuls)]
y = y.loc[~y['drug_rxnorn_id'].isin(nuls)]
x.drop(['drug_rxnorn_id', 'CanonicalSMILES'], axis=1, inplace=True)
y.drop(['drug_rxnorn_id', 'CanonicalSMILES'], axis=1, inplace=True)
y = y.drop(obsolete_Classes, axis = 1)
x.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

In [7]:
# Data split
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=88)
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

In [8]:
def cv_modeling(X,Y):
    '''
    Both X and Y are pandas DataFrames
    '''
    
    clf = MultiOutputClassifier(XGBClassifier(eval_metric = 'logloss', use_label_encoder=False, random_state = 42))

    numFolds = 5
    folds = KFold(n_splits = numFolds, shuffle = True)
    results = np.zeros(shape=(len(X), len(Y.columns)))
    score = 0.0
    
    for train_index, test_index in folds.split(X):
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
        clf.fit(X_train, y_train)

        results[test_index] = clf.predict(X_test)
        score += f1_score(y_test, results[test_index], average='micro')

    score /= numFolds
    
    return clf, score

In [9]:
model, result = cv_modeling(X_train, y_train)
result

0.8179972675713897

In [10]:
yhat = model.predict(X_test)
report = pd.DataFrame(classification_report(y_test, yhat, output_dict=True, target_names=list(y.columns))).transpose()

In [11]:
report

Unnamed: 0,precision,recall,f1-score,support
Blood and lymphatic system disorders,0.76066,0.911038,0.829085,607.0
Cardiac disorders,0.830931,0.960606,0.891075,660.0
"Congenital, familial and genetic disorders",0.430303,0.269962,0.331776,263.0
Ear and labyrinth disorders,0.580786,0.600451,0.590455,443.0
Endocrine disorders,0.513433,0.457447,0.483826,376.0
Eye disorders,0.680428,0.816514,0.742285,545.0
Gastrointestinal disorders,0.925505,0.991881,0.957544,739.0
General disorders and administration site conditions,0.949686,0.989515,0.969191,763.0
Hepatobiliary disorders,0.701095,0.835821,0.762553,536.0
Immune system disorders,0.798376,0.919003,0.854453,642.0


In [12]:
# Saving the model
import pickle
file_name = "xgb_drugs.pkl"

pickle.dump(model, open(file_name, "wb"))