In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Imputer
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
%matplotlib inline 

In [None]:
# Read the training and test tables from CSV files in the working directory.
Train, Test = (
    pd.read_csv(csv_name)
    for csv_name in ('trainDF_rdkit.csv', 'testDF_rdkit.csv')
)

In [None]:
# Split each table column-wise: the trailing 12 columns become the label
# frame (one column per target), everything before them the feature frame.
X_train, y_train = Train.iloc[:, :-12], Train.iloc[:, -12:]
X_test, y_test = Test.iloc[:, :-12], Test.iloc[:, -12:]

In [None]:
# Seed shared by every stochastic estimator so that repeated runs of this
# cell produce the same cross-validation scores.
RANDOM_STATE = 42

# Metric used for cross-validation; also shown on the plot's y-axis.
scoring = 'f1'

# The splitter carries no state between calls (no shuffling), so one instance
# can be reused for every model and every target.
kfold = StratifiedKFold(n_splits=5)

# For every target column: keep the rows that have a label, impute and scale
# the features, then compare a set of classifiers with 5-fold stratified
# cross-validation and draw a boxplot of the per-fold scores.
for target in y_train.columns:
    print(target)

    # Drop the rows whose label for this target is missing.
    iy_train = y_train[target].dropna()
    iy_test = y_test[target].dropna()

    # Select the matching feature rows by index label (the surviving labels
    # of the Series), then renumber the rows from 0.
    iX_train = X_train.loc[iy_train.index, :].reset_index(drop=True)
    iX_test = X_test.loc[iy_test.index, :].reset_index(drop=True)

    # Mean-impute missing feature values. The column means are learned on the
    # training rows only and then applied to the test rows, so that no
    # test-set statistics leak into preprocessing.
    # NOTE(review): `Imputer` was removed in scikit-learn 0.22; newer versions
    # need `sklearn.impute.SimpleImputer` (missing_values=np.nan, no `axis`).
    imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    iX_train = pd.DataFrame(imputer.fit_transform(iX_train), columns=iX_train.columns)
    iX_test = pd.DataFrame(imputer.transform(iX_test), columns=iX_test.columns)

    # Rescale features with the min/max seen on the training rows.
    scaler = MinMaxScaler()
    iX_train = pd.DataFrame(scaler.fit_transform(iX_train), columns=iX_train.columns)
    iX_test = pd.DataFrame(scaler.transform(iX_test), columns=iX_test.columns)

    # Renumber the labels as well so features and labels share the same index.
    X_train_, X_test_ = iX_train, iX_test
    y_train_ = iy_train.reset_index(drop=True)
    y_test_ = iy_test.reset_index(drop=True)

    # Fresh, default-configured estimators for this target.
    models = [
        ('LR', LogisticRegression(random_state=RANDOM_STATE)),
        ('RF', RandomForestClassifier(random_state=RANDOM_STATE)),
        ('KNN', KNeighborsClassifier()),
        ('CART', DecisionTreeClassifier(random_state=RANDOM_STATE)),
        ('NB', GaussianNB()),
        ('SVM', LinearSVC(random_state=RANDOM_STATE)),
        ('DNN', MLPClassifier(random_state=RANDOM_STATE)),
    ]

    # Evaluate each model in turn on the training rows only.
    results = []
    names = []
    for name, model in models:
        cv_results = cross_val_score(model, X_train_, y_train_, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)

    # Boxplot of the per-fold scores, one box per algorithm. Labels are set on
    # the same axes the boxes are drawn on, and the y-label reports the metric
    # that was actually used.
    fig, ax = plt.subplots(figsize=(10, 10))
    fig.suptitle('Algorithm Comparison')
    ax.boxplot(results)
    ax.set_xticklabels(names)
    ax.set_xlabel("Algorithms")
    ax.set_ylabel("CV Score (%s)" % scoring)
    plt.show()