In [2]:
import pandas as pd
import numpy as np
from rdkit.Chem import AllChem, Descriptors
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, StratifiedKFold, cross_validate
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
import matplotlib.pyplot as plt
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from boruta import BorutaPy 

In [1]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2022.9.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.2/29.2 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: rdkit
Successfully installed rdkit-2022.9.5
[0m

In [3]:
def generate_dataframe(y_pred, num, test_csv='/kaggle/input/dataset/test_II.csv'):
    """Write predictions to ``submission{num}.csv`` with ``Id`` and ``Predicted`` columns.

    Parameters
    ----------
    y_pred : array-like
        One predicted label per row of the test file, in the same order.
    num : int or str
        Suffix used in the output file name (``submission{num}.csv``).
    test_csv : str, optional
        CSV file whose ``x`` column supplies the ``Id`` values. Defaults to the
        path used throughout this notebook.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of ids. Without
        this check a mismatch would silently produce missing or truncated ids.
    """
    ids = pd.read_csv(test_csv)['x']
    df = pd.DataFrame(y_pred, columns=['Predicted'])
    if len(df) != len(ids):
        raise ValueError(
            f'got {len(df)} predictions for {len(ids)} ids read from {test_csv!r}'
        )
    # Insert by position (plain array) so no index alignment can reorder or drop ids.
    df.insert(0, 'Id', ids.to_numpy())
    df.to_csv(f'submission{num}.csv', index=False)

In [4]:
# Load the feature-ranking table; later cells read its 'Features' and 'Rankings' columns.
feature_ranks = pd.read_csv('/kaggle/input/feature-selection/feature_rankings.csv')

In [5]:
# Remove the 'Unnamed: 0' column. The drop is non-mutating and uses errors='ignore'
# so re-running this cell after the column is gone does not raise a KeyError.
feature_ranks = feature_ranks.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Read the training descriptors, discard the 'Unnamed: 0' column and replace
# every missing value with 0.
train_dataset = (
    pd.read_csv('/kaggle/input/train-feature-descriptors/train_feature_desc.csv')
    .drop(columns='Unnamed: 0')
    .fillna(0)
)

In [11]:
train_dataset

Unnamed: 0,Assay ID,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,Expected
0,1644,9.316200,-1.533785,9.316200,0.150485,0.794714,317.599,306.511,315.982463,100,...,0,0,0,0,0,0,0,0,0,2
1,2451,10.532611,0.333788,10.532611,0.333788,0.516641,156.269,136.109,156.151415,66,...,0,0,0,0,0,0,0,4,0,2
2,1384,2.433032,0.000000,2.433032,0.000000,0.251327,362.086,313.702,361.347528,148,...,0,0,0,0,0,0,0,12,0,2
3,16,10.355080,-0.613825,10.355080,0.282361,0.487998,255.665,245.585,255.052302,90,...,0,0,0,0,0,0,0,0,0,2
4,1856,0.000000,0.000000,0.000000,0.000000,0.237972,149.894,149.894,149.894242,8,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75378,33,11.460021,-3.868472,11.460021,0.053611,0.712426,230.245,220.165,230.036128,82,...,1,0,0,0,0,0,0,0,0,2
75379,1632,5.928972,-2.841623,5.928972,0.082346,0.720533,313.747,296.611,313.041677,104,...,0,0,0,0,0,0,0,0,0,1
75380,1373,4.975926,0.848333,4.975926,0.848333,0.596343,167.258,162.218,166.986341,50,...,0,0,0,0,1,0,0,0,0,1
75381,2,10.241948,0.324028,10.241948,0.324028,0.519485,128.215,112.087,128.120115,54,...,0,0,0,0,0,0,0,0,0,2


In [7]:
# Keep only the training columns whose feature ranking is 1 or 2.
selected_train_df = train_dataset.loc[
    :, feature_ranks.loc[feature_ranks['Rankings'].isin([1, 2]), 'Features']
]

In [8]:
# Lift pandas' row limit so frames displayed afterwards are shown in full.
pd.set_option('display.max_rows', None)

In [14]:
feature_ranks

Unnamed: 0,Features,Rankings
0,Assay ID,1
1,PEOE_VSA14,1
2,PEOE_VSA3,1
3,SMR_VSA6,1
4,SMR_VSA7,1
5,SlogP_VSA10,1
6,SlogP_VSA5,1
7,VSA_EState10,1
8,VSA_EState3,1
9,VSA_EState6,1


In [9]:
# Read the test descriptors, discard the 'Unnamed: 0' column and replace
# every missing value with 0.
test_dataset = (
    pd.read_csv('/kaggle/input/train-feature-descriptors/test_feature_desc.csv')
    .drop(columns='Unnamed: 0')
    .fillna(0)
)

In [10]:
# Keep only the test columns whose feature ranking is 1 or 2.
selected_test_df = test_dataset.loc[
    :, feature_ranks.loc[feature_ranks['Rankings'].isin([1, 2]), 'Features']
]

In [11]:
# Target column for the rank-1/2 feature subset.
selected_train_y = train_dataset['Expected']

In [None]:
def xgb46():
    """Cross-validate and fit a six-model soft-voting XGBoost ensemble, then write submission 46.

    Reads the notebook-level ``selected_train_df``, ``selected_train_y`` and
    ``selected_test_df``. Prints the mean F1 over a shuffled 5-fold stratified
    split, refits the ensemble on the full training set and passes the test
    predictions to ``generate_dataframe``.
    """
    # The LabelEncoder that used to be created here was removed: its encoded
    # output was never used. VotingClassifier encodes the class labels itself
    # during fit and decodes them again in predict.

    # Keyword arguments common to every member of the ensemble.
    shared = dict(tree_method='hist', objective='binary:logistic')
    # One entry per member, in voting order (xgb1 ... xgb6).
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, nthread=4, seed=42),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, max_depth=10),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, max_depth=10),
        dict(learning_rate=0.2, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(max_bin=255, n_estimators=400, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
    ]
    members = [
        (f'xgb{rank}', XGBClassifier(**shared, **params))
        for rank, params in enumerate(member_params, start=1)
    ]
    voting_clf = VotingClassifier(
        estimators=members,
        voting='soft',
        # The first two members carry twice the weight of the others.
        weights=[2, 2, 1, 1, 1, 1],
        n_jobs=-1,
        verbose=True,
    )

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    n_scores = cross_validate(voting_clf, selected_train_df, selected_train_y, cv=kfold, scoring='f1', n_jobs=-1)
    print(np.mean(n_scores['test_score']))

    model = voting_clf.fit(selected_train_df, selected_train_y)
    test_pred_y = model.predict(selected_test_df)
    generate_dataframe(test_pred_y, 46)

In [29]:
def xgb47():
    """Cross-validate and fit a nine-model soft-voting XGBoost ensemble, then write submission 47.

    Reads the notebook-level ``selected_train_df``, ``selected_train_y`` and
    ``selected_test_df``. Prints the mean F1 over a shuffled 5-fold stratified
    split, refits on the full training set with label-encoded targets and
    passes the decoded test predictions to ``generate_dataframe``.
    """
    encoder = LabelEncoder()
    encoded_targets = encoder.fit_transform(selected_train_y)

    # Keyword arguments common to every member of the ensemble.
    shared = dict(tree_method='gpu_hist', objective='binary:logistic')
    # One entry per member, in voting order (xgb1 ... xgb9).
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, nthread=4, seed=42),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, max_depth=10),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, max_depth=10),
        dict(learning_rate=0.2, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(max_bin=255, n_estimators=400, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=450, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=1000, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=500, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
    ]
    members = [
        (f'xgb{rank}', XGBClassifier(**shared, **params))
        for rank, params in enumerate(member_params, start=1)
    ]
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        # The first two members carry twice the weight of the others.
        weights=[2, 2] + [1] * (len(members) - 2),
        n_jobs=-1,
        verbose=True,
    )

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    # NOTE(review): the CV score is computed on the raw labels while the final
    # fit below uses the encoded ones — confirm this is intended.
    cv_results = cross_validate(ensemble, selected_train_df, selected_train_y, cv=folds, scoring='f1', n_jobs=-1)
    print(np.mean(cv_results['test_score']))

    fitted = ensemble.fit(selected_train_df, encoded_targets)
    encoded_predictions = fitted.predict(selected_test_df)
    generate_dataframe(encoder.inverse_transform(encoded_predictions), 47)

In [12]:
# Training columns for every feature ranked 50 or better (109 features per the original note).
train_df2 = train_dataset.loc[
    :, feature_ranks.loc[feature_ranks['Rankings'].le(50), 'Features']
]

In [64]:
train_df2.head()

Unnamed: 0,Assay ID,PEOE_VSA14,PEOE_VSA3,SMR_VSA6,SMR_VSA7,SlogP_VSA10,SlogP_VSA5,VSA_EState10,VSA_EState3,VSA_EState6,...,fr_Al_OH,fr_imide,NumAromaticHeterocycles,FractionCSP3,fr_SH,NumAliphaticRings,SMR_VSA4,NumSaturatedRings,fr_piperdine,PEOE_VSA7
0,1644,0.0,0.0,0.0,59.65784,0.0,17.044809,18.161033,18.6324,12.975791,...,0,0,0,0.142857,0,0,0.0,0,0,35.392371
1,2451,0.0,0.0,0.0,0.0,0.0,58.793226,0.0,0.0,0.0,...,0,0,0,0.9,0,0,0.0,0,0,13.344559
2,1384,0.0,0.0,27.184857,0.0,0.0,116.58062,0.0,0.0,0.0,...,0,0,0,1.0,0,0,0.0,0,0,25.683286
3,16,0.0,20.090702,13.089513,39.160457,0.0,5.563451,5.675726,10.164847,3.516012,...,0,0,1,0.333333,0,1,4.992405,0,0,11.629819
4,1856,29.557286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0.0,0,0,0.0,0,0,0.0


In [15]:
# Test columns for every feature ranked 50 or better.
test_df2 = test_dataset.loc[
    :, feature_ranks.loc[feature_ranks['Rankings'].le(50), 'Features']
]

In [67]:
test_df2.head()

Unnamed: 0,Assay ID,PEOE_VSA14,PEOE_VSA3,SMR_VSA6,SMR_VSA7,SlogP_VSA10,SlogP_VSA5,VSA_EState10,VSA_EState3,VSA_EState6,...,fr_Al_OH,fr_imide,NumAromaticHeterocycles,FractionCSP3,fr_SH,NumAliphaticRings,SMR_VSA4,NumSaturatedRings,fr_piperdine,PEOE_VSA7
0,1682,0.0,0.0,0.0,29.326004,0.0,31.898115,0.0,9.626968,5.824954,...,0,0,0,0.454545,0,0,0.0,0,0,29.531998
1,1656,6.031115,22.918408,25.289237,24.395945,10.742876,6.923737,0.0,1.305518,2.474888,...,0,0,2,0.285714,0,0,0.0,0,0,12.132734
2,36,0.0,26.9047,26.681941,94.272053,8.78083,72.733437,6.27236,-0.146035,18.366259,...,0,0,1,0.444444,0,3,0.0,3,2,112.838561
3,1850,51.384535,8.417797,0.0,16.55728,0.0,6.923737,0.0,0.0,0.0,...,0,0,0,0.25,0,1,0.0,0,0,6.923737
4,30,11.938611,0.0,0.0,23.801165,0.0,73.143616,0.0,9.917687,0.0,...,1,0,0,0.76,0,3,29.086615,1,0,62.857584


In [16]:
# Target column paired with train_df2.
train_y_df2 = train_dataset['Expected']

In [72]:
train_y_df2.head()

0    2
1    2
2    2
3    2
4    2
Name: Expected, dtype: int64

In [75]:
def xgb48():
    """Fit a nine-model soft-voting XGBoost ensemble on ``train_df2`` and write submission 48.

    Reads the notebook-level ``train_df2``, ``train_y_df2`` and ``test_df2``.
    Targets are label-encoded before fitting and the test predictions are
    decoded again before being passed to ``generate_dataframe``.
    """
    encoder = LabelEncoder()
    encoded_targets = encoder.fit_transform(train_y_df2)

    # Keyword arguments common to every member of the ensemble.
    shared = dict(tree_method='gpu_hist', objective='binary:logistic')
    # One entry per member, in voting order (xgb1 ... xgb9).
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, nthread=4, seed=42),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, max_depth=10),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, max_depth=10),
        dict(learning_rate=0.2, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(max_bin=255, n_estimators=400, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=450, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=1000, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=500, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
    ]
    members = [
        (f'xgb{rank}', XGBClassifier(**shared, **params))
        for rank, params in enumerate(member_params, start=1)
    ]
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        # The first two members carry twice the weight of the others.
        weights=[2, 2] + [1] * (len(members) - 2),
        n_jobs=-1,
        verbose=True,
    )

    # Cross-validation is disabled for this run; the check that was used:
    #     folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    #     cv_results = cross_validate(ensemble, train_df2, encoded_targets, cv=folds, scoring='f1', n_jobs=-1)
    #     print(np.mean(cv_results['test_score']))

    fitted = ensemble.fit(train_df2, encoded_targets)
    encoded_predictions = fitted.predict(test_df2)
    generate_dataframe(encoder.inverse_transform(encoded_predictions), 48)

In [79]:
def xgb49():
    """Fit a ten-model soft-voting XGBoost ensemble on ``train_df2`` and write submission 49.

    Reads the notebook-level ``train_df2``, ``train_y_df2`` and ``test_df2``.
    Targets are label-encoded before fitting and the test predictions are
    decoded again before being passed to ``generate_dataframe``.
    """
    encoder = LabelEncoder()
    encoded_targets = encoder.fit_transform(train_y_df2)

    # Keyword arguments common to every member of the ensemble.
    shared = dict(tree_method='gpu_hist', objective='binary:logistic')
    # One entry per member, in voting order (xgb1 ... xgb10).
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, nthread=4, seed=42),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, max_depth=10),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, max_depth=10),
        dict(learning_rate=0.2, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(max_bin=255, n_estimators=400, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=450, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=1000, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=500, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=550, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
    ]
    members = [
        (f'xgb{rank}', XGBClassifier(**shared, **params))
        for rank, params in enumerate(member_params, start=1)
    ]
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        # The first two members carry twice the weight of the others.
        weights=[2, 2] + [1] * (len(members) - 2),
        n_jobs=-1,
        verbose=True,
    )

    # Cross-validation is disabled for this run; the check that was used:
    #     folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    #     cv_results = cross_validate(ensemble, train_df2, encoded_targets, cv=folds, scoring='f1', n_jobs=-1)
    #     print(np.mean(cv_results['test_score']))

    fitted = ensemble.fit(train_df2, encoded_targets)
    encoded_predictions = fitted.predict(test_df2)
    generate_dataframe(encoder.inverse_transform(encoded_predictions), 49)

[Voting] .................... (2 of 10) Processing xgb2, total= 1.6min
[Voting] .................... (1 of 10) Processing xgb1, total= 2.3min
[Voting] .................... (3 of 10) Processing xgb3, total= 1.1min
[Voting] .................... (4 of 10) Processing xgb4, total= 1.0min
[Voting] .................... (5 of 10) Processing xgb5, total= 1.5min
[Voting] .................... (6 of 10) Processing xgb6, total= 1.1min
[Voting] .................... (7 of 10) Processing xgb7, total= 1.2min
[Voting] .................... (8 of 10) Processing xgb8, total=  59.4s
[Voting] .................... (9 of 10) Processing xgb9, total=  40.7s
[Voting] .................. (10 of 10) Processing xgb10, total= 1.0min
[Voting] .................... (2 of 10) Processing xgb2, total= 1.5min
[Voting] .................... (1 of 10) Processing xgb1, total= 2.2min
[Voting] .................... (3 of 10) Processing xgb3, total= 1.0min
[Voting] .................... (4 of 10) Processing xgb4, total= 1.1min
[Votin

In [19]:
def xgb50():
    """Fit an eleven-model soft-voting XGBoost ensemble on ``train_df2`` and write submission 50.

    Reads the notebook-level ``train_df2``, ``train_y_df2`` and ``test_df2``.
    Targets are label-encoded before fitting and the test predictions are
    decoded again before being passed to ``generate_dataframe``.
    """
    encoder = LabelEncoder()
    encoded_targets = encoder.fit_transform(train_y_df2)

    # Keyword arguments common to every member of the ensemble.
    shared = dict(tree_method='gpu_hist', objective='binary:logistic')
    # One entry per member, in voting order (xgb1 ... xgb11).
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, nthread=4, seed=42),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, max_depth=10),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, max_depth=10),
        dict(learning_rate=0.2, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(max_bin=255, n_estimators=400, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=450, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=1000, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=500, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=550, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=650, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
    ]
    members = [
        (f'xgb{rank}', XGBClassifier(**shared, **params))
        for rank, params in enumerate(member_params, start=1)
    ]
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        # The first two members carry twice the weight of the others.
        weights=[2, 2] + [1] * (len(members) - 2),
        n_jobs=-1,
        verbose=True,
    )

    # Cross-validation is disabled for this run; the check that was used:
    #     folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    #     cv_results = cross_validate(ensemble, train_df2, encoded_targets, cv=folds, scoring='f1', n_jobs=-1)
    #     print(np.mean(cv_results['test_score']))

    fitted = ensemble.fit(train_df2, encoded_targets)
    encoded_predictions = fitted.predict(test_df2)
    generate_dataframe(encoder.inverse_transform(encoded_predictions), 50)

In [25]:
def xgb51():
    """Fit an eleven-model soft-voting XGBoost ensemble on ``train_df2`` and write submission 51.

    Reads the notebook-level ``train_df2``, ``train_y_df2`` and ``test_df2``.
    Targets are label-encoded before fitting and the test predictions are
    decoded again before being passed to ``generate_dataframe``.
    """
    encoder = LabelEncoder()
    encoded_targets = encoder.fit_transform(train_y_df2)

    # Keyword arguments common to every member of the ensemble.
    shared = dict(tree_method='gpu_hist', objective='binary:logistic')
    # One entry per member, in voting order (xgb1 ... xgb11).
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, gamma=0.01, reg_alpha=0.001),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, gamma=0.01, reg_alpha=0.002),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, max_depth=10, reg_alpha=0.001),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, max_depth=10, reg_alpha=0.002),
        dict(learning_rate=0.2, max_depth=8, n_estimators=1200, gamma=0.01, reg_alpha=0.002),
        dict(max_bin=255, n_estimators=400, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=450, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=1000, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=500, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=550, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=650, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
    ]
    members = [
        (f'xgb{rank}', XGBClassifier(**shared, **params))
        for rank, params in enumerate(member_params, start=1)
    ]
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        # The first two members carry twice the weight of the others.
        weights=[2, 2] + [1] * (len(members) - 2),
        n_jobs=-1,
        verbose=True,
    )

    # Cross-validation is disabled for this run; the check that was used:
    #     folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    #     cv_results = cross_validate(ensemble, train_df2, encoded_targets, cv=folds, scoring='f1', n_jobs=-1)
    #     print(np.mean(cv_results['test_score']))

    fitted = ensemble.fit(train_df2, encoded_targets)
    encoded_predictions = fitted.predict(test_df2)
    generate_dataframe(encoder.inverse_transform(encoded_predictions), 51)

In [20]:
def xgb52():
    """Fit a seven-model soft-voting XGBoost ensemble on ``train_df2`` and write submission 52.

    Reads the notebook-level ``train_df2``, ``train_y_df2`` and ``test_df2``.
    Targets are label-encoded before fitting and the test predictions are
    decoded again before being passed to ``generate_dataframe``.
    """
    encoder = LabelEncoder()
    encoded_targets = encoder.fit_transform(train_y_df2)

    # Keyword arguments common to every member of the ensemble.
    shared = dict(tree_method='gpu_hist', objective='binary:logistic')
    # One entry per member, in voting order (xgb1 ... xgb7).
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, gamma=0.01, reg_alpha=0.001),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, gamma=0.01, reg_alpha=0.002),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, max_depth=10, reg_alpha=0.001),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, max_depth=10, reg_alpha=0.002),
        dict(learning_rate=0.2, max_depth=8, n_estimators=1200, gamma=0.01, reg_alpha=0.002),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(n_estimators=450, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
    ]
    members = [
        (f'xgb{rank}', XGBClassifier(**shared, **params))
        for rank, params in enumerate(member_params, start=1)
    ]
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        # The first two members carry twice the weight of the others.
        weights=[2, 2] + [1] * (len(members) - 2),
        n_jobs=-1,
        verbose=True,
    )

    # Cross-validation is disabled for this run; the check that was used:
    #     folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    #     cv_results = cross_validate(ensemble, train_df2, encoded_targets, cv=folds, scoring='f1', n_jobs=-1)
    #     print(np.mean(cv_results['test_score']))

    fitted = ensemble.fit(train_df2, encoded_targets)
    encoded_predictions = fitted.predict(test_df2)
    generate_dataframe(encoder.inverse_transform(encoded_predictions), 52)

In [24]:
def xgb53():
    """Fit a five-model soft-voting XGBoost ensemble on ``train_df2`` and write submission 53.

    Reads the notebook-level ``train_df2``, ``train_y_df2`` and ``test_df2``.
    Targets are label-encoded before fitting and the test predictions are
    decoded again before being passed to ``generate_dataframe``.
    """
    encoder = LabelEncoder()
    encoded_targets = encoder.fit_transform(train_y_df2)

    # Keyword arguments common to every member of the ensemble.
    shared = dict(tree_method='gpu_hist', objective='binary:logistic')
    # One entry per member, in voting order (xgb1 ... xgb5).
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, gamma=0.01, reg_alpha=0.001),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, gamma=0.01, reg_alpha=0.002),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, max_depth=10, reg_alpha=0.001),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, max_depth=10, reg_alpha=0.002),
        dict(learning_rate=0.2, max_depth=8, n_estimators=450, gamma=0.01, reg_alpha=0.002),
    ]
    members = [
        (f'xgb{rank}', XGBClassifier(**shared, **params))
        for rank, params in enumerate(member_params, start=1)
    ]
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        # The first two members carry twice the weight of the others.
        weights=[2, 2] + [1] * (len(members) - 2),
        n_jobs=-1,
        verbose=True,
    )

    # Cross-validation is disabled for this run; the check that was used:
    #     folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    #     cv_results = cross_validate(ensemble, train_df2, encoded_targets, cv=folds, scoring='f1', n_jobs=-1)
    #     print(np.mean(cv_results['test_score']))

    fitted = ensemble.fit(train_df2, encoded_targets)
    encoded_predictions = fitted.predict(test_df2)
    generate_dataframe(encoder.inverse_transform(encoded_predictions), 53)

In [28]:
def xgb54():
    """Fit a thirteen-model soft-voting XGBoost ensemble on ``train_df2`` and write submission 54.

    Reads the notebook-level ``train_df2``, ``train_y_df2`` and ``test_df2``.
    Targets are label-encoded before fitting and the test predictions are
    decoded again before being passed to ``generate_dataframe``.
    """
    encoder = LabelEncoder()
    encoded_targets = encoder.fit_transform(train_y_df2)

    # Keyword arguments common to every member of the ensemble.
    shared = dict(tree_method='gpu_hist', objective='binary:logistic')
    # One entry per member, in voting order (xgb1 ... xgb13).
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, gamma=0.01, reg_alpha=0.001),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, gamma=0.01, reg_alpha=0.002),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, max_depth=10, reg_alpha=0.001),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, max_depth=10, reg_alpha=0.002),
        dict(learning_rate=0.2, max_depth=8, n_estimators=450, gamma=0.01, reg_alpha=0.002),
        dict(max_bin=255, n_estimators=400, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=450, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=1000, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=500, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=550, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=750, learning_rate=0.1, gamma=0.02, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=850, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=950, learning_rate=0.2, gamma=0.03, reg_alpha=0.001, max_depth=8),
    ]
    members = [
        (f'xgb{rank}', XGBClassifier(**shared, **params))
        for rank, params in enumerate(member_params, start=1)
    ]
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        # The first two members carry twice the weight of the others.
        weights=[2, 2] + [1] * (len(members) - 2),
        n_jobs=-1,
        verbose=True,
    )

    # Cross-validation is disabled for this run; the check that was used:
    #     folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    #     cv_results = cross_validate(ensemble, train_df2, encoded_targets, cv=folds, scoring='f1', n_jobs=-1)
    #     print(np.mean(cv_results['test_score']))

    fitted = ensemble.fit(train_df2, encoded_targets)
    encoded_predictions = fitted.predict(test_df2)
    generate_dataframe(encoder.inverse_transform(encoded_predictions), 54)

In [20]:
# Training columns for every feature ranked 34 or better (95 features per the original note).
train_df3 = train_dataset.loc[
    :, feature_ranks.loc[feature_ranks['Rankings'].le(34), 'Features']
]

In [18]:
train_df3.head()

Unnamed: 0,Assay ID,PEOE_VSA14,PEOE_VSA3,SMR_VSA6,SMR_VSA7,SlogP_VSA10,SlogP_VSA5,VSA_EState10,VSA_EState3,VSA_EState6,...,Chi4n,fr_sulfonamd,SMR_VSA10,fr_Ar_N,PEOE_VSA2,FpDensityMorgan2,BCUT2D_LOGPLOW,Chi3v,fr_allylic_oxid,BCUT2D_MWHI
0,1644,0.0,0.0,0.0,59.65784,0.0,17.044809,18.161033,18.6324,12.975791,...,1.903958,0,34.80282,0,0.0,1.0,-2.17892,3.434823,0,35.582517
1,2451,0.0,0.0,0.0,0.0,0.0,58.793226,0.0,0.0,0.0,...,1.154276,0,5.783245,0,0.0,1.545455,-1.88443,1.882392,0,16.137172
2,1384,0.0,0.0,27.184857,0.0,0.0,116.58062,0.0,0.0,0.0,...,3.459239,0,0.0,0,0.0,0.666667,-2.996005,5.23399,0,35.453001
3,16,0.0,20.090702,13.089513,39.160457,0.0,5.563451,5.675726,10.164847,3.516012,...,1.544532,0,17.560494,1,0.0,2.411765,-2.403506,2.507966,0,35.495696
4,1856,29.557286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0.0,1.0,0.0,0.0,0,0.0


In [21]:
# Test columns for every feature ranked 34 or better.
test_df3 = test_dataset.loc[
    :, feature_ranks.loc[feature_ranks['Rankings'].le(34), 'Features']
]

In [32]:
test_df3.shape

(10994, 95)

In [22]:
# Target column paired with train_df3.
train_y_df3 = train_dataset['Expected']

In [37]:
def xgb55():
    """Fit a nine-member XGBoost soft-voting ensemble and write submission 55.

    Trains on the notebook globals ``train_df3`` / ``train_y_df3``, predicts
    ``test_df3``, converts the predictions back to the original label values
    and passes them to ``generate_dataframe`` with submission number 55.
    Returns nothing.
    """
    encoder = LabelEncoder()
    encoded_target = encoder.fit_transform(train_y_df3)

    # Arguments common to every ensemble member.
    shared = dict(tree_method='gpu_hist', objective='binary:logistic')
    # One entry per member; entry i becomes the estimator named 'xgb{i}'.
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, nthread=4, seed=42),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, max_depth=10),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, max_depth=10),
        dict(learning_rate=0.2, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(max_bin=255, n_estimators=400, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=450, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=1000, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=500, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
    ]
    members = [
        (f'xgb{position}', XGBClassifier(**shared, **params))
        for position, params in enumerate(member_params, start=1)
    ]

    # Soft voting averages class probabilities; the first two members count double.
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        weights=[2, 2] + [1] * 7,
        n_jobs=-1,
        verbose=True,
    )

    # Optional 5-fold CV check (left disabled):
    # kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    # n_scores = cross_validate(ensemble, train_df3, encoded_target, cv=kfold, scoring='f1', n_jobs=-1)
    # print(np.mean(n_scores['test_score']))

    ensemble.fit(train_df3, encoded_target)
    encoded_pred = ensemble.predict(test_df3)
    generate_dataframe(encoder.inverse_transform(encoded_pred), 55)

In [19]:
def xgb56():
    """Fit an eleven-member, equally weighted XGBoost soft-voting ensemble and write submission 56.

    Trains on the notebook globals ``train_df2`` / ``train_y_df2`` (defined in
    earlier cells), predicts ``test_df2``, converts the predictions back to the
    original label values and passes them to ``generate_dataframe`` with
    submission number 56. Returns nothing.
    """
    encoder = LabelEncoder()
    encoded_target = encoder.fit_transform(train_y_df2)

    # Arguments common to every ensemble member.
    shared = dict(tree_method='gpu_hist', objective='binary:logistic')
    # One entry per member; entry i becomes the estimator named 'xgb{i}'.
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=500, nthread=4, seed=42),
        dict(learning_rate=0.1, max_depth=8, n_estimators=400, nthread=4, seed=42),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, max_depth=10),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, max_depth=10),
        dict(learning_rate=0.2, max_depth=8, n_estimators=500, nthread=4, seed=42),
        dict(max_bin=255, n_estimators=400, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=450, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=1000, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=500, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=550, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=650, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
    ]
    members = [
        (f'xgb{position}', XGBClassifier(**shared, **params))
        for position, params in enumerate(member_params, start=1)
    ]

    # Soft voting averages class probabilities; every member has weight 1 here.
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        weights=[1] * 11,
        n_jobs=-1,
        verbose=True,
    )

    # Optional 5-fold CV check (left disabled):
    # kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    # n_scores = cross_validate(ensemble, train_df2, encoded_target, cv=kfold, scoring='f1', n_jobs=-1)
    # print(np.mean(n_scores['test_score']))

    ensemble.fit(train_df2, encoded_target)
    encoded_pred = ensemble.predict(test_df2)
    generate_dataframe(encoder.inverse_transform(encoded_pred), 56)

In [23]:
def xgb57():
    """Fit a seven-member XGBoost soft-voting ensemble and write submission 57.

    Trains on the notebook globals ``train_df2`` / ``train_y_df2`` (defined in
    earlier cells), predicts ``test_df2``, converts the predictions back to the
    original label values and passes them to ``generate_dataframe`` with
    submission number 57. Returns nothing.
    """
    encoder = LabelEncoder()
    encoded_target = encoder.fit_transform(train_y_df2)

    # Arguments common to every ensemble member.
    shared = dict(tree_method='gpu_hist', objective='binary:logistic')
    # One entry per member; entry i becomes the estimator named 'xgb{i}'.
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, max_depth=8),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, max_depth=8),
        dict(learning_rate=0.2, max_depth=8, n_estimators=1000),
        dict(n_estimators=1000, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(n_estimators=1200, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=10),
    ]
    members = [
        (f'xgb{position}', XGBClassifier(**shared, **params))
        for position, params in enumerate(member_params, start=1)
    ]

    # Soft voting averages class probabilities; the first two members count double.
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        weights=[2, 2] + [1] * 5,
        n_jobs=-1,
        verbose=True,
    )

    # Optional 5-fold CV check (left disabled):
    # kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    # n_scores = cross_validate(ensemble, train_df2, encoded_target, cv=kfold, scoring='f1', n_jobs=-1)
    # print(np.mean(n_scores['test_score']))

    ensemble.fit(train_df2, encoded_target)
    encoded_pred = ensemble.predict(test_df2)
    generate_dataframe(encoder.inverse_transform(encoded_pred), 57)

In [30]:
def xgb58():
    """Fit an eight-member regularised XGBoost soft-voting ensemble and write submission 58.

    Trains on the notebook globals ``train_df2`` / ``train_y_df2`` (defined in
    earlier cells), predicts ``test_df2``, converts the predictions back to the
    original label values and passes them to ``generate_dataframe`` with
    submission number 58. Returns nothing.
    """
    encoder = LabelEncoder()
    encoded_target = encoder.fit_transform(train_y_df2)

    # Arguments common to every ensemble member.
    shared = dict(tree_method='gpu_hist', objective='binary:logistic')
    # One entry per member; entry i becomes the estimator named 'xgb{i}'.
    # NOTE(review): members 7 and 8 share identical settings - confirm this is intended.
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, gamma=0.03, reg_alpha=0.001),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, gamma=0.3, reg_alpha=0.01),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, reg_alpha=0.001, max_depth=8),
        dict(learning_rate=0.2, max_depth=8, n_estimators=1000, reg_alpha=0.001, gamma=0.001),
        dict(n_estimators=1000, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(n_estimators=1200, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(n_estimators=1200, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=10),
    ]
    members = [
        (f'xgb{position}', XGBClassifier(**shared, **params))
        for position, params in enumerate(member_params, start=1)
    ]

    # Soft voting averages class probabilities; the first two members count double.
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        weights=[2, 2] + [1] * 6,
        n_jobs=-1,
        verbose=True,
    )

    # Optional 5-fold CV check (left disabled):
    # kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    # n_scores = cross_validate(ensemble, train_df2, encoded_target, cv=kfold, scoring='f1', n_jobs=-1)
    # print(np.mean(n_scores['test_score']))

    ensemble.fit(train_df2, encoded_target)
    encoded_pred = ensemble.predict(test_df2)
    generate_dataframe(encoder.inverse_transform(encoded_pred), 58)

In [41]:
def xgb59():
    """Fit an eight-member regularised XGBoost soft-voting ensemble and write submission 59.

    Trains on the notebook globals ``train_df3`` / ``train_y_df3``, predicts
    ``test_df3``, converts the predictions back to the original label values
    and passes them to ``generate_dataframe`` with submission number 59.
    Returns nothing.
    """
    encoder = LabelEncoder()
    encoded_target = encoder.fit_transform(train_y_df3)

    # Arguments common to every ensemble member.
    shared = dict(tree_method='gpu_hist', objective='binary:logistic')
    # One entry per member; entry i becomes the estimator named 'xgb{i}'.
    # NOTE(review): members 7 and 8 share identical settings - confirm this is intended.
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, gamma=0.03, reg_alpha=0.001),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, gamma=0.3, reg_alpha=0.01),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, reg_alpha=0.001, max_depth=8),
        dict(learning_rate=0.2, max_depth=8, n_estimators=1000, reg_alpha=0.001, gamma=0.001),
        dict(n_estimators=1000, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(n_estimators=1200, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(n_estimators=1200, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=10),
    ]
    members = [
        (f'xgb{position}', XGBClassifier(**shared, **params))
        for position, params in enumerate(member_params, start=1)
    ]

    # Soft voting averages class probabilities; the first two members count double.
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        weights=[2, 2] + [1] * 6,
        n_jobs=-1,
        verbose=True,
    )

    # Optional 5-fold CV check (left disabled):
    # kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    # n_scores = cross_validate(ensemble, train_df3, encoded_target, cv=kfold, scoring='f1', n_jobs=-1)
    # print(np.mean(n_scores['test_score']))

    ensemble.fit(train_df3, encoded_target)
    encoded_pred = ensemble.predict(test_df3)
    generate_dataframe(encoder.inverse_transform(encoded_pred), 59)

In [38]:
# Keep every feature whose rank is 6 or better (70 columns per the original note).
train_df4 = train_dataset.loc[:, feature_ranks.loc[feature_ranks['Rankings'] <= 6, 'Features']]

In [35]:
# Preview the first five rows of the rank<=6 training matrix.
train_df4.head(n=5)

Unnamed: 0,Assay ID,PEOE_VSA14,PEOE_VSA3,SMR_VSA6,SMR_VSA7,SlogP_VSA10,SlogP_VSA5,VSA_EState10,VSA_EState3,VSA_EState6,...,fr_para_hydroxylation,PEOE_VSA11,BCUT2D_MWLOW,fr_pyridine,SMR_VSA2,fr_Al_COO,fr_imidazole,Chi1,fr_NH1,MinEStateIndex
0,1644,0.0,0.0,0.0,59.65784,0.0,17.044809,18.161033,18.6324,12.975791,...,0,0.0,9.922283,0,0.0,0,0,8.876029,0,-1.533785
1,2451,0.0,0.0,0.0,0.0,0.0,58.793226,0.0,0.0,0.0,...,0,0.0,10.087958,0,0.0,0,0,5.270056,0,0.333788
2,1384,0.0,0.0,27.184857,0.0,0.0,116.58062,0.0,0.0,0.0,...,0,0.0,10.070986,0,0.0,0,0,11.12132,0,0.0
3,16,0.0,20.090702,13.089513,39.160457,0.0,5.563451,5.675726,10.164847,3.516012,...,0,5.032314,10.251808,1,0.0,0,0,8.147867,1,-0.613825
4,1856,29.557286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0,0.0,0,0,0.0,0,0.0


In [39]:
# Apply the same rank<=6 column selection to the test set.
test_df4 = test_dataset.loc[:, feature_ranks.loc[feature_ranks['Rankings'] <= 6, 'Features']]

In [37]:
# Preview the first five rows of the rank<=6 test matrix.
test_df4.head(n=5)

Unnamed: 0,Assay ID,PEOE_VSA14,PEOE_VSA3,SMR_VSA6,SMR_VSA7,SlogP_VSA10,SlogP_VSA5,VSA_EState10,VSA_EState3,VSA_EState6,...,fr_para_hydroxylation,PEOE_VSA11,BCUT2D_MWLOW,fr_pyridine,SMR_VSA2,fr_Al_COO,fr_imidazole,Chi1,fr_NH1,MinEStateIndex
0,1682,0.0,0.0,0.0,29.326004,0.0,31.898115,0.0,9.626968,5.824954,...,0,0.0,9.8578,0,0.0,0,0,5.409702,0,0.025579
1,1656,6.031115,22.918408,25.289237,24.395945,10.742876,6.923737,0.0,1.305518,2.474888,...,0,14.863213,10.501054,1,0.0,0,0,13.123689,2,-4.605249
2,36,0.0,26.9047,26.681941,94.272053,8.78083,72.733437,6.27236,-0.146035,18.366259,...,2,0.0,9.697623,0,0.0,0,1,22.991969,1,-4.140552
3,1850,51.384535,8.417797,0.0,16.55728,0.0,6.923737,0.0,0.0,0.0,...,0,0.0,0.0,0,0.0,0,0,4.4948,0,-3.973958
4,30,11.938611,0.0,0.0,23.801165,0.0,73.143616,0.0,9.917687,0.0,...,0,0.0,9.649315,0,0.0,0,0,14.101393,0,-0.601027


In [41]:
# Target column used with the rank<=6 feature set.
train_y_df4 = train_dataset['Expected']

In [42]:
# Preview the first five target values.
train_y_df4.head(n=5)

0    2
1    2
2    2
3    2
4    2
Name: Expected, dtype: int64

In [47]:
def voting60():
    """Fit an eight-member XGBoost soft-voting ensemble and write submission 60.

    Trains on the notebook globals ``train_df4`` / ``train_y_df4``, predicts
    ``test_df4``, converts the predictions back to the original label values
    and passes them to ``generate_dataframe`` with submission number 60.
    Returns nothing.
    """
    encoder = LabelEncoder()
    encoded_target = encoder.fit_transform(train_y_df4)

    # Arguments common to every ensemble member.
    shared = dict(
        tree_method='gpu_hist',
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        min_child_weight=1,
    )
    # One entry per member; entry i becomes the estimator named 'xgb{i}'.
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, seed=42),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, seed=24),
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, seed=15, gamma=1, subsample=0.8, colsample_bytree=0.8),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, seed=42),
        dict(learning_rate=0.2, max_depth=10, n_estimators=1000, seed=15, gamma=1, subsample=0.8, colsample_bytree=0.8),
        dict(learning_rate=0.2, max_depth=8, n_estimators=1200, seed=46),
        dict(learning_rate=0.1, max_depth=10, n_estimators=400, seed=15, gamma=1, subsample=0.8, colsample_bytree=0.8),
        dict(learning_rate=0.2, max_depth=8, n_estimators=450, seed=42),
    ]
    members = [
        (f'xgb{position}', XGBClassifier(**shared, **params))
        for position, params in enumerate(member_params, start=1)
    ]

    # Soft voting averages class probabilities; members 3 and 4 count double.
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        weights=[1, 1, 2, 2] + [1] * 4,
        n_jobs=-1,
        verbose=True,
    )

    # Optional 5-fold CV check (left disabled):
    # kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    # n_scores = cross_validate(ensemble, train_df4, encoded_target, cv=kfold, scoring='f1', n_jobs=-1)
    # print(np.mean(n_scores['test_score']))

    ensemble.fit(train_df4, encoded_target)
    encoded_pred = ensemble.predict(test_df4)
    generate_dataframe(encoder.inverse_transform(encoded_pred), 60)
    
    
    
    

In [52]:
def voting61():
    """Fit an eleven-member XGBoost soft-voting ensemble and write submission 61.

    Trains on the notebook globals ``train_df2`` / ``train_y_df2`` (defined in
    earlier cells), predicts ``test_df2``, converts the predictions back to the
    original label values and passes them to ``generate_dataframe`` with
    submission number 61. Returns nothing.
    """
    encoder = LabelEncoder()
    encoded_target = encoder.fit_transform(train_y_df2)

    # Arguments common to every ensemble member.
    shared = dict(tree_method='gpu_hist', objective='binary:logistic')
    # One entry per member; entry i becomes the estimator named 'xgb{i}'.
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, nthread=4, seed=42),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, max_depth=10),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, max_depth=10),
        dict(learning_rate=0.2, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(max_bin=255, n_estimators=400, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=450, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=1000, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=500, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=550, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=650, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
    ]
    members = [
        (f'xgb{position}', XGBClassifier(**shared, **params))
        for position, params in enumerate(member_params, start=1)
    ]

    # Soft voting averages class probabilities; the first two members count double.
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        weights=[2, 2] + [1] * 9,
        n_jobs=-1,
        verbose=True,
    )

    # Optional 5-fold CV check (left disabled):
    # kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    # n_scores = cross_validate(ensemble, train_df2, encoded_target, cv=kfold, scoring='f1', n_jobs=-1)
    # print(np.mean(n_scores['test_score']))

    ensemble.fit(train_df2, encoded_target)
    encoded_pred = ensemble.predict(test_df2)
    generate_dataframe(encoder.inverse_transform(encoded_pred), 61)

In [56]:
def voting62():
    """Fit a nine-member XGBoost soft-voting ensemble and write submission 62.

    Trains on the notebook globals ``train_df2`` / ``train_y_df2`` (defined in
    earlier cells), predicts ``test_df2``, converts the predictions back to the
    original label values and passes them to ``generate_dataframe`` with
    submission number 62. Returns nothing.
    """
    encoder = LabelEncoder()
    encoded_target = encoder.fit_transform(train_y_df2)

    # Arguments common to every ensemble member.
    shared = dict(
        tree_method='gpu_hist',
        objective='binary:logistic',
        scale_pos_weight=1,
        min_child_weight=1,
    )
    # One entry per member; entry i becomes the estimator named 'xgb{i}'.
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, nthread=4, seed=42),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, max_depth=10),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, max_depth=10),
        dict(learning_rate=0.2, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(max_bin=255, n_estimators=400, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=450, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=1000, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=500, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
    ]
    members = [
        (f'xgb{position}', XGBClassifier(**shared, **params))
        for position, params in enumerate(member_params, start=1)
    ]

    # Soft voting averages class probabilities; the first two members count double.
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        weights=[2, 2] + [1] * 7,
        n_jobs=-1,
        verbose=True,
    )

    # Optional 5-fold CV check (left disabled):
    # kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    # n_scores = cross_validate(ensemble, train_df2, encoded_target, cv=kfold, scoring='f1', n_jobs=-1)
    # print(np.mean(n_scores['test_score']))

    ensemble.fit(train_df2, encoded_target)
    encoded_pred = ensemble.predict(test_df2)
    generate_dataframe(encoder.inverse_transform(encoded_pred), 62)

In [28]:
# Keep every feature whose rank is 101 or better.
train_df5 = train_dataset.loc[:, feature_ranks.loc[feature_ranks['Rankings'] <= 101, 'Features']]

In [29]:
# Apply the same rank<=101 column selection to the test set.
test_df5 = test_dataset.loc[:, feature_ranks.loc[feature_ranks['Rankings'] <= 101, 'Features']]

In [30]:
# Target column used with the rank<=101 feature set.
train_y_df5 = train_dataset['Expected']

In [40]:
def voting63():
    """Fit an eleven-member XGBoost soft-voting ensemble and write submission 63.

    Trains on the notebook globals ``train_df5`` / ``train_y_df5``, predicts
    ``test_df5``, converts the predictions back to the original label values
    and passes them to ``generate_dataframe`` with submission number 63.
    Returns nothing.
    """
    encoder = LabelEncoder()
    encoded_target = encoder.fit_transform(train_y_df5)

    # Arguments common to every ensemble member.
    shared = dict(tree_method='gpu_hist', objective='binary:logistic')
    # One entry per member; entry i becomes the estimator named 'xgb{i}'.
    member_params = [
        dict(learning_rate=0.1, max_depth=10, n_estimators=1000, nthread=4, seed=42),
        dict(learning_rate=0.1, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.01, max_depth=10),
        dict(n_estimators=400, learning_rate=0.1, gamma=0.001, max_depth=10),
        dict(learning_rate=0.2, max_depth=8, n_estimators=1200, nthread=4, seed=42),
        dict(max_bin=255, n_estimators=400, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=450, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=1000, learning_rate=0.2, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=500, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
        dict(max_bin=255, n_estimators=550, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=10),
        dict(max_bin=255, n_estimators=650, learning_rate=0.1, gamma=0.01, reg_alpha=0.001, max_depth=8),
    ]
    members = [
        (f'xgb{position}', XGBClassifier(**shared, **params))
        for position, params in enumerate(member_params, start=1)
    ]

    # Soft voting averages class probabilities; the first two members count double.
    ensemble = VotingClassifier(
        estimators=members,
        voting='soft',
        weights=[2, 2] + [1] * 9,
        n_jobs=-1,
        verbose=True,
    )

    # Optional 5-fold CV check (left disabled):
    # kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    # n_scores = cross_validate(ensemble, train_df5, encoded_target, cv=kfold, scoring='f1', n_jobs=-1)
    # print(np.mean(n_scores['test_score']))

    ensemble.fit(train_df5, encoded_target)
    encoded_pred = ensemble.predict(test_df5)
    generate_dataframe(encoder.inverse_transform(encoded_pred), 63)
    

[Voting] .................... (1 of 11) Processing xgb1, total= 1.5min
[Voting] .................... (4 of 11) Processing xgb4, total=  40.5s
[Voting] .................... (6 of 11) Processing xgb6, total=  40.8s
[Voting] .................... (8 of 11) Processing xgb8, total=  39.7s
[Voting] .................. (10 of 11) Processing xgb10, total=  52.0s
[Voting] .................... (2 of 11) Processing xgb2, total= 1.1min
[Voting] .................... (3 of 11) Processing xgb3, total=  40.6s
[Voting] .................... (5 of 11) Processing xgb5, total=  55.3s
[Voting] .................... (7 of 11) Processing xgb7, total=  44.4s
[Voting] .................... (9 of 11) Processing xgb9, total=  26.1s
[Voting] .................. (11 of 11) Processing xgb11, total=  33.6s


In [39]:
# Entry point: no ensemble is trained automatically. This only prints a reminder;
# uncomment the call below (or call another function defined above) to run one.
if __name__ == '__main__':
    print("Run the above functions")
#     voting63()