In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, StratifiedKFold, cross_validate
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold

In [2]:
def generate_dataframe(y_pred, num):
    """Write a submission CSV pairing test ids with predicted labels.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels; they become the ``Predicted`` column.
    num : int or str
        Suffix for the output file name (``submission{num}.csv``).

    The ``Id`` column is taken from column ``x`` of ``test_II.csv`` in the
    current working directory and is placed in front of the predictions.
    """
    submission = pd.DataFrame(y_pred, columns=['Predicted'])
    test_ids = pd.read_csv('test_II.csv')['x']
    # Ids go first so the file reads as (Id, Predicted).
    submission.insert(loc=0, column='Id', value=test_ids, allow_duplicates=True)
    submission.to_csv(f'submission{num}.csv', index=False)

In [3]:
# --- Training data -----------------------------------------------------------
# Drop the saved index column and replace missing descriptor values with 0.
train_dataset = (
    pd.read_csv('feature_descriptors.csv')
    .drop(columns='Unnamed: 0')
    .fillna(0)
)
# Show every row when a frame/series is printed (e.g. the dtype listing below).
pd.set_option('display.max_rows', None)

# Split the label column from the feature columns.
train_y = train_dataset['Expected']
train_x = train_dataset.drop(columns=['Expected'])
print(train_x.dtypes)

# --- Test data ---------------------------------------------------------------
# Same clean-up as the training frame: drop the saved index, fill nulls with 0.
test_dataset = (
    pd.read_csv('test_feature_desc.csv')
    .drop(columns='Unnamed: 0')
    .fillna(0)
)

Assay ID                      int64
MaxEStateIndex              float64
MinEStateIndex              float64
MaxAbsEStateIndex           float64
MinAbsEStateIndex           float64
qed                         float64
MolWt                       float64
HeavyAtomMolWt              float64
ExactMolWt                  float64
NumValenceElectrons           int64
NumRadicalElectrons           int64
MaxPartialCharge            float64
MinPartialCharge            float64
MaxAbsPartialCharge         float64
MinAbsPartialCharge         float64
FpDensityMorgan1            float64
FpDensityMorgan2            float64
FpDensityMorgan3            float64
BCUT2D_MWHI                 float64
BCUT2D_MWLOW                float64
BCUT2D_CHGHI                float64
BCUT2D_CHGLO                float64
BCUT2D_LOGPHI               float64
BCUT2D_LOGPLOW              float64
BCUT2D_MRHI                 float64
BCUT2D_MRLOW                float64
BalabanJ                    float64
BertzCT                     

In [4]:
# svm linear
def run_svm():
    """Fit a linear-kernel SVC and export its test predictions.

    Trains on the module-level ``train_x``/``train_y``, predicts
    ``test_dataset`` and passes the result to ``generate_dataframe`` as
    submission number 1.
    """
    linear_svc = SVC(kernel='linear')
    linear_svc.fit(train_x, train_y)
    generate_dataframe(linear_svc.predict(test_dataset), 1)

In [5]:
#naive bayes classifier
def run_naive():
    """Fit a Gaussian naive Bayes classifier and export its test predictions.

    Trains on the module-level ``train_x``/``train_y``, predicts
    ``test_dataset`` and passes the result to ``generate_dataframe`` as
    submission number 2.
    """
    gaussian_nb = GaussianNB()
    gaussian_nb.fit(train_x, train_y)
    predictions = gaussian_nb.predict(test_dataset)
    generate_dataframe(predictions, 2)

In [6]:
# svm poly 
def run_svm_poly():
    """Fit a polynomial-kernel SVC and export its test predictions.

    Trains on the module-level ``train_x``/``train_y``, predicts
    ``test_dataset`` and passes the result to ``generate_dataframe`` as
    submission number 3.
    """
    poly_svc = SVC(kernel='poly')
    poly_svc.fit(train_x, train_y)
    generate_dataframe(poly_svc.predict(test_dataset), 3)

In [7]:
#  svm rbf 
def run_svm_rbf():
    """Fit an RBF-kernel SVC (gamma=1e-6, C=1) and export its test predictions.

    Trains on the module-level ``train_x``/``train_y``, predicts
    ``test_dataset`` and passes the result to ``generate_dataframe`` as
    submission number 4.
    """
    rbf_svc = SVC(kernel='rbf', gamma=0.000001, C=1)
    rbf_svc.fit(train_x, train_y)
    predictions = rbf_svc.predict(test_dataset)
    generate_dataframe(predictions, 4)


In [8]:
# knn = 7
def run_knn_7():
    """Fit a 7-nearest-neighbours classifier and export its test predictions.

    Trains on the module-level ``train_x``/``train_y``, predicts
    ``test_dataset`` and passes the result to ``generate_dataframe`` as
    submission number 5.
    """
    knn = KNeighborsClassifier(n_neighbors=7)
    knn.fit(train_x, train_y)
    generate_dataframe(knn.predict(test_dataset), 5)

In [9]:
# knn = 70 (note: the data-loading cell fills null values with 0, not the mean)
def run_knn_70():
    """Fit a 70-nearest-neighbours classifier and export its test predictions.

    Trains on the module-level ``train_x``/``train_y``, predicts
    ``test_dataset`` and passes the result to ``generate_dataframe`` as
    submission number 6.
    """
    knn = KNeighborsClassifier(n_neighbors=70)
    knn.fit(train_x, train_y)
    predictions = knn.predict(test_dataset)
    generate_dataframe(predictions, 6)


In [10]:
# knn = 29 (note: the data-loading cell fills null values with 0, not the mean)
def run_knn_29():
    """Fit a 29-nearest-neighbours classifier and export its test predictions.

    Trains on the module-level ``train_x``/``train_y``, predicts
    ``test_dataset`` and passes the result to ``generate_dataframe`` as
    submission number 7.
    """
    knn = KNeighborsClassifier(n_neighbors=29)
    knn.fit(train_x, train_y)
    generate_dataframe(knn.predict(test_dataset), 7)

In [11]:
def run_knn_17():
    """Cross-validate and fit a 17-nearest-neighbours classifier.

    Prints the mean and standard deviation of accuracy over a repeated
    stratified 10-fold split (3 repeats), then refits on the full
    ``train_x``/``train_y`` and exports the ``test_dataset`` predictions as
    submission number 8.
    """
    knn = KNeighborsClassifier(n_neighbors=17)

    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    fold_accuracy = cross_val_score(
        knn,
        train_x,
        train_y,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
        error_score='raise',
    )
    print('Accuracy: %.3f (%.3f)' % (np.mean(fold_accuracy), np.std(fold_accuracy)))

    knn.fit(train_x, train_y)
    generate_dataframe(knn.predict(test_dataset), 8)

In [12]:
# extra tree classifier
def extra_tree():
    """Cross-validate and fit a default ExtraTreesClassifier.

    Prints the mean and standard deviation of accuracy over a repeated
    stratified 10-fold split (3 repeats), then refits on the full
    ``train_x``/``train_y`` and exports the ``test_dataset`` predictions as
    submission number 9.
    """
    forest = ExtraTreesClassifier()

    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    fold_accuracy = cross_val_score(
        forest,
        train_x,
        train_y,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
        error_score='raise',
    )
    print('Accuracy: %.3f (%.3f)' % (np.mean(fold_accuracy), np.std(fold_accuracy)))

    forest.fit(train_x, train_y)
    predictions = forest.predict(test_dataset)
    generate_dataframe(predictions, 9)

In [13]:
# Gradient boosting - high acc
def gradient_boosting():
    """Cross-validate and fit a GradientBoostingClassifier (learning_rate=0.5).

    Prints the mean and standard deviation of accuracy over a repeated
    stratified 10-fold split (3 repeats), refits on the full
    ``train_x``/``train_y``, prints the fitted feature importances and exports
    the ``test_dataset`` predictions as submission number 10.
    """
    booster = GradientBoostingClassifier(learning_rate=0.5)

    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    fold_accuracy = cross_val_score(
        booster,
        train_x,
        train_y,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
        error_score='raise',
    )
    print('Accuracy: %.3f (%.3f)' % (np.mean(fold_accuracy), np.std(fold_accuracy)))

    booster.fit(train_x, train_y)
    print(booster.feature_importances_)

    generate_dataframe(booster.predict(test_dataset), 10)
    

In [14]:
def extra_tree_200():
    """Cross-validate and fit an ExtraTreesClassifier with 200 trees.

    Prints the mean and standard deviation of accuracy over a repeated
    stratified 10-fold split (3 repeats), then refits on the full
    ``train_x``/``train_y`` and exports the ``test_dataset`` predictions as
    submission number 11.
    """
    forest = ExtraTreesClassifier(n_estimators=200)

    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    fold_accuracy = cross_val_score(
        forest,
        train_x,
        train_y,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
        error_score='raise',
    )
    print('Accuracy: %.3f (%.3f)' % (np.mean(fold_accuracy), np.std(fold_accuracy)))

    forest.fit(train_x, train_y)
    generate_dataframe(forest.predict(test_dataset), 11)

In [22]:
def xgboost12():
    """Cross-validate and fit a default XGBClassifier; export predictions as submission 12.

    The target is label-encoded before training. Prints the mean F1 over a
    shuffled stratified 5-fold split, refits on the full training set and
    predicts ``test_dataset``.

    The predictions are mapped back to the original label values with
    ``LabelEncoder.inverse_transform`` before being written, so the submission
    contains the same label values as ``train_y`` (consistent with the other
    XGBoost runners in this notebook) rather than the encoder's 0..n-1 codes.
    """
    xgb = XGBClassifier()
    le = LabelEncoder()
    y_train = le.fit_transform(train_y)

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    n_scores = cross_validate(xgb, train_x, y_train, cv=kfold, scoring='f1', n_jobs=6)
    print(np.mean(n_scores['test_score']))

    model = xgb.fit(train_x, y_train)
    pred_y = model.predict(test_dataset)
    # Decode back to the original labels; the model was trained on encoded targets.
    y_new = le.inverse_transform(pred_y)
    generate_dataframe(y_new, 12)

In [24]:
def xgboost13():
    """Cross-validate and fit a default XGBClassifier; export predictions as submission 13.

    The target is label-encoded for training; predictions are decoded back to
    the original labels before being written. Prints the mean F1 over a
    shuffled stratified 5-fold split.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)
    clf = XGBClassifier()

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    cv_result = cross_validate(
        clf, train_x, encoded_y, cv=folds, scoring='f1', n_jobs=6
    )
    print(np.mean(cv_result['test_score']))

    clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(clf.predict(test_dataset))
    generate_dataframe(decoded, 13)

In [26]:
def xgboost14():
    """Grid-search XGBoost settings, then fit a fixed configuration (submission 14).

    Steps:
      1. Label-encode ``train_y``.
      2. Run a 5-fold F1 grid search over depth / tree count / learning rate
         and print the best parameters and score.
      3. Print the mean F1 of the fixed classifier (learning_rate=0.1,
         max_depth=10, n_estimators=1000) over a shuffled stratified 5-fold split.
      4. Fit that fixed classifier on the full training set and export the
         decoded ``test_dataset`` predictions.

    NOTE: the grid-search outcome is only printed; the exported model is the
    separately configured classifier, not ``best_estimator_``.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)

    final_clf = XGBClassifier(
        learning_rate=0.1,
        max_depth=10,
        n_estimators=1000,
        objective='binary:logistic',
        nthread=4,
        seed=42,
    )

    search = GridSearchCV(
        estimator=XGBClassifier(objective='binary:logistic', nthread=4, seed=42),
        param_grid={
            'max_depth': [8, 10],
            'n_estimators': [1000, 1200],
            'learning_rate': [0.1, 0.2],
        },
        scoring='f1',
        n_jobs=6,
        cv=5,
        verbose=True,
    )
    search.fit(train_x, encoded_y)
    print(search.best_params_, search.best_score_)

    folds = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    cv_result = cross_validate(
        final_clf, train_x, encoded_y, scoring='f1', cv=folds, n_jobs=-1
    )
    print(np.mean(cv_result['test_score']))

    final_clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(final_clf.predict(test_dataset))
    generate_dataframe(decoded, 14)

In [4]:
def xgboost15():
    """Grid-search XGBoost settings, then fit a fixed configuration (submission 15).

    Label-encodes ``train_y``, runs a 5-fold F1 grid search over depth / tree
    count / learning rate and prints its best parameters and score, prints the
    mean F1 of the fixed classifier (learning_rate=0.1, max_depth=8,
    n_estimators=1200) over a shuffled stratified 5-fold split, then fits that
    classifier on the full training set and exports the decoded
    ``test_dataset`` predictions.

    NOTE: the grid-search outcome is only printed; the exported model is the
    separately configured ``xgb`` classifier, not ``best_estimator_``.
    """
    xgb = XGBClassifier(learning_rate=0.1, max_depth=8, n_estimators=1200, objective='binary:logistic', nthread=4, seed=42)
    le = LabelEncoder()
    y_train = le.fit_transform(train_y)

    estimator = XGBClassifier(objective='binary:logistic', nthread=4, seed=42)

    parameters = {'max_depth': [8, 10], 'n_estimators': [1000, 1200], 'learning_rate': [0.1, 0.2]}

    grid_search = GridSearchCV(estimator=estimator, param_grid=parameters, scoring='f1', n_jobs=6, cv=5, verbose=True)

    # A stray "cr" token after this call made the whole cell a SyntaxError.
    grid_search.fit(train_x, y_train)

    print(grid_search.best_params_, grid_search.best_score_)

    cv = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    n_scores = cross_validate(xgb, train_x, y_train, scoring='f1', cv=cv, n_jobs=-1)
    print(np.mean(n_scores['test_score']))

    model = xgb.fit(train_x, y_train)
    pred_y = model.predict(test_dataset)
    # Decode back to the original labels before writing the submission.
    y_new = le.inverse_transform(pred_y)
    generate_dataframe(y_new, 15)

In [9]:
def xgboost16():
    """Grid-search XGBoost settings, then fit a default XGBClassifier (submission 16).

    Label-encodes ``train_y``, runs a 5-fold F1 grid search over tree count /
    learning rate / depth and prints its best parameters and score, prints the
    mean F1 of a default ``XGBClassifier()`` over a shuffled stratified 5-fold
    split, then fits that default classifier on the full training set and
    exports the decoded ``test_dataset`` predictions.

    NOTE: the grid-search outcome is only printed; the exported model is the
    default-configured classifier, not ``best_estimator_``.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)
    final_clf = XGBClassifier()

    search = GridSearchCV(
        estimator=XGBClassifier(objective='binary:logistic', nthread=4, seed=42),
        param_grid={
            'n_estimators': [1500, 1200],
            'learning_rate': [0.1, 0.2],
            'max_depth': [8, 10],
        },
        scoring='f1',
        n_jobs=6,
        cv=5,
        verbose=True,
    )
    search.fit(train_x, encoded_y)
    print(search.best_params_, search.best_score_)

    folds = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    cv_result = cross_validate(
        final_clf, train_x, encoded_y, scoring='f1', cv=folds, n_jobs=-1
    )
    print(np.mean(cv_result['test_score']))

    final_clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(final_clf.predict(test_dataset))
    generate_dataframe(decoded, 16)

In [34]:
def stacking17():
    """Stack random forest, extra trees and gradient boosting under XGBoost (submission 17).

    Base learners: ``RandomForestClassifier(n_estimators=500, random_state=3)``,
    ``ExtraTreesClassifier(n_estimators=200)`` and
    ``GradientBoostingClassifier(learning_rate=0.5)``; the final estimator is a
    default ``XGBClassifier``. Prints the mean F1 over a shuffled stratified
    5-fold split, refits on the full label-encoded training set and exports the
    decoded ``test_dataset`` predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)  # 0, 1

    base_learners = [
        ('random_forest', RandomForestClassifier(n_estimators=500, random_state=3)),
        ('extra_tree', ExtraTreesClassifier(n_estimators=200)),
        ('gradient_boosting', GradientBoostingClassifier(learning_rate=0.5)),
    ]
    meta_learner = XGBClassifier()
    stack = StackingClassifier(estimators=base_learners, final_estimator=meta_learner)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    cv_result = cross_validate(
        stack, train_x, encoded_y, cv=folds, scoring='f1', n_jobs=6
    )
    print(np.mean(cv_result['test_score']))

    stack.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(stack.predict(test_dataset))
    generate_dataframe(decoded, 17)
    

In [9]:
def stacking18():
    """Stack extra trees, gradient boosting and a tuned XGBoost under XGBoost (submission 18).

    Base learners: ``ExtraTreesClassifier(n_estimators=200)``,
    ``GradientBoostingClassifier(learning_rate=0.5)`` and an ``XGBClassifier``
    with learning_rate=0.1, max_depth=8, n_estimators=1200; the final estimator
    is a default ``XGBClassifier``. Prints the mean F1 over a shuffled
    stratified 5-fold split, refits on the full label-encoded training set and
    exports the decoded ``test_dataset`` predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)  # 0, 1

    base_learners = [
        ('extra_tree', ExtraTreesClassifier(n_estimators=200)),
        ('gradient_boosting', GradientBoostingClassifier(learning_rate=0.5)),
        (
            'xgboost',
            XGBClassifier(
                learning_rate=0.1,
                max_depth=8,
                n_estimators=1200,
                objective='binary:logistic',
                nthread=4,
                seed=42,
            ),
        ),
    ]
    meta_learner = XGBClassifier()
    stack = StackingClassifier(estimators=base_learners, final_estimator=meta_learner)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    cv_result = cross_validate(
        stack, train_x, encoded_y, cv=folds, scoring='f1', n_jobs=6
    )
    print(np.mean(cv_result['test_score']))

    stack.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(stack.predict(test_dataset))
    generate_dataframe(decoded, 18)
    

In [20]:
def stacking19():
    """Stack extra trees, gradient boosting and a hist-method XGBoost under XGBoost (submission 19).

    Same layout as the other stacking runners, with the XGBoost base learner
    using ``tree_method='hist'`` (learning_rate=0.1, max_depth=8,
    n_estimators=1200) and a default ``XGBClassifier`` as final estimator.
    Prints the mean F1 over a shuffled stratified 5-fold split, refits on the
    full label-encoded training set and exports the decoded ``test_dataset``
    predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)  # 0, 1

    base_learners = [
        ('extra_tree', ExtraTreesClassifier(n_estimators=200)),
        ('gradient_boosting', GradientBoostingClassifier(learning_rate=0.5)),
        (
            'xgboost',
            XGBClassifier(
                tree_method='hist',
                learning_rate=0.1,
                max_depth=8,
                n_estimators=1200,
                objective='binary:logistic',
                nthread=4,
                seed=42,
            ),
        ),
    ]
    meta_learner = XGBClassifier()
    stack = StackingClassifier(estimators=base_learners, final_estimator=meta_learner)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    cv_result = cross_validate(
        stack, train_x, encoded_y, cv=folds, scoring='f1', n_jobs=6
    )
    print(np.mean(cv_result['test_score']))

    stack.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(stack.predict(test_dataset))
    generate_dataframe(decoded, 19)
    

In [64]:
def xgb25():
    """Cross-validate and fit a regularised hist-method XGBClassifier (submission 25).

    Configuration: 600 trees, depth 10, learning rate 0.1, gamma 0.65,
    reg_alpha 0.9, reg_lambda 0.89 and 0.99 row/column subsampling. Prints the
    mean F1 over a shuffled stratified 5-fold split, refits on the full
    label-encoded training set and exports the decoded ``test_dataset``
    predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)

    booster_params = dict(
        tree_method='hist',
        max_bin=255,
        n_estimators=600,
        learning_rate=0.1,
        objective='binary:logistic',
        gamma=0.65,
        reg_alpha=0.9,
        reg_lambda=0.89,
        subsample=0.99,
        colsample_bytree=0.99,
        max_depth=10,
        colsample_bylevel=0.99,
    )
    clf = XGBClassifier(**booster_params)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    cv_result = cross_validate(
        clf, train_x, encoded_y, cv=folds, scoring='f1', n_jobs=6
    )
    print(np.mean(cv_result['test_score']))

    clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(clf.predict(test_dataset))
    generate_dataframe(decoded, 25)

In [70]:
def xgb26():
    """Cross-validate and fit a regularised hist-method XGBClassifier (submission 26).

    Configuration: 400 trees, depth 10, learning rate 0.1, gamma 0.65,
    reg_alpha 0.9, reg_lambda 0.89 and 0.99 row/column subsampling. Prints the
    mean F1 over a shuffled stratified 5-fold split, refits on the full
    label-encoded training set and exports the decoded ``test_dataset``
    predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)

    booster_params = dict(
        tree_method='hist',
        max_bin=255,
        n_estimators=400,
        learning_rate=0.1,
        objective='binary:logistic',
        gamma=0.65,
        reg_alpha=0.9,
        reg_lambda=0.89,
        subsample=0.99,
        colsample_bytree=0.99,
        max_depth=10,
        colsample_bylevel=0.99,
    )
    clf = XGBClassifier(**booster_params)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    cv_result = cross_validate(
        clf, train_x, encoded_y, cv=folds, scoring='f1', n_jobs=6
    )
    print(np.mean(cv_result['test_score']))

    clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(clf.predict(test_dataset))
    generate_dataframe(decoded, 26)

In [91]:
def xgb27():
    """Cross-validate and fit a lightly regularised hist-method XGBClassifier (submission 27).

    Configuration: 400 trees, depth 10, learning rate 0.1, gamma 0.01,
    reg_alpha 0.001 and 0.99 row/column subsampling. Prints the mean F1 over a
    shuffled stratified 5-fold split, refits on the full label-encoded training
    set and exports the decoded ``test_dataset`` predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)

    booster_params = dict(
        tree_method='hist',
        max_bin=255,
        n_estimators=400,
        learning_rate=0.1,
        objective='binary:logistic',
        gamma=0.01,
        reg_alpha=0.001,
        max_depth=10,
        subsample=0.99,
        colsample_bytree=0.99,
        colsample_bylevel=0.99,
    )
    clf = XGBClassifier(**booster_params)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    cv_result = cross_validate(
        clf, train_x, encoded_y, cv=folds, scoring='f1', n_jobs=6
    )
    print(np.mean(cv_result['test_score']))

    clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(clf.predict(test_dataset))
    generate_dataframe(decoded, 27)

In [95]:
def xgb28():
    """Cross-validate and fit a hist-method XGBClassifier without subsampling (submission 28).

    Configuration: 400 trees, depth 10, learning rate 0.1, gamma 0.01,
    reg_alpha 0.001. Prints the mean F1 over a shuffled stratified 5-fold
    split, refits on the full label-encoded training set and exports the
    decoded ``test_dataset`` predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)

    booster_params = dict(
        tree_method='hist',
        max_bin=255,
        n_estimators=400,
        learning_rate=0.1,
        objective='binary:logistic',
        gamma=0.01,
        reg_alpha=0.001,
        max_depth=10,
    )
    clf = XGBClassifier(**booster_params)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    cv_result = cross_validate(
        clf, train_x, encoded_y, cv=folds, scoring='f1', n_jobs=6
    )
    print(np.mean(cv_result['test_score']))

    clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(clf.predict(test_dataset))
    generate_dataframe(decoded, 28)

In [109]:
def xgb29():
    """Cross-validate and fit a hist-method XGBClassifier with reg_alpha=0.01 (submission 29).

    Configuration: 400 trees, depth 10, learning rate 0.1, gamma 0.01,
    reg_alpha 0.01 and 0.99 row/column subsampling. Prints the mean F1 over a
    shuffled stratified 5-fold split, refits on the full label-encoded training
    set and exports the decoded ``test_dataset`` predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)

    booster_params = dict(
        tree_method='hist',
        max_bin=255,
        n_estimators=400,
        learning_rate=0.1,
        objective='binary:logistic',
        gamma=0.01,
        reg_alpha=0.01,
        max_depth=10,
        subsample=0.99,
        colsample_bytree=0.99,
        colsample_bylevel=0.99,
    )
    clf = XGBClassifier(**booster_params)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    cv_result = cross_validate(
        clf, train_x, encoded_y, cv=folds, scoring='f1', n_jobs=6
    )
    print(np.mean(cv_result['test_score']))

    clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(clf.predict(test_dataset))
    generate_dataframe(decoded, 29)

In [13]:
def xgboost30():
    """Grid-search XGBoost settings, then fit a fixed hist-method configuration (submission 30).

    Label-encodes ``train_y``, runs a 5-fold F1 grid search over depth / tree
    count / learning rate and prints its best parameters and score, prints the
    mean F1 of the fixed classifier (tree_method='hist', learning_rate=0.2,
    max_depth=10, n_estimators=400) over a shuffled stratified 5-fold split,
    then fits that classifier on the full training set and exports the decoded
    ``test_dataset`` predictions.

    NOTE: the grid-search outcome is only printed; the exported model is the
    separately configured classifier, not ``best_estimator_``.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)

    final_clf = XGBClassifier(
        tree_method='hist',
        learning_rate=0.2,
        max_depth=10,
        n_estimators=400,
        objective='binary:logistic',
        nthread=4,
        seed=42,
    )

    search = GridSearchCV(
        estimator=XGBClassifier(objective='binary:logistic', nthread=4, seed=42),
        param_grid={
            'max_depth': [8, 10],
            'n_estimators': [1000, 1200],
            'learning_rate': [0.1, 0.2],
        },
        scoring='f1',
        n_jobs=6,
        cv=5,
        verbose=True,
    )
    search.fit(train_x, encoded_y)
    print(search.best_params_, search.best_score_)

    folds = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    cv_result = cross_validate(
        final_clf, train_x, encoded_y, scoring='f1', cv=folds, n_jobs=-1
    )
    print(np.mean(cv_result['test_score']))

    final_clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(final_clf.predict(test_dataset))
    generate_dataframe(decoded, 30)

In [19]:
def xgboost31():
    """Cross-validate and fit a hist-method XGBClassifier with 800 trees (submission 31).

    Configuration: learning_rate=0.2, max_depth=10, n_estimators=800, seed=42.
    Prints the mean F1 over a shuffled stratified 5-fold split, refits on the
    full label-encoded training set and exports the decoded ``test_dataset``
    predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)

    clf = XGBClassifier(
        tree_method='hist',
        learning_rate=0.2,
        max_depth=10,
        n_estimators=800,
        objective='binary:logistic',
        nthread=4,
        seed=42,
    )

    folds = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    cv_result = cross_validate(
        clf, train_x, encoded_y, scoring='f1', cv=folds, n_jobs=-1
    )
    print(np.mean(cv_result['test_score']))

    clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(clf.predict(test_dataset))
    generate_dataframe(decoded, 31)

In [23]:
def xgboost32():
    """Cross-validate and fit a hist-method XGBClassifier with 1500 trees (submission 32).

    Configuration: learning_rate=0.2, max_depth=10, n_estimators=1500, seed=42.
    Prints the mean F1 over a shuffled stratified 5-fold split, refits on the
    full label-encoded training set and exports the decoded ``test_dataset``
    predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)

    clf = XGBClassifier(
        tree_method='hist',
        learning_rate=0.2,
        max_depth=10,
        n_estimators=1500,
        objective='binary:logistic',
        nthread=4,
        seed=42,
    )

    folds = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    cv_result = cross_validate(
        clf, train_x, encoded_y, scoring='f1', cv=folds, n_jobs=-1
    )
    print(np.mean(cv_result['test_score']))

    clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(clf.predict(test_dataset))
    generate_dataframe(decoded, 32)

In [27]:
def xgboost33():
    """Cross-validate and fit a regularised hist-method XGBClassifier (submission 33).

    Configuration: 800 trees, depth 8, learning rate 0.2, gamma 0.65,
    reg_alpha 0.9, reg_lambda 0.89 and 0.99 row/column subsampling. Prints the
    mean F1 over a shuffled stratified 5-fold split, refits on the full
    label-encoded training set and exports the decoded ``test_dataset``
    predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)

    booster_params = dict(
        tree_method='hist',
        max_bin=255,
        n_estimators=800,
        learning_rate=0.2,
        objective='binary:logistic',
        gamma=0.65,
        reg_alpha=0.9,
        reg_lambda=0.89,
        subsample=0.99,
        colsample_bytree=0.99,
        max_depth=8,
        colsample_bylevel=0.99,
    )
    clf = XGBClassifier(**booster_params)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    cv_result = cross_validate(
        clf, train_x, encoded_y, cv=folds, scoring='f1', n_jobs=6
    )
    print(np.mean(cv_result['test_score']))

    clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(clf.predict(test_dataset))
    generate_dataframe(decoded, 33)

In [31]:
def xgb34():
    """Cross-validate and fit a hist-method XGBClassifier with default depth (submission 34).

    Configuration: 400 trees, learning rate 0.1, gamma 0.01. Prints the mean F1
    over a shuffled stratified 5-fold split, refits on the full label-encoded
    training set and exports the decoded ``test_dataset`` predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)

    clf = XGBClassifier(
        tree_method='hist',
        n_estimators=400,
        learning_rate=0.1,
        objective='binary:logistic',
        gamma=0.01,
    )

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    cv_result = cross_validate(
        clf, train_x, encoded_y, cv=folds, scoring='f1', n_jobs=6
    )
    print(np.mean(cv_result['test_score']))

    clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(clf.predict(test_dataset))
    generate_dataframe(decoded, 34)

In [35]:
def xgb35():
    """Cross-validate and fit a depth-10 hist-method XGBClassifier, gamma=0.01 (submission 35).

    Configuration: 400 trees, learning rate 0.1, gamma 0.01, max_depth 10.
    Prints the mean F1 over a shuffled stratified 5-fold split, refits on the
    full label-encoded training set and exports the decoded ``test_dataset``
    predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)

    clf = XGBClassifier(
        tree_method='hist',
        n_estimators=400,
        learning_rate=0.1,
        objective='binary:logistic',
        gamma=0.01,
        max_depth=10,
    )

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    cv_result = cross_validate(
        clf, train_x, encoded_y, cv=folds, scoring='f1', n_jobs=6
    )
    print(np.mean(cv_result['test_score']))

    clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(clf.predict(test_dataset))
    generate_dataframe(decoded, 35)

In [40]:
def xgb36():
    """Cross-validate and fit a depth-10 hist-method XGBClassifier, gamma=0.001 (submission 36).

    Configuration: 400 trees, learning rate 0.1, gamma 0.001, max_depth 10.
    Prints the mean F1 over a shuffled stratified 5-fold split, refits on the
    full label-encoded training set and exports the decoded ``test_dataset``
    predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)

    clf = XGBClassifier(
        tree_method='hist',
        n_estimators=400,
        learning_rate=0.1,
        objective='binary:logistic',
        gamma=0.001,
        max_depth=10,
    )

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    cv_result = cross_validate(
        clf, train_x, encoded_y, cv=folds, scoring='f1', n_jobs=6
    )
    print(np.mean(cv_result['test_score']))

    clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(clf.predict(test_dataset))
    generate_dataframe(decoded, 36)

In [44]:
def xgb37():
    """Cross-validate and fit a depth-8 hist-method XGBClassifier, gamma=0.01 (submission 37).

    Configuration: 400 trees, learning rate 0.1, gamma 0.01, max_depth 8.
    Prints the mean F1 over a shuffled stratified 5-fold split, refits on the
    full label-encoded training set and exports the decoded ``test_dataset``
    predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)

    clf = XGBClassifier(
        tree_method='hist',
        n_estimators=400,
        learning_rate=0.1,
        objective='binary:logistic',
        gamma=0.01,
        max_depth=8,
    )

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    cv_result = cross_validate(
        clf, train_x, encoded_y, cv=folds, scoring='f1', n_jobs=6
    )
    print(np.mean(cv_result['test_score']))

    clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(clf.predict(test_dataset))
    generate_dataframe(decoded, 37)

In [31]:
def xgboost42():
    """Cross-validate and fit a 1000-tree hist-method XGBClassifier, gamma=0.01 (submission 42).

    Configuration: learning_rate=0.1, max_depth=10, n_estimators=1000,
    gamma=0.01, seed=42. Prints the mean F1 over a shuffled stratified 5-fold
    split, refits on the full label-encoded training set and exports the
    decoded ``test_dataset`` predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)

    clf = XGBClassifier(
        learning_rate=0.1,
        max_depth=10,
        n_estimators=1000,
        objective='binary:logistic',
        nthread=4,
        seed=42,
        tree_method='hist',
        gamma=0.01,
    )

    folds = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    cv_result = cross_validate(
        clf, train_x, encoded_y, scoring='f1', cv=folds, n_jobs=-1
    )
    print(np.mean(cv_result['test_score']))

    clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(clf.predict(test_dataset))
    generate_dataframe(decoded, 42)

In [36]:
def xgboost43():
    """Cross-validate and fit a 1000-tree hist-method XGBClassifier, gamma=0.3 (submission 43).

    Configuration: learning_rate=0.1, max_depth=10, n_estimators=1000,
    gamma=0.3, seed=42. Prints the mean F1 over a shuffled stratified 5-fold
    split, refits on the full label-encoded training set and exports the
    decoded ``test_dataset`` predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)

    clf = XGBClassifier(
        learning_rate=0.1,
        max_depth=10,
        n_estimators=1000,
        objective='binary:logistic',
        nthread=4,
        seed=42,
        tree_method='hist',
        gamma=0.3,
    )

    folds = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    cv_result = cross_validate(
        clf, train_x, encoded_y, scoring='f1', cv=folds, n_jobs=-1
    )
    print(np.mean(cv_result['test_score']))

    clf.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(clf.predict(test_dataset))
    generate_dataframe(decoded, 43)

In [40]:
def stacking44():
    """Stack extra trees, gradient boosting and XGBoost under a regularised XGBoost (submission 44).

    Base learners: ``ExtraTreesClassifier(n_estimators=200)``,
    ``GradientBoostingClassifier(learning_rate=0.5)`` and a hist-method
    ``XGBClassifier`` (learning_rate=0.1, max_depth=8, n_estimators=1200).
    The final estimator is a hist-method ``XGBClassifier`` with 600 trees,
    depth 10, gamma 0.65, reg_alpha 0.9, reg_lambda 0.89 and 0.99 subsampling.
    Prints the mean F1 over a shuffled stratified 5-fold split, refits on the
    full label-encoded training set and exports the decoded ``test_dataset``
    predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)  # 0, 1

    base_learners = [
        ('extra_tree', ExtraTreesClassifier(n_estimators=200)),
        ('gradient_boosting', GradientBoostingClassifier(learning_rate=0.5)),
        (
            'xgboost',
            XGBClassifier(
                tree_method='hist',
                learning_rate=0.1,
                max_depth=8,
                n_estimators=1200,
                objective='binary:logistic',
                nthread=4,
                seed=42,
            ),
        ),
    ]
    meta_learner = XGBClassifier(
        tree_method='hist',
        max_bin=255,
        n_estimators=600,
        learning_rate=0.1,
        objective='binary:logistic',
        gamma=0.65,
        reg_alpha=0.9,
        reg_lambda=0.89,
        subsample=0.99,
        colsample_bytree=0.99,
        max_depth=10,
        colsample_bylevel=0.99,
    )
    stack = StackingClassifier(estimators=base_learners, final_estimator=meta_learner)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    cv_result = cross_validate(
        stack, train_x, encoded_y, cv=folds, scoring='f1', n_jobs=6
    )
    print(np.mean(cv_result['test_score']))

    stack.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(stack.predict(test_dataset))
    generate_dataframe(decoded, 44)
    

In [44]:
def stacking45():
    """Stack extra trees, gradient boosting and a regularised XGBoost under the same XGBoost (submission 45).

    Base learners: ``ExtraTreesClassifier(n_estimators=200)``,
    ``GradientBoostingClassifier(learning_rate=0.5)`` and a hist-method
    ``XGBClassifier`` with 600 trees, depth 10, gamma 0.65, reg_alpha 0.9,
    reg_lambda 0.89 and 0.99 subsampling. The final estimator is a second
    ``XGBClassifier`` built with the same settings. Prints the mean F1 over a
    shuffled stratified 5-fold split, refits on the full label-encoded training
    set and exports the decoded ``test_dataset`` predictions.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(train_y)  # 0, 1

    # Shared settings for the XGBoost base learner and the final estimator;
    # two separate estimator instances are built from them.
    regularised_xgb = dict(
        tree_method='hist',
        max_bin=255,
        n_estimators=600,
        learning_rate=0.1,
        objective='binary:logistic',
        gamma=0.65,
        reg_alpha=0.9,
        reg_lambda=0.89,
        subsample=0.99,
        colsample_bytree=0.99,
        max_depth=10,
        colsample_bylevel=0.99,
    )

    base_learners = [
        ('extra_tree', ExtraTreesClassifier(n_estimators=200)),
        ('gradient_boosting', GradientBoostingClassifier(learning_rate=0.5)),
        ('xgboost', XGBClassifier(**regularised_xgb)),
    ]
    meta_learner = XGBClassifier(**regularised_xgb)
    stack = StackingClassifier(estimators=base_learners, final_estimator=meta_learner)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    cv_result = cross_validate(
        stack, train_x, encoded_y, cv=folds, scoring='f1', n_jobs=6
    )
    print(np.mean(cv_result['test_score']))

    stack.fit(train_x, encoded_y)
    decoded = encoder.inverse_transform(stack.predict(test_dataset))
    generate_dataframe(decoded, 45)
    

In [16]:
# Entry-point guard: no experiment is run automatically; this cell only prints
# a reminder that the runner functions defined above must be called by hand.
if __name__ == '__main__':
    print("Run the above functions")

Run the above functions
