In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import optuna

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import mean_squared_error, log_loss

from sklearn.ensemble import RandomForestClassifier

import catboost
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost.callback import EarlyStopping
import lightgbm as lgbm

import os
import sys
import datetime

import wandb

import warnings; warnings.filterwarnings('ignore')

# Put '..' (relative to the working directory) on the import path so that the
# project-level `config` module can be imported from this notebook.
sys.path.append('..')
from config import CFG

# Rebind the name to an instance: every later cell reads settings as CFG.<NAME>.
CFG = CFG()

# Pandas display settings: thousands separator with two decimals, and the
# column/row display limits taken from the config object.
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.options.display.max_columns = CFG.NCOLS
pd.options.display.max_rows = CFG.NROWS


In [None]:
# Read the three CSV files found under CFG.RAW_DATA.
# 'Pulsar.csv' is loaded as-is; the 'id' column is dropped from train and test.
orig = pd.read_csv(os.path.join(CFG.RAW_DATA, 'Pulsar.csv'))
train = (
    pd.read_csv(os.path.join(CFG.RAW_DATA, 'train.csv'))
    .drop(columns=['id'])
)
test = (
    pd.read_csv(os.path.join(CFG.RAW_DATA, 'test.csv'))
    .drop(columns=['id'])
)

In [None]:
def f_eng(df):
    """Add the engineered feature columns to ``df`` and return it.

    The frame must contain the columns 'Skewness', 'EK', 'SD' and
    'SD_DMSNR_Curve'. Fifteen derived columns (powers, a cosine, products and
    ratios of those four) are appended in a fixed order.

    The frame is modified in place and the same object is returned, so
    ``train = f_eng(train)`` and a bare ``f_eng(train)`` are equivalent.
    Every derived column is computed from the four base columns only, so
    calling the function again on the same frame recomputes identical values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the four base columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the derived columns added.
    """
    # Vectorised column arithmetic instead of a Python-level lambda per row.
    df['Skewness_Power3'] = df['Skewness'] ** 3
    df['EK_Power3'] = df['EK'] ** 3
    df['cos(EK)'] = np.cos(df['EK'])
    df['SD_x_EK'] = df['SD'] * df['EK']
    df['cos(EK)_x_SD'] = df['cos(EK)'] * df['SD']
    df['SD_DMSNR_Curve_x_Skewness_Power3'] = df['SD_DMSNR_Curve'] * df['Skewness_Power3']
    # Where EK is 0 this is 0/0; pandas yields NaN rather than raising.
    df['EK_divide_EK_Power3'] = df['EK'] / df['EK_Power3']
    # Same product as 'SD_x_EK'; kept so the produced column set is unchanged.
    df['EK_multiply_SD'] = df['EK'] * df['SD']
    df['EK_divide_SD'] = df['EK'] / df['SD']
    df['EK_multiply_SD_DMSNR_Curve'] = df['EK'] * df['SD_DMSNR_Curve']
    df['SD_DMSNR_Curve_divide_SD'] = df['SD_DMSNR_Curve'] / df['SD']
    df['SD_multiply_EK_Power3'] = df['SD'] * df['EK_Power3']
    df['SD_DMSNR_Curve_multiply_SD'] = df['SD_DMSNR_Curve'] * df['SD']
    df['EK_Power3_multiply_SD_DMSNR_Curve'] = df['EK_Power3'] * df['SD_DMSNR_Curve']
    df['cos(EK)_multiply_SD_DMSNR_Curve'] = df['cos(EK)'] * df['SD_DMSNR_Curve']
    return df

In [None]:
# Add the engineered columns to all three frames, in the order train, test, orig.
train, test, orig = (f_eng(frame) for frame in (train, test, orig))

# Split the training frame into the feature matrix and the 'Class' target.
X = train.drop(columns='Class')
y = train['Class']

In [None]:
# Shared cross-validation splitter; fold count, repeat count and seed all come from CFG.
k_fold = RepeatedStratifiedKFold(
    n_splits=CFG.NFOLDS,
    n_repeats=CFG.REPEATS,
    random_state=CFG.SEED,
)

In [None]:
def cb_objective(trial):
    """Optuna objective for CatBoost: mean held-out log loss over ``k_fold``.

    Samples one hyper-parameter set from ``trial``, fits a GPU
    ``CatBoostClassifier`` on the training part of every split of the
    module-level ``X``/``y``, and scores each fitted model on the rows it did
    not see during fitting.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean of the per-fold validation log losses (lower is better).
    """
    PATIENCE = 100

    # Sampling order is kept stable: depth, learning_rate, l2_leaf_reg,
    # random_strength, max_bin, od_wait, grow_policy.
    params = {
        'depth': trial.suggest_int('depth', 2, 7),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 100),
        'random_strength': trial.suggest_loguniform('random_strength', 1, 100),
        'max_bin': trial.suggest_int('max_bin', 2, 255),
        # NOTE(review): `early_stopping_rounds` passed to fit() below presumably
        # takes precedence over this value - confirm against the CatBoost docs.
        'od_wait': trial.suggest_int('od_wait', 10, 100),
        'grow_policy': trial.suggest_categorical(
            'grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']
        ),
    }
    gpu_params = {'task_type': "GPU", 'devices': '0'}

    fold_losses = []
    for train_index, valid_index in k_fold.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        model = catboost.CatBoostClassifier(**params, **gpu_params)
        model.fit(
            X=X_train, y=y_train,
            eval_set=[(X_valid, y_valid)],
            early_stopping_rounds=PATIENCE,
            verbose=False,
        )

        # Score only the held-out rows. Predicting on all of X (as before)
        # scored each model mostly on rows it had been trained on, which
        # rewards over-fitting.
        valid_proba = model.predict_proba(X_valid)[:, 1]
        fold_losses.append(log_loss(y_valid, valid_proba))

    return float(np.mean(fold_losses))

# Study that the next cell optimises with cb_objective; lower values are better.
study = optuna.create_study(
    direction='minimize',
)

In [None]:
# Run 50 CatBoost trials on the existing study and keep the best parameter set.
study.optimize(
    cb_objective,
    n_trials=50,
)
CB_params = study.best_params
CB_params

In [None]:
def xb_objective(trial):
    """Optuna objective for XGBoost: mean held-out log loss over ``k_fold``.

    Samples one hyper-parameter set from ``trial``, fits a GPU
    ``XGBClassifier`` on the training part of every split of the module-level
    ``X``/``y`` with early stopping on the validation part, and scores each
    fitted model on its own validation rows.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean of the per-fold validation log losses (lower is better).
    """
    PATIENCE = CFG.XB_PATIENCE

    # Tuned parameters; the dict is built in the original sampling order.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000),
        'max_depth': trial.suggest_int('max_depth', 2, 7),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.4, 1.0, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.4, 1.0, 0.1),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
    }
    gpu_params = {'tree_method': "gpu_hist", 'gpu_id': 0}

    # Fixed (non-tuned) settings.
    xg_params = {
        'n_jobs': -1,
        'objective': 'binary:logistic',
        'verbosity': 0,
        'eval_metric': 'logloss',
        'random_state': CFG.SEED
    }

    fold_losses = []
    for train_index, valid_index in k_fold.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        model = XGBClassifier(**params, **gpu_params, **xg_params)
        model.fit(
            X=X_train, y=y_train,
            eval_set=[(X_valid, y_valid)],
            early_stopping_rounds=PATIENCE,
            verbose=False,
        )

        # Score the model on its own validation rows. The previous version
        # predicted on all of X and then compared that full-length vector with
        # the last fold's `y_valid`, whose length does not match.
        valid_proba = model.predict_proba(X_valid)[:, 1]
        fold_losses.append(log_loss(y_valid, valid_proba))

    return float(np.mean(fold_losses))

In [None]:
# Fresh study for the XGBoost search: 50 trials, keep the best parameter set.
study = optuna.create_study(
    direction='minimize',
)
study.optimize(
    xb_objective,
    n_trials=50,
)
xgb_params = study.best_params
xgb_params

In [None]:
def lb_objective(trial):
    """Optuna objective for LightGBM: mean held-out log loss over 5x3 repeated CV.

    Samples one hyper-parameter set from ``trial``, fits an ``LGBMClassifier``
    on the training part of every split of the module-level ``X``/``y`` with
    early stopping on the validation part, and scores each fitted model on its
    own validation rows.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean of the per-fold validation log losses (lower is better).
    """
    PATIENCE = CFG.XB_PATIENCE

    # Tuned parameters; the dict is built in the original sampling order.
    # NOTE(review): LightGBM reportedly applies `subsample` only when
    # `subsample_freq` is non-zero, so this search dimension may be inert - confirm.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000),
        'max_depth': trial.suggest_int('max_depth', 2, 7),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.4, 1.0, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.4, 1.0, 0.1),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
    }

    # Fixed (non-tuned) settings. 'binary_logloss' is LightGBM's documented
    # name for the binary log-loss metric; plain 'logloss' is not a listed alias.
    lg_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'random_state': CFG.SEED
    }

    # NOTE(review): this splitter is hard-coded to 5 folds x 3 repeats instead
    # of reusing CFG.NFOLDS / CFG.REPEATS like the other objectives - confirm
    # that this is intentional.
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=CFG.SEED)

    fold_losses = []
    for train_index, valid_index in cv.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        model = lgbm.LGBMClassifier(**params, **lg_params)
        model.fit(
            X=X_train, y=y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='logloss',
            early_stopping_rounds=PATIENCE,
            verbose=False,
        )

        # Each fold validates on different rows, so the loss is computed per
        # fold and the losses are averaged. The previous version averaged the
        # prediction vectors of different folds element-wise and scored the
        # result against the last fold's labels only.
        valid_proba = model.predict_proba(X_valid)[:, 1]
        fold_losses.append(log_loss(y_valid, valid_proba))

    return float(np.mean(fold_losses))

In [None]:
# Fresh study for the LightGBM search: 50 trials, keep the best parameter set.
study = optuna.create_study(
    direction='minimize',
)
study.optimize(
    lb_objective,
    n_trials=50,
)
lb_params = study.best_params
lb_params

In [None]:
def rf(X_tr, y_tr, X_val, y_val, params, r_params):
    """Fit one ``curfc`` model and return its log loss on the validation data.

    NOTE(review): ``curfc`` is not imported in the visible cells; presumably it
    is cuML's RandomForestClassifier - confirm that an import exists.

    Parameters
    ----------
    X_tr, y_tr
        Features and labels the model is fitted on.
    X_val, y_val
        Features and labels the fitted model is scored on.
    params : dict
        Tuned keyword arguments for the ``curfc`` constructor.
    r_params : dict
        Fixed keyword arguments for the ``curfc`` constructor.

    Returns
    -------
    float
        ``log_loss(y_val, model.predict_proba(X_val))``.
    """
    # The former `cX` / `cy` locals were dropped: they copied the global X / y
    # on every call, ignored this function's arguments, and were never read.
    model = curfc(**params, **r_params)
    model.fit(X=X_tr, y=y_tr)

    # All probability columns are passed on to log_loss, not just one class.
    preds = model.predict_proba(X_val)
    return log_loss(y_val, preds)

def rf_obj(trial, X_tr, y_tr, X_val, y_val):
    """Sample random-forest hyper-parameters and return the loss for one split.

    The Optuna parameter names match the keyword names handed to the model, so
    ``study.best_params`` uses the same keys as the ``params`` dict built here.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    X_tr, y_tr
        Training features and labels for this split.
    X_val, y_val
        Validation features and labels for this split.

    Returns
    -------
    float
        Validation log loss as computed by ``rf``.
    """
    params = {
        'n_bins': trial.suggest_int('n_bins', 8, 256),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
        # Previously registered with Optuna as 'max_leaf_nodes' while being
        # passed to the model as `max_leaves`; the two names now agree.
        'max_leaves': trial.suggest_int('max_leaves', 2, 30),
        'min_impurity_decrease': trial.suggest_uniform('min_impurity_decrease', 0.0, 0.5),
        'max_samples': trial.suggest_uniform('max_samples', 0.1, 1.0),
    }
    r_params = {
        'random_state': CFG.SEED,
    }

    # `rf` returns a loss value, not predictions.
    loss = rf(X_tr, y_tr, X_val, y_val, params, r_params)
    return loss

def rf_objective(trial, X, y):
    """Return the mean of ``rf_obj`` losses over every split of ``k_fold``.

    Each split's pandas slices are converted to cudf objects before being
    passed to ``rf_obj``.

    NOTE(review): ``cudf`` is not imported in the visible cells - confirm that
    an import exists.
    """
    fold_losses = []
    for fit_idx, holdout_idx in k_fold.split(X, y):
        gpu_X_fit = cudf.DataFrame.from_pandas(X.iloc[fit_idx])
        gpu_X_holdout = cudf.DataFrame.from_pandas(X.iloc[holdout_idx])
        gpu_y_fit = cudf.Series(y.iloc[fit_idx].values)
        gpu_y_holdout = cudf.Series(y.iloc[holdout_idx].values)

        fold_losses.append(
            rf_obj(trial, gpu_X_fit, gpu_y_fit, gpu_X_holdout, gpu_y_holdout)
        )
    return np.mean(fold_losses)

In [None]:
# Fresh study for the random-forest search: 2 trials, keep the best parameter set.
study = optuna.create_study(
    direction='minimize',
)
study.optimize(
    lambda t: rf_objective(t, X, y),
    n_trials=2,
)
rf_params = study.best_params
rf_params