In [1]:
%load_ext autoreload
%autoreload 2

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import tensorflow as tf
import wandb
from wandb.keras import WandbMetricsLogger

import os
import sys
import datetime

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import log_loss

import warnings; warnings.filterwarnings('ignore')

sys.path.append('..')
from config import CFG
from src.model.include_concat import NN
CFG = CFG()
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 100)

In [16]:
from colorama import Style, Fore
red = Style.BRIGHT + Fore.RED
blu = Style.BRIGHT + Fore.BLUE
mgt = Style.BRIGHT + Fore.MAGENTA
grn = Style.BRIGHT + Fore.GREEN
gld = Style.BRIGHT + Fore.YELLOW
res = Style.RESET_ALL

rc = {
    "axes.facecolor": "#FFFEF8",
    "figure.facecolor": "#FFFEF8",
    "axes.edgecolor": "#000000",
    "grid.color": "#EBEBE7" + "30",
    "font.family": "serif",
    "axes.labelcolor": "#000000",
    "xtick.color": "#000000",
    "ytick.color": "#000000",
    "grid.alpha": 0.4
}
sns.set(rc=rc)
palette = ['#302c36', '#037d97', '#E4591E', '#C09741',
           '#EC5B6D', '#90A6B1', '#6ca957', '#D8E3E2']

In [17]:
def log_(column):
    return np.log(-min(column) + 1 + column)

In [73]:
orig = pd.read_csv(os.path.join(CFG.RAW_DATA, 'Pulsar.csv'))
train = pd.read_csv(os.path.join(CFG.RAW_DATA, 'train.csv')).drop(columns='id')
test = pd.read_csv(os.path.join(CFG.RAW_DATA, 'test.csv')).drop(columns='id')

train[['Skewness', 'Skewness_DMSNR_Curve']] = train.apply({'Skewness': log_, 'Skewness_DMSNR_Curve': log_})
test[['Skewness', 'Skewness_DMSNR_Curve']] = test.apply({'Skewness': log_, 'Skewness_DMSNR_Curve': log_})

In [74]:
X = train.drop('Class', axis=1)
y = train['Class']

In [77]:
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=20, mode='min', restore_best_weights=True, verbose=2
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=5, mode='min', restore_best_weights=True, min_lr=1e-12, verbose=2
    ),
    WandbMetricsLogger()
]

In [70]:
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

ds = arr[-1]
val_ds = ds
dslist = [d for d in arr if d is not ds]
train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
train_ds = train_ds.shuffle(len(y)*2)
data, labels = tuple(zip(*train_ds))

x = np.array(data)
y = np.array(labels)

print(x.shape)
print(y.shape)

(94052, 8)
(94052,)


In [82]:

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED)

params = {
    'folds': CFG.NFOLDS,
    'repeats': CFG.REPEATS,
    'batch_size': CFG.BATCH_SIZE,
    'learning_rate': CFG.LR,
}
wandb.init(project='S3E10', group='NN', name='NN', config=params, tags=['CV5'])

train_log_loss = []
oof_log_loss = []
models = []

ds = tf.data.Dataset.from_tensor_slices((X, y)).shuffle(len(y)*2)
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

for ds in arr:
    ensemble = NN(test)
    ensemble.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=CFG.LR),
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )
    
    val_ds = ds.batch(64)
    dslist = [d for d in arr if d is not ds]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    history = ensemble.fit(
        train_ds,
        epochs=CFG.EPOCHS, callbacks=callbacks, 
        validation_data=val_ds, 
        use_multiprocessing=True
    )

    train_preds = ensemble.predict(train_ds)
    # train_loss = log_loss(y_train, train_preds)
    train_log_loss.append(train_preds)

    oof_preds = ensemble.predict(val_ds)
    # oof_loss = log_loss(y_val, oof_preds)
    oof_log_loss.append(oof_preds)

    models.append(ensemble)
wandb.finish()


VBox(children=(Label(value='0.041 MB of 0.041 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch/binary_accuracy,▁▆▆▆▆▇▇▇▇▇▇███
epoch/epoch,▁▂▂▃▃▄▄▅▅▆▆▇▇█
epoch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/loss,█▃▃▃▃▂▂▂▂▂▂▁▁▁
epoch/lr,▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/val_binary_accuracy,▂▁▃▃▂▄▄▂▃▄▆▅█▅
epoch/val_loss,▇▇▆▆█▅▄▇▅▅▃▇▁▄

0,1
epoch/binary_accuracy,0.98974
epoch/epoch,13.0
epoch/learning_rate,0.001
epoch/loss,0.03748
epoch/lr,0.001
epoch/val_binary_accuracy,0.98962
epoch/val_loss,0.0383


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669073916758256, max=1.0…

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 24: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 19: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 20/50
Epoch 21/50
Epoch 22/50


0,1
epoch/binary_accuracy,▁▃▆▇▇▇▆█▇▄▇▆▇▇▇▇█▁▆▇▆▇▇▇█▇▄▆▆▇▇▆██▂▅▆▆▆▆
epoch/epoch,▁▂▂▃▄▅▆▇█▁▂▃▄▅▅▆▇▁▂▃▄▅▅▆▇█▂▂▃▄▅▆▇▇▁▂▃▄▅▅
epoch/learning_rate,████▃▃▃▃▃███▃▃▃▁▁███████▃▃█████▃▃▁████▃▁
epoch/loss,█▆▄▂▂▃▃▁▂▅▃▃▂▂▂▂▁█▃▂▃▂▂▃▂▁▅▃▃▃▂▂▂▁▇▄▃▃▃▂
epoch/lr,█████▃▃▃▃███▃▃▃▃▁████████▃█████▃▃▁████▃▃
epoch/val_binary_accuracy,▂▁▆▅▅▆▇▆▇▄▅▅▇█▆▆▇▃▅▅▇▃▇▆▆▆▃▆▄▇▅▆▆▇▃▅▄▆▆▆
epoch/val_loss,▇█▃▄▃▃▂▃▂▇▄▄▂▁▂▃▃▇▃▃▂▆▃▄▃▄▆▅▄▃▃▃▃▃▇▄▅▃▃▃

0,1
epoch/binary_accuracy,0.98975
epoch/epoch,31.0
epoch/learning_rate,0.00025
epoch/loss,0.03533
epoch/lr,0.0005
epoch/val_binary_accuracy,0.99086
epoch/val_loss,0.03335


In [84]:
preds = np.sum([model.predict(test) / 5 for model in models])



In [88]:
preds = sum(preds)

In [89]:
submission = pd.read_csv(os.path.join(CFG.DATA_PATH, 'sample_submission.csv'))
submission['Class'] = preds
submission.to_csv(
    os.path.join(CFG.BASE_PATH, 'submissions', f'NN_{datetime.datetime.strftime(datetime.datetime.now(), "%m-%d_%H-%M")}.csv'),
    index=False)