In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import tensorflow as tf

import os
import sys
import datetime

import warnings; warnings.filterwarnings('ignore')

sys.path.append('..')
from config import CFG
from src.model.include_concat import NN
CFG = CFG()
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 100)

In [3]:
from colorama import Style, Fore
# ANSI prefixes (bright + colour) for console output; `res` resets the styling.
red, blu, mgt, grn, gld = (
    Style.BRIGHT + tint
    for tint in (Fore.RED, Fore.BLUE, Fore.MAGENTA, Fore.GREEN, Fore.YELLOW)
)
res = Style.RESET_ALL

# Matplotlib rc overrides handed to seaborn: cream background, black axes/ticks,
# faint grid (the grid colour is an 8-digit hex, i.e. it carries its own alpha).
rc = {
    "axes.facecolor": "#FFFEF8",
    "figure.facecolor": "#FFFEF8",
    "axes.edgecolor": "#000000",
    "grid.color": "#EBEBE730",
    "font.family": "serif",
    "axes.labelcolor": "#000000",
    "xtick.color": "#000000",
    "ytick.color": "#000000",
    "grid.alpha": 0.4,
}
sns.set(rc=rc)

# Colour cycle available to the plots in this notebook.
palette = [
    '#302c36', '#037d97', '#E4591E', '#C09741',
    '#EC5B6D', '#90A6B1', '#6ca957', '#D8E3E2',
]

In [4]:
# Read the three raw CSVs from CFG.RAW_DATA; train/test lose their `id` column.
orig = pd.read_csv(os.path.join(CFG.RAW_DATA, 'Pulsar.csv'))
train, test = (
    pd.read_csv(os.path.join(CFG.RAW_DATA, csv_name)).drop(columns='id')
    for csv_name in ('train.csv', 'test.csv')
)

In [5]:
# Instantiate the project's NN class (imported from src.model.include_concat) with the training frame.
# NOTE(review): what NN does with the frame is not visible in this notebook — confirm in the module.
ensemble = NN(train)

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import tensorflow as tf

import os
import sys
import datetime

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import log_loss

import warnings; warnings.filterwarnings('ignore')

sys.path.append('..')
from config import CFG
from src.model.include_concat import NN
CFG = CFG()
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 100)

In [7]:
# Callbacks shared by the fit() calls in this notebook.
callbacks = [
    # Stop once val_loss has not improved for 20 epochs and roll back to the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=20, mode='min', restore_best_weights=True, verbose=2
    ),
    # Halve the learning rate after 10 epochs without val_loss improvement.
    # `restore_best_weights` is an EarlyStopping option, not a ReduceLROnPlateau
    # one, so it is no longer passed here.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=10, mode='min', min_lr=1e-12, verbose=2
    )
]

In [8]:
def log_(column):
    """Shift a numeric column so its minimum becomes 1, then take the natural log.

    Parameters
    ----------
    column : pandas.Series or numpy.ndarray
        Numeric values; NaNs are allowed and stay NaN in the result.

    Returns
    -------
    Same container type as ``column`` holding ``log(column - min(column) + 1)``.
    """
    # np.nanmin ignores NaNs. The builtin min() is order-dependent with NaN
    # (a leading NaN "wins" and turns the whole column into NaN) and iterates
    # element by element in Python.
    col_min = np.nanmin(np.asarray(column, dtype=float))
    return np.log(-col_min + 1 + column)

In [9]:
# Load the raw CSVs; train/test lose their `id` column.
orig = pd.read_csv(os.path.join(CFG.RAW_DATA, 'Pulsar.csv'))
train = pd.read_csv(os.path.join(CFG.RAW_DATA, 'train.csv')).drop(columns='id')
test = pd.read_csv(os.path.join(CFG.RAW_DATA, 'test.csv')).drop(columns='id')

# Log-transform the two skewness features. The shift is fitted on train only and
# re-used for test, so a given raw value maps to the same feature value in both
# frames (shifting each frame by its own minimum does not guarantee that).
skew_cols = ['Skewness', 'Skewness_DMSNR_Curve']
skew_min = train[skew_cols].min()
train[skew_cols] = np.log(train[skew_cols] - skew_min + 1)
# Test values below the train minimum are clipped so the log argument stays >= 1.
test[skew_cols] = np.log((test[skew_cols] - skew_min).clip(lower=0) + 1)

In [10]:
# Callbacks shared by the fit() calls in this notebook.
callbacks = [
    # Stop once val_loss has not improved for 20 epochs and roll back to the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=20, mode='min', restore_best_weights=True, verbose=2
    ),
    # Halve the learning rate after 10 epochs without val_loss improvement.
    # `restore_best_weights` is an EarlyStopping option, not a ReduceLROnPlateau
    # one, so it is no longer passed here.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=10, mode='min', min_lr=1e-12, verbose=2
    )
]

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import tensorflow as tf
import wandb

import os
import sys
import datetime

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import log_loss

import warnings; warnings.filterwarnings('ignore')

sys.path.append('..')
from config import CFG
from src.model.include_concat import NN
CFG = CFG()
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 100)

In [12]:
# Repeated stratified K-fold training of the NN model, tracked in Weights & Biases.

# Features/target are derived here as pandas objects: the fold loop indexes them
# with `.iloc`, and no earlier cell defines them in that form.
X = train.drop('Class', axis=1)
y = train['Class']

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED)

params = {
    'folds': CFG.NFOLDS,
    'repeats': CFG.REPEATS,
    'batch_size': CFG.BATCH_SIZE,
    'learning_rate': CFG.LR,
}
wandb.init(project='S3E10', group='NNtest', name='NNtest', config=params)

train_log_loss = []
oof_log_loss = []
models = []
for train_idx, val_idx in k_fold.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fresh model per fold. Re-using one instance keeps training the same weights
    # on rows that later act as validation data, and `models` would end up
    # holding the same object once per fold.
    ensemble = NN(train)
    ensemble.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=CFG.LR),
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )

    history = ensemble.fit(
        X_train, y_train,
        batch_size=CFG.BATCH_SIZE,
        epochs=5, callbacks=callbacks,
        validation_data=(X_val, y_val)
    )

    # In-fold and out-of-fold log-loss for this split.
    train_preds = ensemble.predict(X_train)
    train_loss = log_loss(y_train, train_preds)
    train_log_loss.append(train_loss)

    oof_preds = ensemble.predict(X_val)
    oof_loss = log_loss(y_val, oof_preds)
    oof_log_loss.append(oof_loss)

    models.append(ensemble)
wandb.finish()

In [13]:
# Keep features/target as pandas objects: the cross-validation loops in this
# notebook slice them with `.iloc`, which NumPy arrays (`.values`) do not offer.
X = train.drop('Class', axis=1)
y = train['Class']

In [14]:
# Imported here because this cell runs before the notebook's import cell that
# brings WandbMetricsLogger into scope.
from wandb.keras import WandbMetricsLogger

# Callbacks shared by the fit() calls in this notebook.
callbacks = [
    # Stop once val_loss has not improved for 20 epochs and roll back to the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=20, mode='min', restore_best_weights=True, verbose=2
    ),
    # Halve the learning rate after 10 epochs without val_loss improvement.
    # `restore_best_weights` is an EarlyStopping option, not a ReduceLROnPlateau
    # one, so it is no longer passed here.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=10, mode='min', min_lr=1e-12, verbose=2
    ),
    # Streams the Keras training metrics to the active W&B run.
    WandbMetricsLogger()
]

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import tensorflow as tf
import wandb
from wandb.keras import WandbMetricsLogger

import os
import sys
import datetime

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import log_loss

import warnings; warnings.filterwarnings('ignore')

sys.path.append('..')
from config import CFG
from src.model.include_concat import NN
CFG = CFG()
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 100)

In [16]:
from colorama import Style, Fore
# ANSI prefixes (bright + colour) for console output; `res` resets the styling.
red, blu, mgt, grn, gld = (
    Style.BRIGHT + tint
    for tint in (Fore.RED, Fore.BLUE, Fore.MAGENTA, Fore.GREEN, Fore.YELLOW)
)
res = Style.RESET_ALL

# Matplotlib rc overrides handed to seaborn: cream background, black axes/ticks,
# faint grid (the grid colour is an 8-digit hex, i.e. it carries its own alpha).
rc = {
    "axes.facecolor": "#FFFEF8",
    "figure.facecolor": "#FFFEF8",
    "axes.edgecolor": "#000000",
    "grid.color": "#EBEBE730",
    "font.family": "serif",
    "axes.labelcolor": "#000000",
    "xtick.color": "#000000",
    "ytick.color": "#000000",
    "grid.alpha": 0.4,
}
sns.set(rc=rc)

# Colour cycle available to the plots in this notebook.
palette = [
    '#302c36', '#037d97', '#E4591E', '#C09741',
    '#EC5B6D', '#90A6B1', '#6ca957', '#D8E3E2',
]

In [17]:
def log_(column):
    """Shift a numeric column so its minimum becomes 1, then take the natural log.

    Parameters
    ----------
    column : pandas.Series or numpy.ndarray
        Numeric values; NaNs are allowed and stay NaN in the result.

    Returns
    -------
    Same container type as ``column`` holding ``log(column - min(column) + 1)``.
    """
    # np.nanmin ignores NaNs. The builtin min() is order-dependent with NaN
    # (a leading NaN "wins" and turns the whole column into NaN) and iterates
    # element by element in Python.
    col_min = np.nanmin(np.asarray(column, dtype=float))
    return np.log(-col_min + 1 + column)

In [18]:
# Load the raw CSVs; train/test lose their `id` column.
orig = pd.read_csv(os.path.join(CFG.RAW_DATA, 'Pulsar.csv'))
train = pd.read_csv(os.path.join(CFG.RAW_DATA, 'train.csv')).drop(columns='id')
test = pd.read_csv(os.path.join(CFG.RAW_DATA, 'test.csv')).drop(columns='id')

# Log-transform the two skewness features. The shift is fitted on train only and
# re-used for test, so a given raw value maps to the same feature value in both
# frames (shifting each frame by its own minimum does not guarantee that).
skew_cols = ['Skewness', 'Skewness_DMSNR_Curve']
skew_min = train[skew_cols].min()
train[skew_cols] = np.log(train[skew_cols] - skew_min + 1)
# Test values below the train minimum are clipped so the log argument stays >= 1.
test[skew_cols] = np.log((test[skew_cols] - skew_min).clip(lower=0) + 1)

In [19]:
# Keep features/target as pandas objects: the cross-validation loops in this
# notebook slice them with `.iloc`, which NumPy arrays (`.values`) do not offer.
X = train.drop('Class', axis=1)
y = train['Class']

In [20]:
# Callbacks shared by the fit() calls in this notebook.
callbacks = [
    # Stop once val_loss has not improved for 20 epochs and roll back to the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=20, mode='min', restore_best_weights=True, verbose=2
    ),
    # Halve the learning rate after 10 epochs without val_loss improvement.
    # `restore_best_weights` is an EarlyStopping option, not a ReduceLROnPlateau
    # one, so it is no longer passed here.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=10, mode='min', min_lr=1e-12, verbose=2
    ),
    # Streams the Keras training metrics to the active W&B run.
    WandbMetricsLogger()
]

In [21]:
# Repeated stratified K-fold training of the NN model, tracked in Weights & Biases.

# Features/target are derived here as pandas objects: the fold loop indexes them
# with `.iloc`, which the NumPy arrays produced by `.values` do not support.
X = train.drop('Class', axis=1)
y = train['Class']

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED)

params = {
    'folds': CFG.NFOLDS,
    'repeats': CFG.REPEATS,
    'batch_size': CFG.BATCH_SIZE,
    'learning_rate': CFG.LR,
}
wandb.init(project='S3E10', group='NNtest', name='NNtest', config=params)

train_log_loss = []
oof_log_loss = []
models = []
for train_idx, val_idx in k_fold.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fresh model per fold. Re-using one instance keeps training the same weights
    # on rows that later act as validation data, and `models` would end up
    # holding the same object once per fold.
    ensemble = NN(train)
    ensemble.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=CFG.LR),
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )

    history = ensemble.fit(
        X_train, y_train,
        batch_size=CFG.BATCH_SIZE,
        epochs=5, callbacks=callbacks,
        validation_data=(X_val, y_val)
    )

    # In-fold and out-of-fold log-loss for this split.
    train_preds = ensemble.predict(X_train)
    train_loss = log_loss(y_train, train_preds)
    train_log_loss.append(train_loss)

    oof_preds = ensemble.predict(X_val)
    oof_loss = log_loss(y_val, oof_preds)
    oof_log_loss.append(oof_loss)

    models.append(ensemble)
wandb.finish()

In [22]:
# Feature frame (every column but the target) and the target series.
X, y = train.drop('Class', axis=1), train['Class']

In [23]:
# Repeated stratified K-fold training of the NN model, tracked in Weights & Biases.
k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED)

params = {
    'folds': CFG.NFOLDS,
    'repeats': CFG.REPEATS,
    'batch_size': CFG.BATCH_SIZE,
    'learning_rate': CFG.LR,
}
wandb.init(project='S3E10', group='NNtest', name='NNtest', config=params)

train_log_loss = []
oof_log_loss = []
models = []
for train_idx, val_idx in k_fold.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fresh model per fold. Re-using one instance keeps training the same weights
    # on rows that later act as validation data, and `models` would end up
    # holding the same object once per fold.
    ensemble = NN(train)
    ensemble.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=CFG.LR),
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )

    history = ensemble.fit(
        X_train, y_train,
        batch_size=CFG.BATCH_SIZE,
        epochs=5, callbacks=callbacks,
        validation_data=(X_val, y_val)
    )

    # In-fold and out-of-fold log-loss for this split.
    train_preds = ensemble.predict(X_train)
    train_loss = log_loss(y_train, train_preds)
    train_log_loss.append(train_loss)

    oof_preds = ensemble.predict(X_val)
    oof_loss = log_loss(y_val, oof_preds)
    oof_log_loss.append(oof_loss)

    models.append(ensemble)
wandb.finish()

In [24]:
# Repeated stratified K-fold training of the NN model, tracked in Weights & Biases.
k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED)

params = {
    'folds': CFG.NFOLDS,
    'repeats': CFG.REPEATS,
    'batch_size': CFG.BATCH_SIZE,
    'learning_rate': CFG.LR,
}
wandb.init(project='S3E10', group='NNtest', name='NNtest', config=params)

train_log_loss = []
oof_log_loss = []
models = []
for train_idx, val_idx in k_fold.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fresh model per fold. Re-using one instance keeps training the same weights
    # on rows that later act as validation data, and `models` would end up
    # holding the same object once per fold.
    ensemble = NN(test)
    ensemble.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=CFG.LR),
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )

    history = ensemble.fit(
        X_train, y_train,
        batch_size=CFG.BATCH_SIZE,
        epochs=5, callbacks=callbacks,
        validation_data=(X_val, y_val)
    )

    # In-fold and out-of-fold log-loss for this split.
    train_preds = ensemble.predict(X_train)
    train_loss = log_loss(y_train, train_preds)
    train_log_loss.append(train_loss)

    oof_preds = ensemble.predict(X_val)
    oof_loss = log_loss(y_val, oof_preds)
    oof_log_loss.append(oof_loss)

    models.append(ensemble)
wandb.finish()

In [25]:
# Full-length repeated stratified K-fold training run, tracked in Weights & Biases.
k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED)

params = {
    'folds': CFG.NFOLDS,
    'repeats': CFG.REPEATS,
    'batch_size': CFG.BATCH_SIZE,
    'learning_rate': CFG.LR,
}
wandb.init(project='S3E10', group='NN', name='NN', config=params)

train_log_loss = []
oof_log_loss = []
models = []
for train_idx, val_idx in k_fold.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fresh model per fold. Re-using one instance keeps training the same weights
    # on rows that later act as validation data, and `models` would end up
    # holding the same object once per fold.
    ensemble = NN(test)
    ensemble.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=CFG.LR),
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )

    # `use_multiprocessing` only affects generator / keras.utils.Sequence inputs,
    # so it is not passed for in-memory data.
    history = ensemble.fit(
        X_train, y_train,
        batch_size=CFG.BATCH_SIZE,
        epochs=CFG.EPOCHS, callbacks=callbacks,
        validation_data=(X_val, y_val)
    )

    # In-fold and out-of-fold log-loss for this split.
    train_preds = ensemble.predict(X_train)
    train_loss = log_loss(y_train, train_preds)
    train_log_loss.append(train_loss)

    oof_preds = ensemble.predict(X_val)
    oof_loss = log_loss(y_val, oof_preds)
    oof_log_loss.append(oof_loss)

    models.append(ensemble)
wandb.finish()

In [26]:
def make_dataset(X_data, y_data, n_splits):
    """Build a tf.data.Dataset that yields one K-fold split per element.

    Parameters
    ----------
    X_data : array-like
        Feature matrix (pandas or NumPy).
    y_data : array-like
        Target values aligned with ``X_data``.
    n_splits : int
        Number of folds handed to ``sklearn.model_selection.KFold``.

    Returns
    -------
    tf.data.Dataset
        Each element is the tuple ``(X_train, y_train, X_test, y_test)`` as
        float64 tensors.
    """
    # Local import: the notebook's own KFold import sits in a later cell.
    from sklearn.model_selection import KFold

    # Work on NumPy copies so the positional fold indices apply to pandas input
    # too (a DataFrame indexed with `[...]` would look the indices up as labels).
    X_arr = np.asarray(X_data, dtype=np.float64)
    y_arr = np.asarray(y_data, dtype=np.float64)

    def gen():
        for train_index, test_index in KFold(n_splits).split(X_arr):
            X_train, X_test = X_arr[train_index], X_arr[test_index]
            y_train, y_test = y_arr[train_index], y_arr[test_index]
            yield X_train, y_train, X_test, y_test

    return tf.data.Dataset.from_generator(gen, (tf.float64, tf.float64, tf.float64, tf.float64))

dataset = make_dataset(X, y, 10)

In [27]:
# Display the dataset's repr (element_spec) without iterating it.
dataset

<FlatMapDataset element_spec=(TensorSpec(shape=<unknown>, dtype=tf.float64, name=None), TensorSpec(shape=<unknown>, dtype=tf.float64, name=None), TensorSpec(shape=<unknown>, dtype=tf.float64, name=None), TensorSpec(shape=<unknown>, dtype=tf.float64, name=None))>

In [28]:
# Each element is already the complete (X_train, y_train, X_test, y_test) tuple,
# not an iterator, so `next()` does not apply; print the per-part shapes instead.
for num, item in enumerate(dataset):
    print(num, [part.shape for part in item])

In [29]:
from sklearn.model_selection import KFold

In [30]:
# Each element is already the complete (X_train, y_train, X_test, y_test) tuple,
# not an iterator, so `next()` does not apply; print the per-part shapes instead.
for num, item in enumerate(dataset):
    print(num, [part.shape for part in item])

In [31]:
# Print every fold tuple produced by the dataset together with its running index.
for num, item in enumerate(dataset):
    print(num, item)

In [32]:
# One fixed shuffle of the (features, label) rows. The shards taken from `ds`
# each iterate it separately; with the default reshuffle_each_iteration=True
# every shard would see a different permutation, so the shards would overlap.
ds = tf.data.Dataset.from_tensor_slices((X, y)).shuffle(
    len(y)*2, seed=CFG.SEED, reshuffle_each_iteration=False
)

In [33]:
# shard() needs both the shard count and the index of the shard to keep.
ds.shard(5, 0)

In [34]:
# Five shards of `ds`; shard i keeps every 5th element starting at offset i.
ds1, ds2, ds3, ds4, ds5 = (ds.shard(5, shard_idx) for shard_idx in range(5))

In [35]:
from itertools import combinations

# All 5-element combinations of the five shards (exactly one tuple).
arr = [ds1, ds2, ds3, ds4, ds5]
print(list(combinations(arr, 5)))  # closing parenthesis was missing

In [36]:
from itertools import combinations

# All 5-element combinations of the five shards (exactly one tuple).
arr = [ds1, ds2, ds3, ds4, ds5]
all_five = list(combinations(arr, 5))
print(all_five)

In [37]:
from itertools import combinations

# Same experiment with plain names instead of datasets, for readable output.
arr = [f'ds{shard_no}' for shard_no in range(1, 6)]
print([*combinations(arr, 5)])

In [38]:
from itertools import permutations

# Every ordering of the five shard names (5! = 120 tuples).
arr = [f'ds{shard_no}' for shard_no in range(1, 6)]
print([*permutations(arr, 5)])

In [39]:
arr = [ds1, ds2, ds3, ds4, ds5]

# Peek at one training element of the first fold.
# `fold` / `train_fold` are used instead of `ds` / `train` so the full dataset
# and the `train` DataFrame are not overwritten by this cell.
for fold in arr:
    val = fold
    dslist = [d for d in arr if d is not fold]
    train_fold = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    # An element is a (features, label) tuple, so unpack it before reading shapes.
    features, label = next(iter(train_fold))
    print(features.shape, label.shape)
    print(features, label)
    break

In [40]:
arr = [ds1, ds2, ds3, ds4, ds5]

# Peek at one training element of the first fold.
# `fold` / `train_fold` are used instead of `ds` / `train` so the full dataset
# and the `train` DataFrame are not overwritten by this cell.
for fold in arr:
    val = fold
    dslist = [d for d in arr if d is not fold]
    train_fold = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    # An element is a (features, label) tuple, so unpack it before reading shapes.
    features, label = next(iter(train_fold))
    print(features, label)
    print(features.shape, label.shape)
    break

In [41]:
arr = [ds1, ds2, ds3, ds4, ds5]

# Peek at the first fold's training dataset and one of its elements.
# `fold` / `train_fold` are used instead of `ds` / `train` so the full dataset
# and the `train` DataFrame are not overwritten by this cell.
for fold in arr:
    val = fold
    dslist = [d for d in arr if d is not fold]
    train_fold = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    print(train_fold)
    # An element is a (features, label) tuple, so unpack it before reading shapes.
    features, label = next(iter(train_fold))
    print(features, label)
    print(features.shape, label.shape)
    break

In [42]:
arr = [ds1, ds2, ds3, ds4, ds5]

# Print the first training element of every fold.
# `fold` / `train_fold` are used instead of `ds` / `train` so the full dataset
# and the `train` DataFrame are not overwritten by this cell.
for fold in arr:
    val = fold
    dslist = [d for d in arr if d is not fold]
    train_fold = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    first_element = next(iter(train_fold))
    print(first_element)

In [43]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# The loop variable is `fold`, not `ds`: rebinding `ds` to a single shard would
# make the next run of this cell shard an already-sharded dataset.
for fold in arr:
    val_ds = fold.batch(64)
    dslist = [d for d in arr if d is not fold]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # One pass over the batches, then join them into a feature and a label array.
    # The labels go to `y_fold` so the notebook-level `y` is left untouched.
    x_batches, y_batches = zip(*train_ds)
    x = np.concatenate(x_batches)
    y_fold = np.concatenate(y_batches)

    print(x.shape)
    break

In [44]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# The loop variable is `fold`, not `ds`: rebinding `ds` to a single shard would
# make the next run of this cell shard an already-sharded dataset.
for fold in arr:
    val_ds = fold.batch(64)
    dslist = [d for d in arr if d is not fold]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # A generator of (x, y) pairs cannot be unpacked into two names; transpose
    # the batches with zip and join each side. Labels go to `y_fold` so the
    # notebook-level `y` is left untouched.
    x_batches, y_batches = zip(*train_ds)
    x = np.concatenate(x_batches)
    y_fold = np.concatenate(y_batches)

    print(x.shape)
    break

In [45]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# The loop variable is `fold`, not `ds`: rebinding `ds` to a single shard would
# make the next run of this cell shard an already-sharded dataset.
for fold in arr:
    val_ds = fold.batch(64)
    dslist = [d for d in arr if d is not fold]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # take(-1) keeps every batch. Transpose the batches with zip and join each
    # side; labels go to `y_fold` so the notebook-level `y` is left untouched.
    x_batches, y_batches = zip(*train_ds.take(-1))
    x = np.concatenate(x_batches)
    y_fold = np.concatenate(y_batches)

    print(x.shape)
    break

In [46]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# The loop variable is `fold`, not `ds`: rebinding `ds` to a single shard would
# make the next run of this cell shard an already-sharded dataset.
for fold in arr:
    val_ds = fold.batch(64)
    dslist = [d for d in arr if d is not fold]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # take(1) yields a single (features, labels) batch; pull it out directly.
    # Labels go to `y_fold` so the notebook-level `y` is left untouched.
    x, y_fold = next(iter(train_ds.take(1)))

    print(x.numpy().shape)
    break

In [47]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# The loop variable is `fold`, not `ds`: rebinding `ds` to a single shard would
# make the next run of this cell shard an already-sharded dataset.
for fold in arr:
    val_ds = fold.batch(64)
    dslist = [d for d in arr if d is not fold]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # np.concatenate needs a sequence of arrays, not a generator of pairs:
    # split features from labels first, then join each side. Labels go to
    # `y_fold` so the notebook-level `y` is left untouched.
    x_batches, y_batches = zip(*train_ds)
    x = np.concatenate(x_batches)
    y_fold = np.concatenate(y_batches)

    print(x.shape)
    break

In [48]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# The loop variable is `fold`, not `ds`: rebinding `ds` to a single shard would
# make the next run of this cell shard an already-sharded dataset.
for fold in arr:
    val_ds = fold.batch(64)
    dslist = [d for d in arr if d is not fold]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # Feature and label batches have different ranks, so they cannot be
    # concatenated as (x, y) pairs; join each side separately. Labels go to
    # `y_fold` so the notebook-level `y` is left untouched.
    x_batches, y_batches = zip(*train_ds)
    x = np.concatenate(x_batches)
    y_fold = np.concatenate(y_batches)

    print(x.shape)
    break

In [49]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# The loop variable is `fold`, not `ds`: rebinding `ds` to a single shard would
# make the next run of this cell shard an already-sharded dataset.
for fold in arr:
    val_ds = fold.batch(64)
    dslist = [d for d in arr if d is not fold]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # Join the feature batches into one array.
    x = np.concatenate([x_batch for x_batch, _ in train_ds])

    # `x` is already a NumPy array, so its shape is read directly.
    print(x.shape)
    break

In [50]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# The loop variable is `fold`, not `ds`: rebinding `ds` to a single shard would
# make the next run of this cell shard an already-sharded dataset.
for fold in arr:
    val_ds = fold.batch(64)
    dslist = [d for d in arr if d is not fold]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # Join the feature batches of the first fold's training split into one array.
    x = np.concatenate([x_batch for x_batch, _ in train_ds])

    print(x.shape)
    break

In [51]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# The loop variable is `fold`, not `ds`: rebinding `ds` to a single shard would
# make the next run of this cell shard an already-sharded dataset.
for fold in arr:
    val_ds = fold.batch(64)
    dslist = [d for d in arr if d is not fold]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # Join the feature batches along the row axis.
    x = np.concatenate([x_batch for x_batch, _ in train_ds], axis=0)

    print(x.shape)
    break

In [52]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# The loop variable is `fold`, not `ds`: rebinding `ds` to a single shard would
# make the next run of this cell shard an already-sharded dataset.
for fold in arr:
    val_ds = fold.batch(64)
    dslist = [d for d in arr if d is not fold]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # take(-1) keeps every batch; join their features along the row axis.
    x = np.concatenate([x_batch for x_batch, _ in train_ds.take(-1)], axis=0)

    print(x.shape)
    break

In [53]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# The loop variable is `fold`, not `ds`: rebinding `ds` to a single shard would
# make the next run of this cell shard an already-sharded dataset.
for fold in arr:
    val_ds = fold.batch(64)
    dslist = [d for d in arr if d is not fold]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # take(1) keeps only the first batch; the result is that batch's features.
    x = np.concatenate([x_batch for x_batch, _ in train_ds.take(1)], axis=0)

    print(x.shape)
    break

In [54]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# The loop variable is `fold`, not `ds`: rebinding `ds` to a single shard would
# make the next run of this cell shard an already-sharded dataset.
for fold in arr:
    val_ds = fold.batch(64)
    dslist = [d for d in arr if d is not fold]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # take(1) keeps only the first batch; the result is that batch's features.
    x = np.concatenate([x_batch for x_batch, _ in train_ds.take(1)])

    print(x.shape)
    break

In [55]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# The loop variable is `fold`, not `ds`: rebinding `ds` to a single shard would
# make the next run of this cell shard an already-sharded dataset.
for fold in arr:
    val_ds = fold.batch(64)
    dslist = [d for d in arr if d is not fold]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # Join the feature batches of the first fold's training split into one array.
    x = np.concatenate([x_batch for x_batch, _ in train_ds])

    print(x.shape)
    break

In [56]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# Hold out the last shard for validation. It is bound to `val_fold` rather than
# `ds`, so re-running this cell still shards the full dataset.
val_fold = arr[-1]
val_ds = val_fold.batch(64)
dslist = [d for d in arr if d is not val_fold]
train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
train_ds = train_ds.shuffle(len(y)*2).batch(128)

# Join the feature batches of the training split into one array.
x = np.concatenate([x_batch for x_batch, _ in train_ds])

print(x.shape)

In [57]:
# Display the repr (element_spec) of the first shard.
arr[0]

<ShardDataset element_spec=(TensorSpec(shape=(8,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [58]:
# Display the repr (element_spec) of the validation dataset.
val_ds

<BatchDataset element_spec=(TensorSpec(shape=(None, 8), dtype=tf.float64, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [59]:
# take(1) returns another dataset (limited to one element); nothing is evaluated yet.
val_ds.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(None, 8), dtype=tf.float64, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [60]:
# A Dataset has no .numpy(); iterate it as NumPy values and pull the first element.
next(val_ds.take(1).as_numpy_iterator())

In [61]:
# A Dataset has no .to_numpy(); iterate it as NumPy values and pull the first element.
next(val_ds.take(1).as_numpy_iterator())

In [62]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# Hold out the last shard for validation. It is bound to `val_fold` rather than
# `ds`, so re-running this cell still shards the full dataset.
val_fold = arr[-1]
val_ds = val_fold.batch(64)
dslist = [d for d in arr if d is not val_fold]
train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
train_ds = train_ds.shuffle(len(y)*2).batch(128)

# Transpose the (features, labels) batches into two tuples.
data, labels = tuple(zip(*train_ds))

# The final batch may be shorter than 128 rows, so the batches are joined along
# the row axis instead of being stacked into one rectangular array.
x = np.concatenate(data)

print(x.shape)

In [63]:
# One fixed shuffle of the (features, label) rows. The shards taken from `ds`
# each iterate it separately; with the default reshuffle_each_iteration=True
# every shard would see a different permutation, so the shards would overlap.
ds = tf.data.Dataset.from_tensor_slices((X, y)).shuffle(
    len(y)*2, seed=CFG.SEED, reshuffle_each_iteration=False
)

In [64]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# Hold out the last shard for validation. It is bound to `val_fold` rather than
# `ds`, so re-running this cell still shards the full dataset.
val_fold = arr[-1]
val_ds = val_fold.batch(64)
dslist = [d for d in arr if d is not val_fold]
train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
train_ds = train_ds.shuffle(len(y)*2).batch(128)

# Transpose the (features, labels) batches into two tuples.
data, labels = tuple(zip(*train_ds))

# The final batch may be shorter than 128 rows, so the batches are joined along
# the row axis instead of being stacked into one rectangular array.
x = np.concatenate(data)

print(x.shape)

In [65]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# Hold out the last shard for validation. It is bound to `val_fold` rather than
# `ds`, so re-running this cell still shards the full dataset.
val_fold = arr[-1]
val_ds = val_fold.batch(64)
dslist = [d for d in arr if d is not val_fold]
train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
train_ds = train_ds.shuffle(len(y)*2).batch(128)

# Transpose the validation (features, labels) batches into two tuples.
data, labels = tuple(zip(*val_ds))

# The final batch may be shorter than 64 rows, so the batches are joined along
# the row axis instead of being stacked into one rectangular array.
x = np.concatenate(data)

print(x.shape)

In [66]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# Hold out the last shard (unbatched) for validation. It is bound to `val_fold`
# rather than `ds`, so re-running this cell still shards the full dataset.
val_fold = arr[-1]
val_ds = val_fold
dslist = [d for d in arr if d is not val_fold]
train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
train_ds = train_ds.shuffle(len(y)*2).batch(128)
# Transpose the per-row (features, label) elements of the validation shard.
data, labels = tuple(zip(*val_ds))

# Rows all have the same length here, so they stack into a 2-D array.
x = np.array(data)

print(x.shape)

In [67]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# Hold out the last shard (unbatched) for validation. It is bound to `val_fold`
# rather than `ds`, so re-running this cell still shards the full dataset.
val_fold = arr[-1]
val_ds = val_fold
dslist = [d for d in arr if d is not val_fold]
train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
train_ds = train_ds.shuffle(len(y)*2).batch(128)
# Transpose the training (features, labels) batches into two tuples.
data, labels = tuple(zip(*train_ds))

# The final batch may be shorter than 128 rows, so the batches are joined along
# the row axis instead of being stacked into one rectangular array.
x = np.concatenate(data)

print(x.shape)

In [68]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# Hold out the last shard (unbatched) for validation. It is bound to `val_fold`
# rather than `ds`, so re-running this cell still shards the full dataset.
val_fold = arr[-1]
val_ds = val_fold
dslist = [d for d in arr if d is not val_fold]
train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
# Unbatched on purpose: every element is a single (features, label) row.
train_ds = train_ds.shuffle(len(y)*2)
data, labels = tuple(zip(*train_ds))

# Rows all have the same length here, so they stack into a 2-D array.
x = np.array(data)

print(x.shape)

In [69]:
# One fixed shuffle of the (features, label) rows. The shards taken from `ds`
# each iterate it separately; with the default reshuffle_each_iteration=True
# every shard would see a different permutation, so the shards would overlap.
ds = tf.data.Dataset.from_tensor_slices((X, y)).shuffle(
    len(y)*2, seed=CFG.SEED, reshuffle_each_iteration=False
)

In [70]:
# Re-create the five fold shards from the full dataset bound to `ds`.
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# Hold out the last shard (unbatched) for validation. It is bound to `val_fold`
# rather than `ds`, so re-running this cell still shards the full dataset.
val_fold = arr[-1]
val_ds = val_fold
dslist = [d for d in arr if d is not val_fold]
train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
# Unbatched on purpose: every element is a single (features, label) row.
train_ds = train_ds.shuffle(len(y)*2)
data, labels = tuple(zip(*train_ds))

x = np.array(data)
# Stored as `y_train`, not `y`: overwriting `y` with one fold's labels would
# leave it out of sync with `X` for every later cell.
y_train = np.array(labels)

print(x.shape)
print(y_train.shape)

In [71]:
# 5-fold training over tf.data shards (W&B run creation is disabled in this cell).

# Not used by the shard-based loop below; kept so the name stays defined.
k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED)

params = {
    'folds': CFG.NFOLDS,
    'repeats': CFG.REPEATS,
    'batch_size': CFG.BATCH_SIZE,
    'learning_rate': CFG.LR,
}
#wandb.init(project='S3E10', group='NN', name='NN', config=params)

train_log_loss = []
oof_log_loss = []
models = []

# Shuffle once, in a fixed order. With the default reshuffle_each_iteration=True
# each shard would see its own fresh permutation, so the five "folds" would
# overlap and validation rows would leak into training.
ds = tf.data.Dataset.from_tensor_slices((X, y)).shuffle(
    len(y)*2, seed=CFG.SEED, reshuffle_each_iteration=False
)
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# `fold_ds` (not `ds`) is the loop variable so the full dataset stays bound to `ds`.
for fold_ds in arr:
    val_ds = fold_ds.batch(64)
    dslist = [d for d in arr if d is not fold_ds]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # Fresh model per fold. Re-using one instance keeps training the same weights
    # on rows that later act as validation data, and `models` would end up
    # holding the same object once per fold.
    ensemble = NN(test)
    ensemble.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=CFG.LR),
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )

    # `use_multiprocessing` only affects generator / keras.utils.Sequence inputs,
    # so it is not passed for a tf.data pipeline.
    history = ensemble.fit(
        train_ds,
        epochs=CFG.EPOCHS, callbacks=callbacks,
        validation_data=val_ds
    )

    # TODO: per-fold log-loss is still disabled for the tf.data pipeline; the
    # names below (y_train, X_val, y_val) belong to the array-based loop.
    # train_preds = ensemble.predict(train_ds)
    # train_loss = log_loss(y_train, train_preds)
    # train_log_loss.append(train_loss)

    # oof_preds = ensemble.predict(X_val)
    # oof_loss = log_loss(y_val, oof_preds)
    # oof_log_loss.append(oof_loss)

    models.append(ensemble)
wandb.finish()

In [72]:
# Feature frame (every column but the target) and the target series.
X, y = train.drop('Class', axis=1), train['Class']

In [73]:
# Load the raw CSVs; train/test lose their `id` column.
orig = pd.read_csv(os.path.join(CFG.RAW_DATA, 'Pulsar.csv'))
train = pd.read_csv(os.path.join(CFG.RAW_DATA, 'train.csv')).drop(columns='id')
test = pd.read_csv(os.path.join(CFG.RAW_DATA, 'test.csv')).drop(columns='id')

# Log-transform the two skewness features. The shift is fitted on train only and
# re-used for test, so a given raw value maps to the same feature value in both
# frames (shifting each frame by its own minimum does not guarantee that).
skew_cols = ['Skewness', 'Skewness_DMSNR_Curve']
skew_min = train[skew_cols].min()
train[skew_cols] = np.log(train[skew_cols] - skew_min + 1)
# Test values below the train minimum are clipped so the log argument stays >= 1.
test[skew_cols] = np.log((test[skew_cols] - skew_min).clip(lower=0) + 1)

In [74]:
# Feature frame (every column but the target) and the target series.
X, y = train.drop('Class', axis=1), train['Class']

In [75]:
# Callbacks for the fit() calls below (W&B metric logging is switched off here).
callbacks = [
    # Stop once val_loss has not improved for 20 epochs and roll back to the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=20, mode='min', restore_best_weights=True, verbose=2
    ),
    # Halve the learning rate after 10 epochs without val_loss improvement.
    # `restore_best_weights` is an EarlyStopping option, not a ReduceLROnPlateau
    # one, so it is no longer passed here.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=10, mode='min', min_lr=1e-12, verbose=2
    ),
    #WandbMetricsLogger()
]

In [76]:
# 5-fold training over tf.data shards (W&B run creation is disabled in this cell).

# Not used by the shard-based loop below; kept so the name stays defined.
k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED)

params = {
    'folds': CFG.NFOLDS,
    'repeats': CFG.REPEATS,
    'batch_size': CFG.BATCH_SIZE,
    'learning_rate': CFG.LR,
}
#wandb.init(project='S3E10', group='NN', name='NN', config=params)

train_log_loss = []
oof_log_loss = []
models = []

# Shuffle once, in a fixed order. With the default reshuffle_each_iteration=True
# each shard would see its own fresh permutation, so the five "folds" would
# overlap and validation rows would leak into training.
ds = tf.data.Dataset.from_tensor_slices((X, y)).shuffle(
    len(y)*2, seed=CFG.SEED, reshuffle_each_iteration=False
)
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# `fold_ds` (not `ds`) is the loop variable so the full dataset stays bound to `ds`.
for fold_ds in arr:
    val_ds = fold_ds.batch(64)
    dslist = [d for d in arr if d is not fold_ds]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # Fresh model per fold. Re-using one instance keeps training the same weights
    # on rows that later act as validation data, and `models` would end up
    # holding the same object once per fold.
    ensemble = NN(test)
    ensemble.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=CFG.LR),
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )

    # `use_multiprocessing` only affects generator / keras.utils.Sequence inputs,
    # so it is not passed for a tf.data pipeline.
    history = ensemble.fit(
        train_ds,
        epochs=CFG.EPOCHS, callbacks=callbacks,
        validation_data=val_ds
    )

    # TODO: per-fold log-loss is still disabled for the tf.data pipeline; the
    # names below (y_train, X_val, y_val) belong to the array-based loop.
    # train_preds = ensemble.predict(train_ds)
    # train_loss = log_loss(y_train, train_preds)
    # train_log_loss.append(train_loss)

    # oof_preds = ensemble.predict(X_val)
    # oof_loss = log_loss(y_val, oof_preds)
    # oof_log_loss.append(oof_loss)

    models.append(ensemble)
wandb.finish()

In [77]:
# Callbacks shared by the fit() calls in this notebook.
callbacks = [
    # Stop once val_loss has not improved for 20 epochs and roll back to the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=20, mode='min', restore_best_weights=True, verbose=2
    ),
    # Halve the learning rate after 10 epochs without val_loss improvement.
    # `restore_best_weights` is an EarlyStopping option, not a ReduceLROnPlateau
    # one, so it is no longer passed here.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=10, mode='min', min_lr=1e-12, verbose=2
    ),
    # Streams the Keras training metrics to the active W&B run.
    WandbMetricsLogger()
]

In [78]:
# 5-fold training over tf.data shards, tracked in Weights & Biases (test run).

# Not used by the shard-based loop below; kept so the name stays defined.
k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED)

params = {
    'folds': CFG.NFOLDS,
    'repeats': CFG.REPEATS,
    'batch_size': CFG.BATCH_SIZE,
    'learning_rate': CFG.LR,
}
wandb.init(project='S3E10', group='NNtest', name='NNtest', config=params)

train_log_loss = []
oof_log_loss = []
models = []

# Shuffle once, in a fixed order. With the default reshuffle_each_iteration=True
# each shard would see its own fresh permutation, so the five "folds" would
# overlap and validation rows would leak into training.
ds = tf.data.Dataset.from_tensor_slices((X, y)).shuffle(
    len(y)*2, seed=CFG.SEED, reshuffle_each_iteration=False
)
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# `fold_ds` (not `ds`) is the loop variable so the full dataset stays bound to `ds`.
for fold_ds in arr:
    # Validation data must be batched too: an unbatched dataset would feed
    # single rows without a batch dimension to the model.
    val_ds = fold_ds.batch(64)
    dslist = [d for d in arr if d is not fold_ds]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # Fresh model per fold. Re-using one instance keeps training the same weights
    # on rows that later act as validation data, and `models` would end up
    # holding the same object once per fold.
    ensemble = NN(test)
    ensemble.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=CFG.LR),
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )

    # `use_multiprocessing` only affects generator / keras.utils.Sequence inputs,
    # so it is not passed for a tf.data pipeline.
    history = ensemble.fit(
        train_ds,
        epochs=CFG.EPOCHS, callbacks=callbacks,
        validation_data=val_ds
    )

    # TODO: per-fold log-loss is still disabled for the tf.data pipeline; the
    # names below (y_train, X_val, y_val) belong to the array-based loop.
    # train_preds = ensemble.predict(train_ds)
    # train_loss = log_loss(y_train, train_preds)
    # train_log_loss.append(train_loss)

    # oof_preds = ensemble.predict(X_val)
    # oof_loss = log_loss(y_val, oof_preds)
    # oof_log_loss.append(oof_loss)

    models.append(ensemble)
wandb.finish()

In [79]:
# 5-fold training over tf.data shards, tracked in Weights & Biases (test run).

# Not used by the shard-based loop below; kept so the name stays defined.
k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED)

params = {
    'folds': CFG.NFOLDS,
    'repeats': CFG.REPEATS,
    'batch_size': CFG.BATCH_SIZE,
    'learning_rate': CFG.LR,
}
wandb.init(project='S3E10', group='NNtest', name='NNtest', config=params)

train_log_loss = []
oof_log_loss = []
models = []

# Shuffle once, in a fixed order. With the default reshuffle_each_iteration=True
# each shard would see its own fresh permutation, so the five "folds" would
# overlap and validation rows would leak into training.
ds = tf.data.Dataset.from_tensor_slices((X, y)).shuffle(
    len(y)*2, seed=CFG.SEED, reshuffle_each_iteration=False
)
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# `fold_ds` (not `ds`) is the loop variable so the full dataset stays bound to `ds`.
for fold_ds in arr:
    val_ds = fold_ds.batch(64)
    dslist = [d for d in arr if d is not fold_ds]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # Fresh model per fold. Re-using one instance keeps training the same weights
    # on rows that later act as validation data, and `models` would end up
    # holding the same object once per fold.
    ensemble = NN(test)
    ensemble.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=CFG.LR),
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )

    # `use_multiprocessing` only affects generator / keras.utils.Sequence inputs,
    # so it is not passed for a tf.data pipeline.
    history = ensemble.fit(
        train_ds,
        epochs=CFG.EPOCHS, callbacks=callbacks,
        validation_data=val_ds
    )

    # TODO: per-fold log-loss is still disabled for the tf.data pipeline; the
    # names below (y_train, X_val, y_val) belong to the array-based loop.
    # train_preds = ensemble.predict(train_ds)
    # train_loss = log_loss(y_train, train_preds)
    # train_log_loss.append(train_loss)

    # oof_preds = ensemble.predict(X_val)
    # oof_loss = log_loss(y_val, oof_preds)
    # oof_log_loss.append(oof_loss)

    models.append(ensemble)
wandb.finish()

VBox(children=(Label(value='0.043 MB of 0.047 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.911670…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666887461663767, max=1.0)…

In [80]:
# 5-fold training over tf.data shards, tracked in Weights & Biases (run tagged 'CV5').

# Not used by the shard-based loop below; kept so the name stays defined.
k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED)

params = {
    'folds': CFG.NFOLDS,
    'repeats': CFG.REPEATS,
    'batch_size': CFG.BATCH_SIZE,
    'learning_rate': CFG.LR,
}
wandb.init(project='S3E10', group='NN', name='NN', config=params, tags=['CV5'])

train_log_loss = []
oof_log_loss = []
models = []

# Shuffle once, in a fixed order. With the default reshuffle_each_iteration=True
# each shard would see its own fresh permutation, so the five "folds" would
# overlap and validation rows would leak into training.
ds = tf.data.Dataset.from_tensor_slices((X, y)).shuffle(
    len(y)*2, seed=CFG.SEED, reshuffle_each_iteration=False
)
ds1 = ds.shard(5, 0)
ds2 = ds.shard(5, 1)
ds3 = ds.shard(5, 2)
ds4 = ds.shard(5, 3)
ds5 = ds.shard(5, 4)

arr = [ds1, ds2, ds3, ds4, ds5]

# `fold_ds` (not `ds`) is the loop variable so the full dataset stays bound to `ds`.
for fold_ds in arr:
    val_ds = fold_ds.batch(64)
    dslist = [d for d in arr if d is not fold_ds]
    train_ds = dslist[0].concatenate(dslist[1]).concatenate(dslist[2]).concatenate(dslist[3])
    train_ds = train_ds.shuffle(len(y)*2).batch(128)

    # Fresh model per fold. Re-using one instance keeps training the same weights
    # on rows that later act as validation data, and `models` would end up
    # holding the same object once per fold.
    ensemble = NN(test)
    ensemble.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=CFG.LR),
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )

    # `use_multiprocessing` only affects generator / keras.utils.Sequence inputs,
    # so it is not passed for a tf.data pipeline.
    history = ensemble.fit(
        train_ds,
        epochs=CFG.EPOCHS, callbacks=callbacks,
        validation_data=val_ds
    )

    # TODO: per-fold log-loss is still disabled for the tf.data pipeline; the
    # names below (y_train, X_val, y_val) belong to the array-based loop.
    # train_preds = ensemble.predict(train_ds)
    # train_loss = log_loss(y_train, train_preds)
    # train_log_loss.append(train_loss)

    # oof_preds = ensemble.predict(X_val)
    # oof_loss = log_loss(y_val, oof_preds)
    # oof_log_loss.append(oof_loss)

    models.append(ensemble)
wandb.finish()