# Imports

In [None]:
import os
import math
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from random import shuffle
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import Sequence

# Notebook-wide pandas display settings: show more rows/columns, never
# truncate cell text, and print floats with three decimals.
display_options = {
    'display.max_rows': 100,
    'display.max_columns': 500,
    'display.max_colwidth': None,
    'display.float_format': lambda x: '%.3f' % x,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Dataset folder name and its full location; every CSV and .npy path used
# below is built from ``data_path``.
file_name = 'seti_breakthrough_listen_et'
data_path = os.path.join('~/../../media/sf_data/', file_name)

In [None]:
# Name of the label column in the training labels table.
TARGET = 'target'

In [None]:
def get_train_file_path(image_id):
    """Build the path of the training ``.npy`` file for ``image_id``.

    The file is looked up under ``<data_path>/train/`` in a sub-directory
    named after the first character of the id.
    """
    shard_dir = f"{data_path}/train/{image_id[0]}"
    return f"{shard_dir}/{image_id}.npy"

def get_test_file_path(image_id):
    """Build the path of the test ``.npy`` file for ``image_id``.

    The file is looked up under ``<data_path>/test/`` in a sub-directory
    named after the first character of the id.
    """
    shard_dir = f"{data_path}/test/{image_id[0]}"
    return f"{shard_dir}/{image_id}.npy"

In [None]:
# Load the label tables and attach the on-disk location of each sample's
# ``.npy`` file in a 'file_path' column.
#
# ``data_path`` may start with '~'. The code that later opens 'file_path'
# (np.load) does not expand '~' itself, so it is expanded here with
# os.path.expanduser. This replaces the earlier trick of splitting on the
# literal '~/../..', which only produced a valid path when the home
# directory sat exactly two levels below the filesystem root.
# NOTE(review): pandas is expected to expand '~' for read_csv, so the CSVs
# and the .npy files resolve against the same directory - confirm.
train = pd.read_csv(data_path + '/train_labels.csv')
train['file_path'] = train['id'].apply(get_train_file_path).map(os.path.expanduser)

# The sample-submission file lists the ids of the test samples.
test = pd.read_csv(data_path + '/sample_submission.csv')
test['file_path'] = test['id'].apply(get_test_file_path).map(os.path.expanduser)

In [None]:
# Hold out 33% of the labelled rows for validation. The seed is fixed so the
# split - and therefore the rows picked up by any "first N rows" subset taken
# from it - is the same every time the notebook is run.
SPLIT_SEED = 42
train_df, validation_df = train_test_split(train, test_size=0.33, random_state=SPLIT_SEED)

In [None]:
# Number of files (rows) read from each split; every file contributes one
# example per slice along its first axis.
train_size = 10000
validation_size = 1000

# Shape each slice is reshaped to (a leading axis of length 1 is added so the
# slices can be concatenated along axis 0 afterwards).
CHANNEL_SHAPE = (273, 256)


def _load_channel_examples(frame, limit):
    """Load the first ``limit`` files listed in ``frame`` as per-channel examples.

    Each array loaded from ``frame['file_path']`` is iterated along its first
    axis; every slice becomes one example and receives the label of the row
    it came from.

    Args:
        frame: DataFrame with a ``'file_path'`` column and a ``TARGET`` column.
        limit: Maximum number of rows (files) to read, taken from the top of
            ``frame``.

    Returns:
        ``(examples, labels)``: two lists of equal length. Each example has
        shape ``(1, *CHANNEL_SHAPE)`` and each label is a length-1 array.
    """
    examples = []
    labels = []
    subset = frame.head(limit)
    for file_path, label in zip(subset['file_path'], subset[TARGET].to_numpy()):
        # The label is the same for every slice of a file, so build it once.
        label_row = np.reshape(label, (1,))
        for channel in np.load(file_path):
            examples.append(channel.reshape(1, *CHANNEL_SHAPE))
            labels.append(label_row)
    return examples, labels


train_examples, train_labels = _load_channel_examples(train_df, train_size)
validation_examples, validation_labels = _load_channel_examples(validation_df, validation_size)

In [None]:
# Join the per-slice pieces into single arrays. np.concatenate joins along
# the first axis by default, so the length-1 leading axes add up to the
# number of examples.
train_examples = np.concatenate(train_examples)
train_labels = np.concatenate(train_labels)

# Same for the validation split.
validation_examples = np.concatenate(validation_examples)
validation_labels = np.concatenate(validation_labels)

In [None]:
# Notebook output: shape of the concatenated training examples.
train_examples.shape

In [None]:
# Notebook output: shape of the concatenated training labels; its length
# should equal the first dimension of train_examples.
train_labels.shape

In [None]:
# Wrap the arrays as tf.data datasets; slicing along the first axis yields
# one (example, label) element per row.
make_dataset = tf.data.Dataset.from_tensor_slices
train_dataset = make_dataset((train_examples, train_labels))
validation_dataset = make_dataset((validation_examples, validation_labels))

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 64            # elements per batch
SHUFFLE_BUFFER_SIZE = 100  # size of the buffer shuffle() samples from
EPOCHS = 20                # upper bound on training epochs

# Only the training stream is shuffled; validation keeps its original order
# so predictions line up with validation_labels.
train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE)
validation_dataset = validation_dataset.batch(BATCH_SIZE)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Conv2D, MaxPool2D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import AUC

# Baseline classifier: flatten each (273, 256) input, pass it through one
# hidden dense layer and emit a single sigmoid probability.
model = Sequential()
# model.add(Conv2D(filters=32, kernel_size=(4,4), activation='relu'))
model.add(Flatten(input_shape=(273, 256)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# The AUC metric is named explicitly. Left unnamed, Keras assigns an
# automatic name ('auc', 'auc_1', 'auc_2', ...) that depends on how many AUC
# objects were created earlier in the session, while the rest of this file
# refers to the fixed keys 'auc_2' / 'val_auc_2'. Pinning the name keeps
# those references valid from a fresh kernel and after re-running this cell.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='auc_2')])
model.summary()

In [None]:
# Stop training once validation AUC has not improved for 2 consecutive epochs
# and restore the weights of the best epoch.
# AUC is a higher-is-better metric, so the direction is given explicitly:
# with the default mode='auto' Keras infers it from the monitor name, and a
# suffixed name such as 'val_auc_2' may be treated as a quantity to minimise,
# which would stop on improvement and restore the worst weights.
# NOTE(review): 'val_auc_2' must match the name of the AUC metric the model
# was compiled with - confirm against the keys of model.history.history.
callbacks = EarlyStopping(
    monitor='val_auc_2',
    mode='max',
    patience=2,
    restore_best_weights=True,
)
model.fit(train_dataset, epochs=EPOCHS, validation_data=validation_dataset, callbacks=[callbacks])

In [None]:
# Per-epoch values recorded during fit: one column per logged quantity, one
# row per epoch.
history_by_metric = model.history.history
losses = pd.DataFrame(history_by_metric)
losses

In [None]:
# Plot training vs. validation AUC per epoch. The columns are selected by
# substring rather than by exact name because Keras may append a numeric
# suffix to an unnamed metric ('auc', 'auc_1', 'auc_2', ...); hard-coding one
# suffix raises KeyError whenever the session produced a different one.
losses.filter(like='auc').plot()

In [None]:
# Plot training vs. validation loss per epoch.
loss_columns = ['loss', 'val_loss']
losses[loss_columns].plot()

In [None]:
# Predicted probabilities for the validation set, flattened to 1-D so they
# can be compared element-wise with the labels.
raw_preds = model.predict(validation_dataset)
preds = raw_preds.flatten()

In [None]:
# Notebook output: the flattened validation predictions.
preds

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Turn the predicted probabilities into hard 0/1 classes with a 0.5 cut-off.
is_positive = preds > 0.5
binary_preds = is_positive.astype(int)
binary_preds
# Confusion matrix of true labels (first argument) against predicted classes.
confusion_matrix(validation_labels, binary_preds)

In [None]:
# Text summary of the classification metrics for the thresholded
# validation predictions.
report = classification_report(validation_labels, binary_preds)
print(report)