In [24]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score)

from layers import *

In [36]:
folder_data = "./data/"
folder_nano = os.path.join(folder_data, "nano")
folder_nano_numpy = os.path.join(folder_data, "nano_volumes")
filename_train_labels = os.path.join(folder_data, "train_labels.csv")
filename_train_metadata = os.path.join(folder_data, "train_metadata.csv")
filename_test_metadata = os.path.join(folder_data, "test_metadata.csv")
folder_test_numpy = "/media/igorkozlovskii/Elements/Projects/stall_catchers/test_numpy/"


def get_simple_model(input_shape=(32, 32, 32)):
    
    model = tf.keras.models.Sequential([
        tf.keras.layers.Reshape(input_shape + (1,)),
        ConvBlock_1(32),
        ConvBlock_1(64),
        ConvBlock_2(64),
        ConvBlock_2(128),
        Conv3D_bn(256, kernel_size=2, padding="valid"),
        tf.keras.layers.Reshape((256,)),
        Dense(1, activation=tf.keras.activations.sigmoid),
    ])
    return model

def get_binary(y, threshold=0.5):
    return y >= threshold
    
def get_scores(y_true, y_pred, threshold=0.5):
    y_pred_bin = get_binary(y_pred, threshold=threshold)
    scores = {
        "accuracy": accuracy_score(y_true, y_pred_bin),
        "precision": precision_score(y_true, y_pred_bin),
        "recall": recall_score(y_true, y_pred_bin),
        "f1": f1_score(y_true, y_pred_bin),
        "auc": roc_auc_score(y_true, y_pred),
        "mcc": matthews_corrcoef(y_true, y_pred_bin),
    }
    return scores

def print_scores(*args, **kwargs):
    scores = get_scores(*args, **kwargs)
    for name, value in scores.items():
        print("{:10s} {:.3f}".format(name, value))
    return scores

In [20]:

names = [f.name for f in os.scandir(folder_nano)]
data_train_labels = pd.read_csv(filename_train_labels)
data_train_labels = data_train_labels[
    (data_train_labels.filename.isin(names))]
names = data_train_labels.filename.values.tolist()
labels = data_train_labels.stalled.values

y = labels.copy()
X = []
for name in tqdm(names):
    filename = os.path.join(folder_nano_numpy, name.split(".")[0] + ".npy")
    X.append(np.load(filename))
X = np.array(X, np.float32)

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=0)

print(X_train.shape, y_train.shape, y_train.sum(), y_train.sum() / len(y_train))
print(X_test.shape, y_test.shape, y_test.sum(), y_test.sum() / len(y_test))

HBox(children=(IntProgress(value=0, max=1413), HTML(value='')))


(1130, 32, 32, 32) (1130,) 565 0.5
(283, 32, 32, 32) (283,) 141 0.49823321554770317


In [34]:
model = get_simple_model(X.shape[1:4])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_acc', factor=0.5,
                              patience=5, min_lr=1e-6,),
    tf.keras.callbacks.EarlyStopping(monitor='val_acc', patience=10)
]
model.fit(X_train, y_train, 
    batch_size=16, 
    epochs=40, 
    validation_data=(X_test, y_test),
    callbacks=callbacks,)

Train on 1130 samples, validate on 283 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40


<tensorflow.python.keras.callbacks.History at 0x7f4a86188cc0>

In [35]:
y_train_preds = model.predict(X_train)
y_test_preds = model.predict(X_test)

print(y_train_preds.shape)
print(y_test_preds.shape)

scores_train = print_scores(y_train, y_train_preds)
print()
scores_test = print_scores(y_test, y_test_preds)

(1130, 1)
(283, 1)
accuracy   0.997
precision  0.995
recall     1.000
f1         0.997
auc        1.000
mcc        0.995

accuracy   0.714
precision  0.672
recall     0.830
f1         0.743
auc        0.810
mcc        0.440


In [44]:

data_test_metadata = pd.read_csv(filename_test_metadata)
names_test = data_test_metadata.filename.values.tolist()
y_test_pred_subm = []
for name in tqdm(names_test):
    filename = os.path.join(folder_test_numpy, name.split(".")[0] + ".npy")
    vol = np.load(filename)
    vol = np.array([vol])
    y_test_pred_subm.append(model.predict(vol)[0, 0])
    
y_test_pred_subm = np.array(y_test_pred_subm)

HBox(children=(IntProgress(value=0, max=14160), HTML(value='')))




In [43]:
model.predict(vol)[0, 0]

0.0007028469

In [55]:
y_subm = np.array(y_test_pred_subm >= 0.999, np.int32)
print(np.mean(y_test_pred_subm))
print(np.mean(y_subm), np.sum(y_subm), np.sum(y_subm) / len(y_subm))
data_subm = pd.DataFrame({"filename": names_test, "stalled": y_subm})
data_subm.to_csv(os.path.join("submission_1.csv"), index=False)

0.35464847
0.06574858757062146 931 0.06574858757062146


In [56]:
data_subm.head()

Unnamed: 0,filename,stalled
0,100032.mp4,0
1,100037.mp4,0
2,100139.mp4,1
3,100182.mp4,0
4,100214.mp4,0
