In [9]:
# Fix randomness and hide warnings
seed = 42

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['PYTHONHASHSEED'] = str(seed)
os.environ['MPLCONFIGDIR'] = os.getcwd()+'/configs/'

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

import numpy as np
np.random.seed(seed)

import logging

import random
random.seed(seed)

In [10]:
# Import tensorflow
import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)
print(tf.__version__)

2.14.0


In [11]:
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from datetime import datetime
import matplotlib.pyplot as plt
plt.rc('font', size=16)
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
import warnings
from statsmodels.tools.sm_exceptions import InterpolationWarning
from statsmodels.graphics import tsaplots
from statsmodels.tsa.stattools import kpss, adfuller
from statsmodels.tsa.seasonal import seasonal_decompose

# Load data

In [12]:
dataset_dir = 'dataset'
# dataset_dir = '/kaggle/input/an2dl-challenge2'

In [13]:
dataset = np.load(f'{dataset_dir}/training_data.npy')
df = pd.DataFrame(dataset)
df.shape

(48000, 2776)

In [14]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2766,2767,2768,2769,2770,2771,2772,2773,2774,2775
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.478704,0.412963,0.501852,0.756481,0.744444,0.640741,0.516667,0.602778,0.367593,0.331481
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.628,0.624,0.644,0.688,0.74,0.64,0.536,0.6,0.46,0.396
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.959741,0.949098,0.919297,0.916705,0.901249,0.932531,0.955206,0.960481,0.946506,0.97714
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.411765,0.30703,0.325681,0.361549,0.430416,0.48637,0.450502,0.470588,0.569584,0.571019
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.340909,0.522727,0.568182,0.772727,0.681818,0.704545,0.863636,0.727273,0.818182,0.840909


In [15]:
valid_periods = np.load(f'{dataset_dir}/valid_periods.npy')
valid_periods.shape

(48000, 2)

In [16]:
categories = np.load(f'{dataset_dir}/categories.npy')
categories_df = pd.DataFrame(categories, columns=['category'])
unique_categories = np.unique(categories)

categories.shape

(48000,)

# Inspect

In [18]:
def plot_ts(dataset, i, valid_periods=valid_periods):
    # Plot the time series
    plt.figure(figsize=(17, 6))
    plt.plot(dataset.iloc[i, valid_periods[i][0]:valid_periods[i][1]], label='Data')
    plt.xlabel('Time')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True)
    plt.show()
    
def plot_ts_custom(ts):
    plt.figure(figsize=(17, 6))
    plt.plot(ts, label='Data')
    plt.xlabel('Time')
    plt.ylabel('Value')
    plt.show()

In [20]:
def inspect_univariate(X, y, ax, telescope=9, pred=None, idx=None):
    if(idx==None):
        idx=np.random.randint(0,len(X))

    ax.plot(np.arange(0, len(X[idx,:])), X[idx,:])
    ax.scatter(np.arange(len(X[idx,:]), len(X[idx,:])+telescope), y[idx,:], color='orange')
    if (pred is not None):
        ax.plot(np.arange(len(X[idx,:]), len(X[idx,:])+telescope), pred[idx,:], color='green')

# Build model

In [21]:
df = df.astype('float32')

In [22]:
def build_sequences(df, window=200, stride=10, telescope=9, valid_periods=valid_periods, mask=False):
    # Sanity check to avoid runtime errors
    assert window % stride == 0

    dataset = []
    labels = []
    for index, row in tqdm(df.iterrows()):
        if (not mask):
            temp = row.values[valid_periods[index][0]:valid_periods[index][1]].astype('float32')
        else:
            temp = row.values.astype('float32')
            
        padding_check = len(temp)%window
        if(padding_check != 0):
            # Compute padding length
            padding_len = window - padding_check
            padding = np.zeros((padding_len), dtype='float32')
            temp = np.concatenate((temp, padding))
            assert len(temp) % window == 0
            
        
        for idx in np.arange(0,len(temp)-window-telescope,stride):
            d = temp[idx:idx+window]
            if (np.all(d == 0.0)):
                continue
            dataset.append(d)
            labels.append(temp[idx+window:idx+window+telescope])
        
    dataset = np.expand_dims(np.array(dataset), axis=2)
    labels = np.expand_dims(np.array(labels), axis=2)
    
    return dataset, labels

In [27]:
def get_df_category(to_select, df=df, categories=categories, drop_info=True):
    df_tmp = df.copy()
    df_tmp['non_zero_counts'] = df_tmp.apply(lambda x: x.astype(bool).sum(), axis=1)
    df_tmp['category'] = categories
    df_tmp = df_tmp[(df_tmp['category'].isin(to_select))]
    if (drop_info):
        df_tmp.drop(columns=['category', 'non_zero_counts'], inplace=True)

    return df_tmp

# Baseline

In [29]:
# Just predicts the mean of the last half of the input
class BaselineModel():
    def __init__(self):
        pass
    def fit():
        pass
    def predict(self, X_test):
        return np.array([[np.mean(X_i[-int(len(X_i)/2):])] * 9 for X_i in X_test.squeeze()])

In [25]:
def residual_block(x, filters, kernel_size=3, strides=1):
    shortcut = x

    x = tfkl.Conv1D(filters, kernel_size, strides=strides, padding='same')(x)
    x = tfkl.BatchNormalization()(x)
    x = tfkl.Activation('relu')(x)

    x = tfkl.Conv1D(filters, kernel_size, strides=1, padding='same')(x)
    x = tfkl.BatchNormalization()(x)

    x = tfkl.Add()([x, shortcut])
    x = tfkl.Activation('relu')(x)

    return x

def build_resnet_model(input_shape, output_shape):
    input_layer = tfkl.Input(shape=input_shape)
    mask = tfkl.Masking(mask_value=0.0)(input_layer)

    x = tfkl.Conv1D(64, 7, strides=2, padding='same')(mask)
    x = tfkl.BatchNormalization()(x)
    x = tfkl.Activation('relu')(x)

    x = residual_block(x, 64)

    x = tfkl.Attention()([x, x])

    x = tfkl.LSTM(64, return_sequences=True)(x)

    x = tfkl.GlobalAveragePooling1D()(x)

    x = tfkl.Dense(output_shape[0])(x)

    model = tf.keras.Model(inputs=input_layer, outputs=x, name='resnet_model')
    model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer=tf.keras.optimizers.Adam())

    return model

In [28]:
df_cat = get_df_category(['F'])

# Run Model

In [30]:
info = {}

model_name='model_final'

df_all = get_df_category(unique_categories, drop_info=False)

telescope=9
stride=20
window=200


X_train_val_raw, X_test_raw = train_test_split(df_all, test_size=0.2, random_state=42, stratify=df_all['category'])
X_train_raw, X_val_raw = train_test_split(X_train_val_raw, test_size=0.2, random_state=42, stratify=X_train_val_raw['category'])

X_train_raw.drop(columns=['category', 'non_zero_counts'], inplace=True)
X_val_raw.drop(columns=['category', 'non_zero_counts'], inplace=True)
X_test_raw.drop(columns=['category', 'non_zero_counts'], inplace=True)

X_train, y_train = build_sequences(X_train_raw, window=window, stride=stride, telescope=telescope, mask=True)
X_val, y_val = build_sequences(X_val_raw, window=window, stride=stride, telescope=telescope, mask=True)
X_test, y_test = build_sequences(X_test_raw, window=window, stride=stride, telescope=telescope, mask=True)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)


input_shape = X_train.shape[1:]
output_shape = y_train.shape[1:]
batch_size = 64
epochs = 200

model = build_resnet_model(input_shape, output_shape)
model.summary()


history = model.fit(
    x = X_train,
    y = y_train,
    batch_size = batch_size,
    validation_data=(X_val, y_val),
    epochs = epochs,
    callbacks = [
        tfk.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=12, restore_best_weights=True, min_delta=0.0001),
        tfk.callbacks.ReduceLROnPlateau(monitor='val_loss', mode='min', patience=10, factor=0.1, min_lr=1e-5)
    ],
    verbose=1,
).history

model.save(f'/kaggle/working/{model_name}.h5')

preds = model.predict(X_test)

info[model_name] = {
    'mse': tfk.metrics.mean_squared_error(y_test.flatten(), preds.flatten()).numpy(),
    'mae': tfk.metrics.mean_absolute_error(y_test.flatten(), preds.flatten()).numpy()
}

info

30720it [00:44, 691.52it/s]
7680it [00:11, 690.41it/s]
9600it [00:13, 691.94it/s]


(323597, 200, 1) (323597, 9, 1) (79919, 200, 1) (79919, 9, 1) (101511, 200, 1) (101511, 9, 1)
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200


{'model_final': {'mse': 0.007561363, 'mae': 0.052284684}}