In [2]:
import tensorflow as tf
import pandas as pd # Pandas DataFrame
import numpy as np # Numpy array manipulation
from sklearn.decomposition import PCA # Princible Component Analysis
from sklearn.impute import SimpleImputer # Missing value imputation

In [3]:
# Load data from CSV
training_data = pd.read_csv('train_signal.csv')

In [4]:
# Peek at the data
training_data.head()

Unnamed: 0,ID,Type,X0,X1,X2,X3,X4,X5,X6,X7,...,X5990,X5991,X5992,X5993,X5994,X5995,X5996,X5997,X5998,X5999
0,B00000,N,-0.107,-0.1,-0.086,-0.078,-0.071,-0.057,-0.049,-0.035,...,-1.108,-1.072,-1.028,-0.978,-0.912,-0.862,-0.804,-0.724,-0.63,-0.499
1,B00001,N,2.762,3.313,3.863,4.292,4.594,4.623,4.408,3.817,...,-0.107,0.003,0.148,0.241,0.31,0.345,0.368,0.397,0.426,0.438
2,B00002,N,-0.246,-0.2,-0.159,-0.125,-0.101,-0.09,-0.084,-0.078,...,-0.113,-0.038,0.032,0.107,0.165,0.194,0.194,0.159,0.119,0.072
3,B00003,~,0.519,0.778,1.073,1.392,1.672,1.895,2.012,2.023,...,0.037,-0.052,-0.084,-0.099,-0.101,-0.09,-0.067,0.003,0.096,0.179
4,B00004,~,0.011,-0.103,-0.265,-0.371,-0.409,-0.422,-0.418,-0.411,...,0.776,0.829,0.763,0.481,0.126,-0.144,-0.224,-0.25,-0.222,-0.207


In [5]:
# Split into labels (y) and input (X)

# Data from the 3rd feature column onwards are input
X = training_data.values[:,2:]
# Classes/Labels are the type of AF
y = training_data["Type"].values

In [6]:
#print(f"X shape: {X.shape}, y shape: {y.shape}")
print("X shape:  " + str(X.shape))
print("y shape:  " + str(y.shape))

X shape:  (13062, 6000)
y shape:  (13062,)


In [7]:
from keras.utils import to_categorical # Multi-class labels converted to one-hot encoded categories
from sklearn.preprocessing import LabelEncoder # Enumeration for labels

Using TensorFlow backend.


In [8]:
# Convert class labels to integers to then one-hot encode

# Encode class values as integers
encoder = LabelEncoder()
encoder.fit(y)
encoded_Y = encoder.transform(y)

# One-hot encode
print(y.shape)
y = to_categorical(encoded_Y)
print(y.shape)

(13062,)
(13062, 4)


In [11]:
# Deal with missing data

# Replace missing values with Nan
X[X == ''] = np.nan

# np.any(np.isnan(X))

# Replace Nan with median
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

  after removing the cwd from sys.path.


In [12]:
# Reshape array to 3D because LSTM requires timesteps (13062, 20, 300)

print(X.shape)
#X3d = X.reshape((13062, 20, 300))
# #X3d = X.reshape(X.shape[0],X.shape[1],1)
# print(X3d.shape)

(13062, 6000)


In [13]:
print(X)

[[-0.107 -0.1   -0.086 ... -0.724 -0.63  -0.499]
 [ 2.762  3.313  3.863 ...  0.397  0.426  0.438]
 [-0.246 -0.2   -0.159 ...  0.159  0.119  0.072]
 ...
 [-0.364 -0.395 -0.434 ... -0.834 -0.632 -0.472]
 [-0.285 -0.475 -0.596 ...  0.77   0.764  0.754]
 [ 0.975  0.935  0.87  ... -0.109 -0.114 -0.119]]


In [11]:
# BROKEN

# X3d_pca = np.empty((13062, 20, 30))

# print(X3d.shape)

# for x in range(0,len(X3d)):
#         pca = PCA(n_components=30) # reduce to 60 dimensions
#         pca.fit(X3d[x])
#         X3d_pca[x] = pca.transform(X3d[x])
        
# X3d_pca.shape()


# # Reduce dimensions of data to make neural networks easier to train, prevent overfitting & remove noise

# pca = PCA(n_components=60) # reduce to 60 dimensions
# pca.fit(X3d)

# print(X3d.shape)
# X3d_pca = pca.transform(X3d)
# print(X3d_pca.shape) 

In [14]:
from keras.preprocessing import sequence # Keras input preprocessing
from keras.models import Sequential # Keras model types
from keras.layers import Dense, Dropout, Activation # Keras model layers
from keras.layers import Embedding, LSTM
from keras.layers import Input, Dense, concatenate, Activation
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from keras.layers import BatchNormalization, GlobalAveragePooling1D, Permute, Dropout
from keras.losses import categorical_crossentropy
from keras.optimizers import SGD, Adam
from keras.models import Model

In [15]:
def lstm_fcn_block():
    ip = Input(shape = (1, 6000))
    
    # Recurrent side
    x = LSTM(8)(ip)
    x = Dropout(0.2)(x)

    # Fully Convolutional Side
    y = Permute((2, 1))(ip)
    y = Conv1D(128, 8, padding = "same", kernel_initializer = "he_uniform")(y)
    y = BatchNormalization()(y)
    y = Activation("relu")(y)

    y = Conv1D(256, 5, padding = "same", kernel_initializer = "he_uniform")(y)
    y = BatchNormalization()(y)
    y = Activation("relu")(y)

    y = Conv1D(128, 3, padding = "same", kernel_initializer = "he_uniform")(y)
    y = BatchNormalization()(y)
    y = Activation("relu")(y)

    y = GlobalAveragePooling1D()(y)

    # Merge both sides back together
    x = concatenate([x, y])
    
    # 4 output classes over softmax
    out = Dense(4, activation = "softmax")(x)

    model = Model(ip, out)

    model.summary()

    return model

In [16]:
X_3d = X.reshape((13062, 1, 6000))
model = lstm_fcn_block()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1, 6000)      0                                            
__________________________________________________________________________________________________
permute_1 (Permute)             (None, 6000, 1)      0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 6000, 128)    1152        permute_1[0][0]                  
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 6000, 128)    512         conv1d_1[0][0]                   
____________________________________________________________________________________________

In [18]:
# Tell model what loss function & optimiser to use
model.compile(loss=categorical_crossentropy,
              optimizer=Adam(lr=0.01),
              metrics=['accuracy',tf.keras.metrics.AUC(),tf.keras.metrics.Precision(),tf.keras.metrics.Recall()])

In [None]:
#from IPython.core.debugger import set_trace

# print(type(X3d))
# print(X3d.ndim)
print(X_3d.shape)
# print(X3d[0][0][0])
 
model.fit(X_3d, y, epochs=3, batch_size=32, validation_split=1/6)

(13062, 1, 6000)
Train on 10885 samples, validate on 2177 samples
Epoch 1/3
   96/10885 [..............................] - ETA: 1:28:28 - loss: 1.1386 - accuracy: 0.5312 - auc: 0.7424 - precision: 0.3362 - recall: 0.1753      

In [21]:
# Load data from CSV
test_data = pd.read_csv('test_signal.csv')

In [22]:
# Peek at the data
test_data.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X7,X8,...,X5990,X5991,X5992,X5993,X5994,X5995,X5996,X5997,X5998,X5999
0,C00000,-0.169,-0.174,-0.184,-0.189,-0.2,-0.21,-0.221,-0.226,-0.226,...,0.375,0.386,0.386,0.36,0.323,0.282,0.24,0.184,0.132,0.09
1,C00001,-0.184,-0.174,-0.169,-0.164,-0.158,-0.158,-0.158,-0.153,-0.153,...,-0.086,-0.117,-0.143,-0.148,-0.153,-0.153,-0.153,-0.153,-0.153,-0.158
2,C00002,1.05,1.622,2.143,2.552,2.653,2.675,2.614,2.249,1.734,...,-1.342,-1.354,-1.365,-1.37,-1.376,-1.37,-1.365,-1.354,-1.342,-1.326
3,C00003,2.259,2.667,3.046,3.376,3.583,3.579,3.334,2.872,2.514,...,-0.119,-0.119,-0.119,-0.116,-0.114,-0.109,-0.104,-0.097,-0.09,-0.083
4,C00004,-0.203,-0.203,-0.201,-0.201,-0.201,-0.199,-0.194,-0.189,-0.185,...,-0.008,0.02,0.039,0.044,0.046,0.046,0.049,0.049,0.051,0.051


In [23]:
# Split into labels (y) and input (X)

# Data from the 3rd feature column onwards are input
test_X = test_data.values[:,1:]
# Classes/Labels are the type of AF
test_ids = test_data["ID"].values

In [24]:

print("X shape:  " + str(test_X.shape))
print("Y shape:  " + str(test_ids.shape))

X shape:  (4000, 6000)
Y shape:  (4000,)


In [25]:
# Deal with missing data

# Replace missing values with Nan
test_X[test_X == ''] = np.nan

# np.any(np.isnan(X))

# Replace Nan with median
imputer = Imputer(missing_values=np.nan, strategy='mean')
test_X = imputer.fit_transform(test_X)

In [28]:
test_X_3d = test_X.reshape((4000, 20, 300))

In [29]:
predictions_ontest = model.predict(test_X_3d)
print(predictions_ontest)

[[6.9736555e-02 6.6803372e-01 2.6197132e-01 2.5837639e-04]
 [3.7679840e-02 4.9299473e-01 4.6864939e-01 6.7600171e-04]
 [8.6994402e-02 4.7240233e-01 3.9491361e-01 4.5689661e-02]
 ...
 [4.4227779e-02 6.8166524e-01 2.7116537e-01 2.9416245e-03]
 [9.0921149e-02 5.5948758e-01 3.4698999e-01 2.6013104e-03]
 [5.5239391e-02 5.5380201e-01 3.8685939e-01 4.0992275e-03]]


In [30]:
# Decodng output
# Softmax -> Integers
integer_predictions = tf.argmax(input=predictions_ontest, axis=1)
# Integers -> Class lables
class_predictions = encoder.inverse_transform(integer_predictions)

  if diff:


In [31]:
# Merge predictions with IDs
preds = list(zip(test_ids, class_predictions))
print(preds)

[('C00000', 'N'), ('C00001', 'N'), ('C00002', 'N'), ('C00003', 'N'), ('C00004', 'N'), ('C00005', 'O'), ('C00006', 'N'), ('C00007', 'N'), ('C00008', 'N'), ('C00009', 'N'), ('C00010', 'N'), ('C00011', 'N'), ('C00012', 'O'), ('C00013', 'N'), ('C00014', 'N'), ('C00015', 'N'), ('C00016', 'O'), ('C00017', 'N'), ('C00018', 'O'), ('C00019', 'N'), ('C00020', 'O'), ('C00021', 'N'), ('C00022', 'O'), ('C00023', 'N'), ('C00024', 'N'), ('C00025', 'N'), ('C00026', 'N'), ('C00027', 'N'), ('C00028', 'N'), ('C00029', 'N'), ('C00030', 'N'), ('C00031', 'N'), ('C00032', 'O'), ('C00033', 'N'), ('C00034', 'N'), ('C00035', 'N'), ('C00036', 'N'), ('C00037', 'N'), ('C00038', 'N'), ('C00039', 'N'), ('C00040', 'N'), ('C00041', 'N'), ('C00042', 'N'), ('C00043', 'N'), ('C00044', 'O'), ('C00045', 'N'), ('C00046', 'O'), ('C00047', 'N'), ('C00048', 'N'), ('C00049', 'N'), ('C00050', 'N'), ('C00051', 'N'), ('C00052', 'N'), ('C00053', 'N'), ('C00054', 'N'), ('C00055', 'N'), ('C00056', 'N'), ('C00057', 'N'), ('C00058', 'N

In [32]:
# As dataframe
predictions = pd.DataFrame(preds, columns=['ID', 'Predicted'])
predictions.head()

Unnamed: 0,ID,Predicted
0,C00000,N
1,C00001,N
2,C00002,N
3,C00003,N
4,C00004,N


In [33]:
# Ensure Sorted by ID
predictions.sort_values(by=['ID'], inplace=True)
print(predictions)

          ID Predicted
0     C00000         N
1     C00001         N
2     C00002         N
3     C00003         N
4     C00004         N
...      ...       ...
3995  C03995         N
3996  C03996         O
3997  C03997         N
3998  C03998         N
3999  C03999         N

[4000 rows x 2 columns]


In [34]:
predictions.to_csv("submission_LSTM_FCN_input_shape_20_300.csv", index=False)