In [16]:
import numpy as np
import pandas as pd
from numpy import newaxis
from sklearn.preprocessing import StandardScaler
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype
from sklearn.model_selection import train_test_split
import gc
from numpy.random import seed
from tensorflow import set_random_seed


In [17]:
# One seed shared by NumPy, TensorFlow and the train/validation split below.
RANDOM_STATE = 42
seed(RANDOM_STATE)
set_random_seed(RANDOM_STATE)

# Path prefix for Train_File.csv, Test_File.csv and Submission_File.csv.
root = "../../data/raw/Gamma_Log_Facies_Type_Prediction/"



In [18]:
%%time

# Number of neighbouring samples taken on each side of a row; every window
# therefore holds 2 * N_LAGS + 1 GR values.
N_LAGS = 10

def create_lags(df, n_lags=None):
    """Return a copy of ``df`` with shifted copies of its ``GR`` column added.

    For every ``i`` in ``1..n_lags`` two columns are appended, in this order:
    ``"-i"`` holds ``GR`` shifted down by ``i`` rows (the value ``i`` rows
    earlier) and ``"i"`` holds ``GR`` shifted up by ``i`` rows (the value ``i``
    rows later). Positions without a neighbour are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``GR`` column. It is not modified.
    n_lags : int, optional
        Number of shifts in each direction. Defaults to the module-level
        ``N_LAGS``, looked up at call time.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns followed by the shifted columns.
    """
    if n_lags is None:
        n_lags = N_LAGS

    gr = df["GR"]
    shifted = {}
    for i in range(1, n_lags + 1):
        shifted[f"-{i}"] = gr.shift(i)
        shifted[f"{i}"] = gr.shift(-i)

    # assign() builds a new frame, so the group handed over by groupby.apply is
    # never mutated and re-running the cell cannot stack columns on the input.
    return df.assign(**shifted)

def to_categorical(sequences, categories):
    """One-hot encode integer class labels.

    Parameters
    ----------
    sequences : array-like of int
        Class index of every sample; values are converted to integer indices.
    categories : int
        Number of classes, i.e. the width of the returned matrix.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(len(sequences), categories)`` with a single
        1.0 per row. Empty input yields shape ``(0, categories)``.

    Raises
    ------
    IndexError
        If a label does not fit into ``categories`` columns.
    """
    labels = np.asarray(sequences, dtype=np.intp).reshape(-1)
    one_hot = np.zeros((labels.size, categories), dtype=np.float64)
    # A single fancy-indexed assignment replaces the per-row Python loop.
    one_hot[np.arange(labels.size), labels] = 1.0
    return one_hot

# --- Training data: scale GR, build per-well windows, split, one-hot encode ---
full_train_df = pd.read_csv(root + "Train_File.csv")

# Standardise GR; the fitted scaler is reused when the test file is prepared.
scaler = StandardScaler()
full_train_df[["GR"]] = scaler.fit_transform(full_train_df[["GR"]])

# Shifts are computed inside each well so a window never crosses a well
# boundary. group_keys=False keeps the original row index; with the pandas >= 2.0
# default the well_id would be prepended to the index and rows returned per group.
train_data = full_train_df.groupby("well_id", group_keys=False).apply(create_lags)

# Labels are read from the same frame as the features so both always share one
# row order, whatever order groupby.apply returned.
train_target = train_data["label"]

# Columns "-N_LAGS" .. "0" .. "N_LAGS"; window edges without a neighbour become 0.
window_columns = [str(i) for i in range(-N_LAGS, N_LAGS+1)]
train_data = train_data.rename(columns={'GR': '0'}).fillna(0)[window_columns]

data_array = np.array(train_data, dtype='float32')
target_array = np.array(train_target, dtype='float32')
assert len(data_array) == len(target_array), "features and labels differ in length"

# NOTE(review): rows are split at random, so neighbouring samples of one well,
# whose windows share most of their values, can land on both sides of the split
# and validation scores may look optimistic. Consider splitting by well_id.
train_data_array, valid_data_array, train_target_array, valid_target_array = \
    train_test_split(data_array, target_array, test_size=0.33, random_state=RANDOM_STATE)

# One-hot targets with 5 columns, matching the 5-unit softmax output layer.
train_target_array = to_categorical(train_target_array.astype(int), 5)
valid_target_array = to_categorical(valid_target_array.astype(int), 5)

# Trailing feature axis: (samples, window) -> (samples, window, 1).
train_data_array = train_data_array[:, :, newaxis]
valid_data_array = valid_data_array[:, :, newaxis]

CPU times: user 1min 9s, sys: 5.67 s, total: 1min 14s
Wall time: 1min 15s


In [19]:
# Shapes in order: training features, validation features, training targets,
# validation targets.
tuple(
    split.shape
    for split in (train_data_array, valid_data_array, train_target_array, valid_target_array)
)

((2948000, 21, 1), (1452000, 21, 1), (2948000, 5), (1452000, 5))

In [20]:
%%time

# --- Test data: same scaling and window construction as the training data ---
full_test_df = pd.read_csv(root + "Test_File.csv")
submit_df = pd.read_csv(root + "Submission_File.csv")

# Reuse the scaler fitted on the training GR values; it is not refitted here.
full_test_df[["GR"]] = scaler.transform(full_test_df[["GR"]])

# group_keys=False keeps the rows indexed as in Test_File.csv; with the
# pandas >= 2.0 default the result would come back grouped by well_id.
test_data = full_test_df.groupby("well_id", group_keys=False).apply(create_lags)

# Columns "-N_LAGS" .. "0" .. "N_LAGS"; window edges without a neighbour become 0.
window_columns = [str(i) for i in range(-N_LAGS, N_LAGS+1)]
test_data = test_data.rename(columns={'GR': '0'}).fillna(0)[window_columns]

# Anything predicted from this array is positional, so fail loudly if the rows
# no longer follow the order of the file that was read.
assert test_data.index.equals(full_test_df.index), "test rows were reordered"

# Trailing feature axis: (samples, window) -> (samples, window, 1).
test_data_array = np.array(test_data, dtype='float32')[:, :, newaxis]

CPU times: user 29.1 s, sys: 1.83 s, total: 30.9 s
Wall time: 31.3 s


In [21]:
# Shape of the prepared test array.
np.shape(test_data_array)

(2200000, 21, 1)

In [29]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau

from tensorflow.keras.optimizers import Adam

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import BatchNormalization


In [30]:
# Window length fed to the network: N_LAGS values on each side plus the centre.
n_input = N_LAGS*2 + 1
n_lstm = 128
# NOTE(review): n_dense_1 / n_dense_2 are not referenced by the layers below
# (the dense layer uses a literal 100); kept in case later cells read them.
n_dense_1 = n_lstm * 4
n_dense_2 = n_lstm * 2
n_classifier = 5

# Bidirectional LSTM over the window, two 1D convolutions, pooling, then a
# dense classifier head with one softmax unit per class.
model = Sequential([
    # input_shape goes on the wrapper, which is the first layer of the model;
    # on the inner LSTM it leaves the Sequential model unbuilt until fit().
    Bidirectional(
        LSTM(n_lstm, return_sequences=True, dropout=0.1, recurrent_dropout=0.1),
        input_shape=(n_input, 1),
    ),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    Dropout(0.5),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(100),
    # Batch normalisation sits between the dense layer and its ReLU.
    BatchNormalization(scale=False, center=True),
    Activation('relu'),
    Dense(n_classifier, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])


In [31]:
%%time

epochs = 5
batch_size = 256

# A patience that is not smaller than the epoch budget can never trigger, so
# both values are capped by `epochs`. The caps (10 and 5) are the previous
# fixed settings and take effect again once `epochs` is raised far enough.
stop_patience = min(10, max(1, epochs // 2))
lr_patience = min(5, max(1, epochs // 4))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=stop_patience)
# NOTE(review): 'val_acc' is the log key TF 1.x emits for metrics=['accuracy'];
# TF 2.x logs 'val_accuracy' instead - confirm against the installed version.
mc = ModelCheckpoint("k10_baseline_best_model.h5", monitor='val_acc', mode='max', verbose=1, save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=lr_patience, min_lr=0.0001, verbose=1)

history = model.fit(train_data_array, train_target_array,
                    batch_size=batch_size,
                    epochs=epochs,
                    callbacks=[es, mc, reduce_lr],
                    validation_data=(valid_data_array, valid_target_array))

Train on 2948000 samples, validate on 1452000 samples
Epoch 1/5

KeyboardInterrupt: 