Los Alamos National Laboratory - Earthquake analysis
------------------------------------------------------------------------------

In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import os
import matplotlib.pyplot as plt

In [3]:
def extract_features(df):
    """Reduce one acoustic segment to a flat list of numeric features.

    Args:
        df: DataFrame with an 'acoustic_data' column. Only that column is read,
            since the test segments contain nothing else.

    Returns:
        A flat Python list laid out as:
          [mean, std, max, min, number_of_peaks]
          + auto-correlation of every 1000th sample (one value per sub-sample)
          + 75 left histogram bin edges
          + 75 histogram bin counts
          + 100 FFT magnitudes of the sub-sampled signal.
        For a 150000-row segment this yields 5 + 150 + 75 + 75 + 100 = 405 values.
    """
    acoustic = df['acoustic_data']

    # basic statistics of the signal
    max_acoustic = acoustic.max()
    mean_acoustic = acoustic.mean()
    data = [[mean_acoustic], [acoustic.std()], [max_acoustic], [acoustic.min()]]

    # number of `peaks` -> samples above mean + (max - mean) / 2, i.e. above (max + mean) / 2
    peak_threshold = (max_acoustic + mean_acoustic) / 2.
    data.append([int((acoustic > peak_threshold).sum())])

    # every 1000th sample (0.1 % of the data) keeps the correlation and FFT cheap
    subsampled = acoustic.values[::1000]

    # auto-correlation, to see how self-similar the signal is
    data.append(np.correlate(subsampled, subsampled, mode='same'))

    # np.histogram returns (counts, bin_edges) with len(bin_edges) == bins + 1.
    # The counts used to be appended twice; emit the left edge of every bin and then
    # the counts instead, so the block still contributes 75 + 75 values.
    counts, bin_edges = np.histogram(acoustic, bins=75)
    data.append(bin_edges[:-1])  # bins
    data.append(counts)          # values

    # magnitude spectrum of the sub-sampled signal, cropped / zero-padded to 100 points
    data.append(np.abs(np.fft.fft(subsampled, n=100)))

    # we must flatten out the features
    return [item for sublist in data for item in sublist]

In [None]:
# Stream the training file in windows of 150000 rows -- the segment files contain 150000 lines each.
TextFileReader = pd.read_csv('../input/train.csv', chunksize=150000)

# segment number -> {time_to_failure at the end of the window: feature list}
reduced_data = {}
counter = 0

for df in TextFileReader:
    # the target of a window is the time to failure at its last sample
    last_time_to_failure = df['time_to_failure'].values[-1]
    reduced_data[counter] = {last_time_to_failure: extract_features(df)}
    counter += 1
    if counter % 250 == 0:
        print('%d segments - done.' % counter)

250 segments - done.


In [4]:
# Second pass over the training file, skipping the first 25000 lines so the windows
# are offset from those of the first pass. An integer skiprows also skips the header
# line, so the first remaining line is consumed as the header and the column names
# have to be restored by hand inside the loop.
TextFileReader = pd.read_csv('../input/train.csv', skiprows=25000, chunksize=150000)

for df in TextFileReader:
    df.columns = ['acoustic_data', 'time_to_failure']
    # the target of a window is the time to failure at its last sample
    last_time_to_failure = df['time_to_failure'].values[-1]
    reduced_data[counter] = {last_time_to_failure: extract_features(df)}
    counter += 1
    if counter % 250 == 0:
        print('%d segments - done.' % counter)

4250 segments - done.
4500 segments - done.
4750 segments - done.
5000 segments - done.
5250 segments - done.
5500 segments - done.
5750 segments - done.
6000 segments - done.
6250 segments - done.
6500 segments - done.
6750 segments - done.
7000 segments - done.
7250 segments - done.
7500 segments - done.
7750 segments - done.
8000 segments - done.
8250 segments - done.


In [5]:
# Third pass over the training file, this time skipping the first 75000 lines.
# As with any integer skiprows, the header line is skipped too, so the column
# names are reassigned for every chunk.
TextFileReader = pd.read_csv('../input/train.csv', skiprows=75000, chunksize=150000)

for df in TextFileReader:
    df.columns = ['acoustic_data', 'time_to_failure']
    # the target of a window is the time to failure at its last sample
    last_time_to_failure = df['time_to_failure'].values[-1]
    reduced_data[counter] = {last_time_to_failure: extract_features(df)}
    counter += 1
    if counter % 250 == 0:
        print('%d segments - done.' % counter)

8500 segments - done.
8750 segments - done.
9000 segments - done.
9250 segments - done.
9500 segments - done.
9750 segments - done.
10000 segments - done.
10250 segments - done.
10500 segments - done.
10750 segments - done.
11000 segments - done.
11250 segments - done.
11500 segments - done.
11750 segments - done.
12000 segments - done.
12250 segments - done.
12500 segments - done.


In [6]:
# Total number of training segments collected so far (the counter keeps running
# across the chunked passes above, so this covers all of them).
len(reduced_data)

12584

In [7]:
# Turn every segment into a one-row DataFrame: the feature columns followed by a
# 'ttf' column holding the target, indexed by the segment number.
dataframes = []

for index in range(len(reduced_data)):
    # orient='index' makes the single dict key (the time to failure) the row label
    df = pd.DataFrame.from_dict(reduced_data[index], orient='index')
    df = df.assign(_id=index, ttf=df.index).set_index('_id')
    dataframes.append(df)

# the raw feature dictionary is no longer needed
del reduced_data

In [8]:
# Start from a clean file: the loop below appends, so a leftover 'df_all.csv' from
# an interrupted earlier run would otherwise be duplicated into the training set.
if os.path.exists('df_all.csv'):
    os.remove('df_all.csv')

# Append every one-row frame as a header-less CSV line (index column omitted).
for df in dataframes:
    df.to_csv('df_all.csv', mode='a', header=False, index=False)

del dataframes

In [9]:
# Read the temporary CSV back as one frame; header=None because the rows were
# written without a header line, so columns are numbered from 0.
train = pd.read_csv('df_all.csv', header=None)
# The file was only a staging area -- delete it once it has been loaded.
os.remove('df_all.csv')

train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405
0,4.884113,5.101106,104,-98,52,2418,2080,2414,2421,2333,2290,2527,2353,2571,2367,2566,2676,2803,2678,2803,2720,2710,2522,2652,2932,2835,2625,2871,2754,3055,2794,2606,2850,3048,2779,3103,2971,2968,2866,2972,...,25.824243,1.172078,54.498654,100.082528,3.098883,58.474525,49.444169,35.328539,9.86188,58.503125,51.387693,27.034476,56.913262,49.402978,67.424031,29.399786,58.821687,34.510315,37.654443,28.723467,40.551197,98.451245,23.363925,49.034801,44.554268,15.026385,35.075801,41.615854,66.657765,41.64594,23.67911,75.068126,38.35139,70.345948,59.71165,20.903091,48.278915,26.806422,27.947093,1.430797
1,4.725767,6.588824,181,-154,57,1926,1230,1743,1446,1966,1532,1571,1184,1575,1638,1579,1582,1960,2153,1172,1860,1546,2033,569,1737,2552,2099,2127,1927,1679,2377,2742,2425,905,1848,2414,2647,3636,1394,2420,...,62.251798,55.121014,80.487485,68.753209,102.270801,39.377366,32.789595,90.819976,59.151005,7.231073,119.469975,16.212216,71.886133,53.076623,75.716577,59.687855,84.287662,75.334106,101.454495,93.267801,36.924865,97.260244,81.160331,84.739603,31.141287,27.21187,28.570385,10.260729,82.36452,57.573532,43.966165,78.012256,34.211017,66.30355,8.966739,81.087787,112.445002,89.884238,71.749796,1.391499
2,4.906393,6.967397,140,-106,70,1837,1300,1278,1854,2029,1634,1574,2130,1044,1674,1529,1717,2708,2250,2654,2666,2591,2091,2144,2767,1687,2179,1705,1106,2166,1536,2447,2919,1909,2253,1304,2760,2269,1306,3035,...,182.179885,79.222474,12.682267,44.301332,123.392466,69.942301,66.560432,144.667422,71.491739,11.522811,96.828884,151.591456,95.520804,36.654338,106.526992,92.351123,39.434631,41.929963,65.8813,46.656405,63.663867,101.420236,75.336714,11.95396,102.028169,61.776775,66.791248,39.818999,6.151481,24.129336,84.688512,143.820461,5.306941,23.562947,133.708454,45.749734,75.841538,71.78566,107.465529,1.353196
3,4.90224,6.922305,197,-199,39,1764,2670,2119,2163,2021,2171,1961,2317,2483,2504,2465,2538,1884,2380,2503,1976,2171,1735,2586,2504,2880,1940,2396,2470,2985,2684,2449,2200,2798,3007,2566,2542,2826,2594,2189,...,88.45065,47.240278,15.548933,64.752265,30.109712,56.102153,66.26884,32.164981,66.618323,50.066631,21.559757,95.048545,45.781747,37.985397,16.643317,27.595172,65.533891,117.572888,77.228152,43.888126,59.538345,101.88622,44.56573,51.13381,44.877118,81.567546,53.255427,36.338806,111.018149,43.339733,3.70693,46.428708,35.605895,93.724257,69.629025,87.167299,81.828609,19.142712,28.609305,1.313798
4,4.90872,7.30111,145,-126,96,2370,2417,2330,2166,2313,1744,2077,2408,2249,2664,2555,2220,2875,2551,2842,3080,2847,2606,2647,2709,2293,2887,2978,2582,3252,3198,2813,2933,3196,2792,3385,3208,2741,3726,2917,...,42.302809,107.792358,13.214902,49.627002,67.170117,55.145281,52.309435,35.065819,80.709461,51.716753,19.518061,86.597707,63.06266,54.275965,40.496913,6.072729,21.413338,52.323718,26.626024,29.311435,30.191035,60.686232,50.215377,20.037605,93.048862,94.177964,52.789749,93.44974,19.616156,98.058031,32.486939,48.506471,88.800194,33.35948,56.825278,96.241728,65.832525,35.405836,66.648238,1.2744


In [10]:
from sklearn.preprocessing import normalize

In [11]:
# Discard every row that contains a missing value.
# NOTE(review): presumably these are the shorter final chunks of each pass, which
# yield fewer features than a full 150000-row window -- confirm.
train = train.dropna()
train.shape

(12581, 406)

In [12]:
# Columns 0-404 hold the extracted features, column 405 the time to failure.
# sklearn's normalize rescales each row (sample) on its own -- to unit L2 norm
# with the default arguments used here.
X = normalize(train.values[:, :405])
y = train.values[:, 405]

In [13]:
# X and y have been extracted; drop the name bound to the full frame.
del train

In [14]:
import tensorflow as tf
# Switch TensorFlow to eager execution; this has to happen before any other
# TensorFlow operation is created.
# NOTE(review): tf.enable_eager_execution is a TensorFlow 1.x API -- confirm the
# installed version before re-running.
tf.enable_eager_execution()

In [15]:
# Insert a length-1 middle axis: (n_samples, n_features) -> (n_samples, 1, n_features),
# matching the (1, 405) input shape the model is built with.
X_ = np.reshape(X, (X.shape[0], 1, X.shape[1]))

# One dataset element per (features, target) pair.
dataset = tf.data.Dataset.from_tensor_slices((X_, y))
# Batches of a single element; used in the next cell to inspect the shapes.
sequences = dataset.batch(1, drop_remainder=True)

In [16]:
# Sanity check: print the feature shape and the target of the first batch only.
for seq, target in sequences.take(1):
    print(seq.shape, target)

Instructions for updating:
Colocations handled automatically by placer.
(1, 1, 405) tf.Tensor([1.43079719], shape=(1,), dtype=float64)


In [17]:
# Number of samples per training batch; the model is built with the same value.
BATCH_SIZE = 6

# Size of the shuffle buffer handed to Dataset.shuffle.
BUFFER_SIZE = 20000

# Shuffle, then group into batches; drop_remainder=True discards a final partial
# batch so every batch holds exactly BATCH_SIZE samples.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<DatasetV1Adapter shapes: ((6, 1, 405), (6,)), types: (tf.float64, tf.float64)>

In [18]:
import functools

# Pick the recurrent layer factory: the cuDNN-backed GRU when a GPU is available,
# otherwise the generic GRU with a sigmoid recurrent activation.
rnn = (
    tf.keras.layers.CuDNNGRU
    if tf.test.is_gpu_available()
    else functools.partial(tf.keras.layers.GRU, recurrent_activation='sigmoid')
)

In [19]:
def build_model(rnn_units, batch_size):
    """Assemble the (uncompiled) regression network.

    Args:
        rnn_units: number of units of the recurrent layer.
        batch_size: fixed batch size baked into the input layer.

    Returns:
        A `tf.keras.models.Sequential` that takes inputs of shape
        (batch_size, 1, 405) and ends in a single ReLU output unit.
    """
    layers = tf.keras.layers
    stack = [
        layers.InputLayer(input_shape=(1, 405), batch_size=batch_size),
        layers.Dense(1024, activation='relu'),
        rnn(rnn_units),  # GRU variant selected at module level
        layers.Dense(256, activation='relu'),
        layers.Dense(1024, activation='relu'),
        layers.Dense(1, activation='relu'),
    ]
    return tf.keras.models.Sequential(stack)

In [20]:
# Training model: 1024 recurrent units, input batch size fixed to BATCH_SIZE.
model = build_model(1024, BATCH_SIZE)

In [21]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (6, 1, 1024)              415744    
_________________________________________________________________
cu_dnngru (CuDNNGRU)         (6, 1024)                 6297600   
_________________________________________________________________
dense_1 (Dense)              (6, 256)                  262400    
_________________________________________________________________
dense_2 (Dense)              (6, 1024)                 263168    
_________________________________________________________________
dense_3 (Dense)              (6, 1)                    1025      
Total params: 7,239,937
Trainable params: 7,239,937
Non-trainable params: 0
_________________________________________________________________


In [22]:
def loss(labels, logits):
    """Mean squared error between the targets and the model outputs.

    Args:
        labels: true time-to-failure values.
        logits: model outputs (regression predictions, despite the name).

    Returns:
        The value computed by Keras' mean-squared-error loss.
    """
    return tf.keras.losses.mean_squared_error(labels, logits)

In [23]:
# Configure training: Adam with a learning rate of 0.001, minimising the MSE loss
# defined above.
model.compile(
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001),
    loss = loss)

In [24]:
# Number of passes over the training data.
EPOCHS = 35

# The dataset is repeated indefinitely below, so Keras has to be told how many
# batches make up one epoch: the number of full batches in the training set.
samples_per_epoch = X.shape[0]
steps_per_epoch = samples_per_epoch // BATCH_SIZE

history = model.fit(dataset.repeat(), epochs=EPOCHS, steps_per_epoch=steps_per_epoch)

Epoch 1/35
Instructions for updating:
Use tf.cast instead.
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35

In [25]:
# The training model has its batch size fixed in the input layer, so build the
# same architecture again with batch_size=1 for predicting one segment at a time...
model_ = build_model(rnn_units=1024, batch_size=1)

# ...and copy the trained weights over.
weights = model.get_weights()

model_.set_weights(weights)

In [26]:
# Names of all entries in the test directory (os.listdir returns them in arbitrary order).
test_files = os.listdir('../input/test/')

In [27]:
# segment id -> predicted time to failure
result = {}
for file in test_files:
    df = pd.read_csv('../input/test/' + file)

    # same feature extraction and row normalisation as for the training data
    data = np.array(extract_features(df))
    X_test = normalize(data.reshape(1, -1)).reshape(1, 1, 405)

    prediction = model_.predict(X_test)[0]
    # the segment id is the file name without its last four characters (the extension)
    result[file[:-4]] = prediction

In [28]:
# One row per segment id (the dict keys become the index), with the prediction
# in a single 'time_to_failure' column.
result_df = pd.DataFrame.from_dict(result, orient='index', columns=['time_to_failure'])
result_df.head(n=2)

Unnamed: 0,time_to_failure
seg_37669c,6.079026
seg_5975f4,6.531628


In [29]:
# Write the submission file; the index (segment ids) is written under the header 'seg_id'.
result_df.to_csv('./submission.csv', columns=['time_to_failure'], index_label='seg_id')