# <a id='0'>目录</a>

- <a href='#1'>介绍</a>
- <a href='#2'>加载库</a>  
- <a href='#3'>特征工程</a> 

# <a id='1'>介绍</a>  
神经网络预测

# <a id='2'>加载库和数据</a>  

In [1]:
import gc
import os
import time
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [2]:
PATH="../../datasets/trainsets/"
os.listdir(PATH) # 输出该目录下的文件

['train.csv']

In [3]:
%%time
train_df = pd.read_csv(PATH+'train.csv', dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64},nrows = 150000*1000)
# train_df = pd.read_csv(PATH+'train.csv', dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})
print("Train: rows:{} cols:{}".format(train_df.shape[0], train_df.shape[1])) 

Train: rows:150000000 cols:2
Wall time: 27.2 s


In [4]:
pd.options.display.precision = 15  # Set number's precison
pd.options.display.max_rows = 50 
train_df.head(10)  # Output head 10.

Unnamed: 0,acoustic_data,time_to_failure
0,12,1.4690999832
1,6,1.4690999821
2,8,1.469099981
3,5,1.4690999799
4,8,1.4690999788
5,8,1.4690999777
6,9,1.4690999766
7,7,1.4690999755
8,-5,1.4690999744
9,3,1.4690999733


# <a id='3'>特征工程</a>  
从测试集看到，每个cvs文件包含15000行数据，所以特征提取也以15000行为单位。

In [5]:
rows = 150000  # 每组数据集合的数据个数
segments = int(np.floor(train_df.shape[0] / rows)) # 所有数据行数 / 每一个segment的个数
print("Number of segments: ", segments)

Number of segments:  1000


In [6]:
# train_X = pd.DataFrame(index=range(segments), dtype=np.float64)
# train_y = pd.DataFrame(index=range(segments), dtype=np.float64, columns=['time_to_failure'])
train_X = np.zeros((segments,rows),dtype=np.float32)
train_y = np.zeros([segments,1],dtype=np.float32)
print("lenX = {}\r\nlenY = {}\r\n".format(np.shape(train_X),np.shape(train_y)))

lenX = (1000, 150000)
lenY = (1000, 1)



In [7]:

for segment in tqdm(range(segments)):
    # 提取每个segment的数据，共150000个
    seg = train_df.iloc[segment*rows:segment*rows+rows]
    xc = pd.Series(seg['acoustic_data'].values)
    yc = seg['time_to_failure'].values[-1]  # 最后一个数据的距离地震的时间作为训练样本的时间
    train_y[segment] = yc
    train_X[segment] = xc
    


100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1603.85it/s]


In [8]:
print(np.shape(train_X))
print(train_X.dtype)
# print(train_y.head(10))

(1000, 150000)
float32


# <a id='4'>模型</a>  

超参数设置

In [9]:
lr = 0.001
training_iters =100000
batch_size = 8

神经网络参数

In [10]:
n_inputs = rows # 输入层
n_steps = 1
n_hidden_units = 128 # 隐藏层的神经元个数
n_classes = 1 # 输出的数量

要输入数据及权重

In [11]:
x = tf.placeholder(tf.float32,shape = [None,n_steps,n_inputs])
y = tf.placeholder(tf.float32,shape = [None,n_classes])

In [12]:
weights = {
    'in':tf.Variable(tf.random_normal([n_inputs , n_hidden_units])),
    'out':tf.Variable(tf.random_normal([n_hidden_units , n_classes]))
}

In [13]:
biases = {
    'in':tf.Variable(tf.constant(0.1,shape = [n_hidden_units,])),
    'out':tf.Variable(tf.constant(0.1,shape = [n_classes,]))
}

RNN模型

In [45]:
def RNN(X,weights,biases):
    print(np.shape(X))
    X = tf.reshape(X,[-1,n_inputs])
    print(np.shape(X))
    X_in = tf.matmul(X,weights['in'])+biases['in']
    X_in = tf.expand_dims(X_in, axis=1)#     X_in = tf.reshape(X_in,[-1,n_hidden_units])
    print(np.shape(X))
    # LSTM
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(n_hidden_units,forget_bias = 1.0,
                                            state_is_tuple=True)
    # 初始化为0，lstm单元由两部分组成,(c_state,h_state)
    init_state = lstm_cell.zero_state(batch_size,dtype = tf.float32)
    
    # dynamic_rnn接收张量(batch,steps,inputs)
    outputs,final_state = tf.nn.dynamic_rnn(lstm_cell,X_in,initial_state = init_state,time_major = False)
    
    X_out = tf.matmul(final_state[1],weights['out'])+biases['out']
    print(np.shape(X_out))
    return X_out
    

定义损失函数和优化器

In [15]:
pred = RNN(x,weights,biases)
# cost = tf.metrics.mean_absolute_error(labels = y,predictions = pred)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = pred,labels = y))
train_op = tf.train.AdamOptimizer(lr).minimize(cost)


(?, 1, 150000)
(?, 150000)
(?, 150000)
(2, 1)


定义模型预测结果及准确率计算方法

In [16]:
correct_pred = tf.equal(tf.argmax(pred,1),tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred,tf.float32))

In [17]:
def get_Batch(data, label, batch_size):
    print(data.shape, label.shape)
    input_queue = tf.train.slice_input_producer([data, label], num_epochs=1, shuffle=True, capacity=32 ) 
    x_batch, y_batch = tf.train.batch(input_queue, batch_size=batch_size, num_threads=1, capacity=32, allow_smaller_final_batch=False)
    return x_batch, y_batch

# <a id='5'>训练</a>  

In [18]:
x_batch, y_batch = get_Batch(train_X, train_y, batch_size)
x_batch = tf.expand_dims(x_batch, axis=1)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    epoch = 0
    sess.run(tf.local_variables_initializer())
    # 开启协调器
    coord = tf.train.Coordinator()
    # 使用start_queue_runners 启动队列填充
    threads = tf.train.start_queue_runners(sess, coord)
    try:
        while epoch * batch_size < training_iters:
    #         print("IN")
    #         data,label = sess.run([x_batch, y_batch])
    #         print("IN")
            data, label = sess.run([x_batch, y_batch])

            sess.run(train_op, feed_dict={x: data, y: label})
            train_accuracy = accuracy.eval({x: data, y: label})
            print("Epoch %d, Training accuracy %g" % (epoch, train_accuracy))
            epoch = epoch + 1
    except tf.errors.OutOfRangeError:
        print("----- Train End -----")
    finally:
        # 协调器coord发出所有线程终止信号
        coord.request_stop()
        print('---Programm end---')   
    coord.join(threads)  # 把开启的线程加入主线程，等待threads结束

(1000, 150000) (1000, 1)
Epoch 0, Training accuracy 1
Epoch 1, Training accuracy 1
Epoch 2, Training accuracy 1
Epoch 3, Training accuracy 1
Epoch 4, Training accuracy 1
Epoch 5, Training accuracy 1
Epoch 6, Training accuracy 1
Epoch 7, Training accuracy 1
Epoch 8, Training accuracy 1
Epoch 9, Training accuracy 1
Epoch 10, Training accuracy 1
Epoch 11, Training accuracy 1
Epoch 12, Training accuracy 1
Epoch 13, Training accuracy 1
Epoch 14, Training accuracy 1
Epoch 15, Training accuracy 1
Epoch 16, Training accuracy 1
Epoch 17, Training accuracy 1
Epoch 18, Training accuracy 1
Epoch 19, Training accuracy 1
Epoch 20, Training accuracy 1
Epoch 21, Training accuracy 1
Epoch 22, Training accuracy 1
Epoch 23, Training accuracy 1
Epoch 24, Training accuracy 1
Epoch 25, Training accuracy 1
Epoch 26, Training accuracy 1
Epoch 27, Training accuracy 1
Epoch 28, Training accuracy 1
Epoch 29, Training accuracy 1
Epoch 30, Training accuracy 1
Epoch 31, Training accuracy 1
Epoch 32, Training accura

Epoch 267, Training accuracy 1
Epoch 268, Training accuracy 1
Epoch 269, Training accuracy 1
Epoch 270, Training accuracy 1
Epoch 271, Training accuracy 1
Epoch 272, Training accuracy 1
Epoch 273, Training accuracy 1
Epoch 274, Training accuracy 1
Epoch 275, Training accuracy 1
Epoch 276, Training accuracy 1
Epoch 277, Training accuracy 1
Epoch 278, Training accuracy 1
Epoch 279, Training accuracy 1
Epoch 280, Training accuracy 1
Epoch 281, Training accuracy 1
Epoch 282, Training accuracy 1
Epoch 283, Training accuracy 1
Epoch 284, Training accuracy 1
Epoch 285, Training accuracy 1
Epoch 286, Training accuracy 1
Epoch 287, Training accuracy 1
Epoch 288, Training accuracy 1
Epoch 289, Training accuracy 1
Epoch 290, Training accuracy 1
Epoch 291, Training accuracy 1
Epoch 292, Training accuracy 1
Epoch 293, Training accuracy 1
Epoch 294, Training accuracy 1
Epoch 295, Training accuracy 1
Epoch 296, Training accuracy 1
Epoch 297, Training accuracy 1
Epoch 298, Training accuracy 1
Epoch 29

# <a id='6'>预测</a>  

In [41]:
submission = pd.read_csv('../../sample_submission.csv',index_col='seg_id')
print(np.shape(submission))
test_X = np.zeros((np.shape(submission)[0],rows),dtype = np.float32)
print(np.shape(test_X))
submission.head(5)

(2624, 1)
(2624, 150000)


Unnamed: 0_level_0,time_to_failure
seg_id,Unnamed: 1_level_1
seg_00030f,0
seg_0012b5,0
seg_00184e,0
seg_003339,0
seg_0042cc,0


In [42]:
# iterate over all seg_ids
for i, seg_id in enumerate(tqdm((submission.index))):
    seg = pd.read_csv('F:/Softcodes/Python/LANL_Earthquake/datasets/testsets/' + seg_id + '.csv')
    xc = pd.Series(seg['acoustic_data'].values)
    test_X[i] = xc

100%|██████████████████████████████████████████████████████████████████████████████| 2624/2624 [00:43<00:00, 35.88it/s]


In [43]:
test_X = np.reshape(test_X,[np.shape(test_X)[0],1,rows])
print(test_X[:5])
print(np.shape(test_X))


[[[ 4.  0. -2. ...,  6.  9.  8.]]

 [[ 5.  8.  8. ...,  2.  1.  3.]]

 [[ 8.  2.  3. ..., -1.  2.  5.]]

 [[ 2.  6.  3. ...,  2. -2.  9.]]

 [[ 5.  3.  1. ...,  2.  5.  4.]]]
(2624, 1, 150000)


In [44]:
test_predictions = RNN(test_X[:1000],weights,biases)

(1000, 1, 150000)
(1000, 150000)
(1000, 150000)


ValueError: Variable rnn/basic_lstm_cell/kernel already exists, disallowed. Did you mean to set reuse=True in VarScope? Originally defined at:

  File "D:\Softwares\Python\Anaconda\lib\site-packages\tensorflow\python\framework\ops.py", line 1269, in __init__
    self._traceback = _extract_stack()
  File "D:\Softwares\Python\Anaconda\lib\site-packages\tensorflow\python\framework\ops.py", line 2506, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "D:\Softwares\Python\Anaconda\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 767, in apply_op
    op_def=op_def)
