In [1]:
import numpy as np # 导入NumPy
import pandas as pd # 导入Pandas
df_train = pd.read_csv('../input/new-earth/exoTrain.csv') # 导入训练集
df_test = pd.read_csv('../input/new-earth/exoTest.csv') # 导入测试集
print(df_train.head()) # 输入头几行数据
print(df_train.info()) # 输出训练集信息

   LABEL   FLUX.1   FLUX.2   FLUX.3   FLUX.4   FLUX.5   FLUX.6  FLUX.7  \
0      2    93.85    83.81    20.10   -26.98   -39.56  -124.71 -135.18   
1      2   -38.88   -33.83   -58.54   -40.09   -79.31   -72.81  -86.55   
2      2   532.64   535.92   513.73   496.92   456.45   466.00  464.50   
3      2   326.52   347.39   302.35   298.13   317.74   312.70  322.33   
4      2 -1107.21 -1112.59 -1118.95 -1095.10 -1057.55 -1034.48 -998.34   

    FLUX.8  FLUX.9  ...  FLUX.3188  FLUX.3189  FLUX.3190  FLUX.3191  \
0   -96.27  -79.89  ...     -78.07    -102.15    -102.15      25.13   
1   -85.33  -83.97  ...      -3.28     -32.21     -32.21     -24.89   
2   486.39  436.56  ...     -71.69      13.31      13.31     -29.89   
3   311.31  312.42  ...       5.71      -3.73      -3.73      30.05   
4 -1022.71 -989.57  ...    -594.37    -401.66    -401.66    -357.24   

   FLUX.3192  FLUX.3193  FLUX.3194  FLUX.3195  FLUX.3196  FLUX.3197  
0      48.57      92.54      39.32      61.42       5.08  

In [2]:
from sklearn.utils import shuffle # 导入乱序工具
df_train = shuffle(df_train) # 乱序训练集
df_test = shuffle(df_test)  # 乱序测试集

In [3]:
X_train = df_train.iloc[:, 1:].values # 构建特征集（训练）
y_train = df_train.iloc[:, 0].values # 构建标签集（训练）
X_test = df_test.iloc[:, 1:].values # 构建特征集（测试）
y_test = df_test.iloc[:, 0].values # 构建标签集（测试）
y_train = y_train - 1 # 标签转换成惯用的(0，1)分类
y_test = y_test - 1 # 标签转换成惯用的(0，1)分类
print (X_train) # 打印训练集中的特征
print (y_train) # 打印训练集中的标签

[[ 4.97000e+00  5.98000e+00  7.95000e+00 ...  1.64000e+00 -3.01000e+00
   1.77000e+00]
 [ 2.12900e+01  3.01800e+01  3.39500e+01 ... -4.67200e+01 -3.84700e+01
  -2.03200e+01]
 [-1.82180e+02 -1.62970e+02 -1.66720e+02 ...  2.77040e+02  2.56200e+02
   2.40700e+02]
 ...
 [-2.46400e+01 -2.52000e+01 -1.42300e+01 ... -1.26000e+00 -1.74000e+00
   6.22000e+00]
 [ 6.57500e+01  5.99200e+01  5.75100e+01 ...  2.24700e+01  2.45800e+01
   1.95500e+01]
 [ 1.51076e+03  1.33869e+03  1.31544e+03 ...  5.61320e+02  5.89190e+02
   5.44000e+02]]
[0 0 0 ... 0 0 0]


In [4]:
X_train = np.expand_dims(X_train, axis=2) # 张量升阶，以满足序列数据集的要求
X_test = np.expand_dims(X_test, axis=2) # 张量升阶，以满足序列数据集的要求

In [5]:
from keras.models import Sequential # 导入序贯模型
from keras import layers # 导入所有类型的层
from keras.optimizers import Adam # 导入优化器
model = Sequential() # 序贯模型
model.add(layers.Conv1D(32, kernel_size=10, strides=4,
          input_shape=(3197, 1))) # 1D CNN层
model.add(layers.MaxPooling1D(pool_size=4, strides=2)) # 池化层
model.add(layers.GRU(256, return_sequences=True)) # 关键，GRU层够要大
model.add(layers.Flatten()) # 展平
model.add(layers.Dropout(0.5)) # Dropout层
model.add(layers.BatchNormalization()) # 批标准化   
model.add(layers.Dense(1, activation='sigmoid')) # 分类输出层
opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, decay=0.01) # 设置优化器
model.compile(optimizer=opt, # 优化器
              loss = 'binary_crossentropy', # 交叉熵
              metrics=['accuracy']) # 准确率

Using TensorFlow backend.


In [6]:
history = model.fit(X_train,y_train, # 训练集
                    validation_split = 0.2, # 部分训练集数据拆分成验证集
                    batch_size = 128, # 批量大小
                    epochs = 4, # 训练轮次
                    shuffle = True) # 乱序

Train on 4069 samples, validate on 1018 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [7]:
from sklearn.metrics import classification_report # 分类报告
from sklearn.metrics import confusion_matrix # 混淆矩阵
y_prob = model.predict(X_test) # 对测试集进行预测
y_pred =  np.where(y_prob > 0.5, 1, 0) #将概率值转换成真值
cm = confusion_matrix(y_pred, y_test)
print('Confusion matrix:\n', cm, '\n')
print(classification_report(y_pred, y_test))

Confusion matrix:
 [[565   5]
 [  0   0]] 

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       570
           1       0.00      0.00      0.00         0

    accuracy                           0.99       570
   macro avg       0.50      0.50      0.50       570
weighted avg       1.00      0.99      1.00       570



  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# for i in range(len(y_prob)):
#      if y_prob[i] >= 0.5: 
#         y_pred[i] = 1
#      else:
#         y_pred[i] = 0

In [13]:
y_pred =  np.where(y_prob > 0.15, 1, 0) # 进行阈值调整
cm = confusion_matrix(y_pred, y_test) 
print('Confusion matrix:\n', cm, '\n')
print(classification_report(y_pred, y_test))

Confusion matrix:
 [[556   4]
 [  9   1]] 

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       560
           1       0.20      0.10      0.13        10

    accuracy                           0.98       570
   macro avg       0.59      0.55      0.56       570
weighted avg       0.97      0.98      0.97       570



**下面是两个函数式API的构建代码段，请读者自行研究如何使用函数式API建构更灵活的模型。**

In [10]:
from keras import layers # 导入各种层
from keras.models import Model # 导入模型
from keras.optimizers import Adam # 导入Adam优化器
input = layers.Input(shape=(3197, 1)) # Input
# 通过函数式API构建模型
x = layers.Conv1D(32, kernel_size=10, strides=4)(input)
x = layers.MaxPooling1D(pool_size=4, strides=2)(x)
x = layers.GRU(256, return_sequences=True)(x)
x = layers.Flatten()(x)
x = layers.Dropout(0.5)(x)
x = layers.BatchNormalization()(x)
output = layers.Dense(1, activation='sigmoid')(x) # Output
model = Model(input, output) 
model.summary() # 显示模型的输出
opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, decay=0.01) # 设置优化器
model.compile(optimizer=opt, # 优化器
              loss = 'binary_crossentropy', # 交叉熵
              metrics=['accuracy']) # 准确率

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 3197, 1)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 797, 32)           352       
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 397, 32)           0         
_________________________________________________________________
gru_2 (GRU)                  (None, 397, 256)          221952    
_________________________________________________________________
flatten_2 (Flatten)          (None, 101632)            0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 101632)            0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 101632)            4065

In [11]:
# 构建正向网络
input_1 = layers.Input(shape=(3197, 1))
x = layers.GRU(32, return_sequences=True)(input_1)
x = layers.Flatten()(x)
x = layers.Dropout(0.5)(x)
# 构建逆向网络
input_2 = layers.Input(shape=(3197, 1))
y = layers.GRU(32, return_sequences=True)(input_2)
y = layers.Flatten()(y)
y = layers.Dropout(0.5)(y)
# 连接两个网络
z = layers.concatenate([x, y])
output = layers.Dense(1, activation='sigmoid')(z)
model = Model([input_1,input_2], output)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 3197, 1)      0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 3197, 1)      0                                            
__________________________________________________________________________________________________
gru_3 (GRU)                     (None, 3197, 32)     3264        input_2[0][0]                    
__________________________________________________________________________________________________
gru_4 (GRU)                     (None, 3197, 32)     3264        input_3[0][0]                    
____________________________________________________________________________________________