# North Korea Missile Test

## 0. Importing libraries

In [1]:
import math
import matplotlib
matplotlib.use('Agg')
import pandas as pd
# import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn.datasets as datasets
import csv
import lightgbm as lgb

from keras.utils import to_categorical
from sklearn.preprocessing import OneHotEncoder

Using TensorFlow backend.


## 1. Loading the data

In [2]:
df = pd.read_excel('./north_korea_missile_test_database.xlsx')
successCount=0
for indexDF, rowDF in df.iterrows():
    if (df.at[indexDF, 'Test Outcome'] =='Success'):
        successCount = successCount + 1
print ('Number of samples: ', df.shape[0])
print ('Number of features: ', df.shape[1])
print ('Number of successful launches: ', successCount)
df.keys()

Number of samples:  117
Number of features:  19
Number of successful launches:  88


Index(['F1', 'Date', 'Date Entered/Updated', 'Launch Time (UTC)',
       'Missile Name', 'Missile Type', 'Launch Agency/Authority',
       'Facility Name', 'Facility Location', 'Other Name', 'Facility Latitude',
       'Facility Longitude', 'Landing Location', 'Apogee',
       'Distance Travelled', 'Confirmation Status', 'Test Outcome',
       'Additional Information', 'Source(s)'],
      dtype='object')

## 2. Displaying the header

In [3]:
df.head(5)

Unnamed: 0,F1,Date,Date Entered/Updated,Launch Time (UTC),Missile Name,Missile Type,Launch Agency/Authority,Facility Name,Facility Location,Other Name,Facility Latitude,Facility Longitude,Landing Location,Apogee,Distance Travelled,Confirmation Status,Test Outcome,Additional Information,Source(s)
0,1,1984-04-09,2016-12-23,,Scud-B,SRBM,,Tonghae Satellite Launching Ground,"Hwadae County, North Hamgyong Province",Musudan-ri,40.85,129.667,Unknown,Unknown,Unknown,Confirmed,Success,First known test by North Korea of a missile w...,http://www.nti.org/media/pdfs/north_korea_miss...
1,2,1984-04-09,2016-12-23,,Scud-B,SRBM,,Tonghae Satellite Launching Ground,"Hwadae County, North Hamgyong Province",Musudan-ri,40.85,129.667,Unknown,Unknown,Unknown,Confirmed,Success,,http://www.nti.org/media/pdfs/north_korea_miss...
2,3,1984-04-09,2016-12-23,,Scud-B,SRBM,,Tonghae Satellite Launching Ground,"Hwadae County, North Hamgyong Province",Musudan-ri,40.85,129.667,Unknown,200 km,Unknown,Confirmed,Success,,http://www.astronautix.com/g/gitdaeryung.html;...
3,4,1984-09-01,2016-12-23,,Scud-B,SRBM,,Tonghae Satellite Launching Ground,"Hwadae County, North Hamgyong Province",Musudan-ri,40.85,129.667,Unknown,Unknown,Unknown,Confirmed,Failure,,"Joseph S. Bermudez, ""A History of Ballistic Mi..."
4,5,1984-09-01,2016-12-23,,Scud-B,SRBM,,Tonghae Satellite Launching Ground,"Hwadae County, North Hamgyong Province",Musudan-ri,40.85,129.667,Unknown,200 km,Unknown,Confirmed,Failure,,http://www.astronautix.com/g/gitdaeryung.html;...


## 3. Organizing the features

In [4]:
unusable_features = ['F1', 
                    'Date Entered/Updated', 
                    'Missile Name',
                    'Launch Agency/Authority',
                    'Confirmation Status',
                    'Additional Information',
                    'Apogee',
                    'Distance Travelled', 
                    'Landing Location',
                    'Source(s)'
                    ]

categorical_features = ['Missile Type',
                         'Facility Name',
                         'Facility Location',
                         'Other Name',        
                        ]

features_missing_values = ['Launch Time (UTC)',
                           'Facility Longitude',
                           'Facility Latitude'
                          ]

features_missing_str = ['Other Name']


time_shaped_features = ['Date',
                        'Launch Time (UTC)'
                       ]


## 4. Cleaning the bad features

In [5]:
for bad_feature in unusable_features:
    df = df.drop(bad_feature, axis = 1)
    
for indexDF, rowDF in df.iterrows():
    if (df.at[indexDF, 'Test Outcome'] == 'Unknown'):
        df = df.drop(indexDF, axis = 0)

In [6]:
for bad_feature in features_missing_str:
    for indexDF, rowDF in df.iterrows():
        if (pd.isnull(df.at[indexDF, bad_feature])):
            df.at[indexDF,bad_feature] = 'Other'

In [7]:
for indexDF, rowDF in df.iterrows():
    if(df.at[indexDF, 'Test Outcome'] == 'Success'):
        df.at[indexDF,'Test Outcome'] = True
    else:
        df.at[indexDF,'Test Outcome'] = False

In [8]:
import datetime
import time

df['Date'] = pd.to_timedelta(df['Date'], errors='coerce')
df['Launch Time (UTC)'] = pd.to_timedelta(df['Launch Time (UTC)'].astype(str), errors='coerce')
df['Launch Time (UTC)'] = df['Launch Time (UTC)'].dt.total_seconds()
df['Date'] = df['Date'].dt.total_seconds()

In [9]:
for missing_feature in features_missing_values:
    for indexDF, rowDF in df.iterrows():
        if (df.at[indexDF, missing_feature] == 'Unknown'):
            df.at[indexDF,missing_feature] = None

for missing_feature in features_missing_values:
    mu = np.mean(df[missing_feature])
    sigma = np.std(df[missing_feature])
    for indexDF, rowDF in df.iterrows():
        if (pd.isnull(df.at[indexDF, missing_feature])):
            df.at[indexDF,missing_feature] = np.random.normal(loc=mu, scale=sigma)

In [10]:

from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing

for cat_feature in categorical_features:
    dummies = pd.get_dummies(df[cat_feature])
    df = pd.concat([df, dummies], axis=1)
    
for cat_feature in categorical_features:
    df = df.drop([cat_feature], axis=1)
    

## 5. Spliting the data

In [11]:
from sklearn.model_selection import train_test_split

y = df['Test Outcome']
X = df.drop(['Test Outcome'],1)

X, garboX, y, garboY = train_test_split(X, y, test_size=0.00)

trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.30)

## 6. Training the autoencoder

In [12]:
import matplotlib.pyplot as plt
from keras.layers import Input,Dense
from keras.models import Model,Sequential
from keras.wrappers.scikit_learn import KerasRegressor

np.random.seed(7)
input_dim=trainX.shape[1]
encoding_dim = 5  


input_data = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_data)
decoded = Dense(input_dim, activation='sigmoid')(encoded)
autoencoder = Model(input_data, decoded)
encoder = Model(input_data, encoded)
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
autoencoder.fit(trainX, trainX,
                epochs=100,
                batch_size=256,
                shuffle=True,
                validation_data=(testX, testX))

Train on 79 samples, validate on 35 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100


Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x2a6e48f2048>

## 7. Encoding the features

In [13]:
enc_trainX = encoder.predict(trainX)
enc_testX = encoder.predict(testX)

## 8. Training the predictor

In [14]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
# fix random seed for reproducibility
np.random.seed(7)
# split into input (X) and output (Y) variables
input_dim=enc_trainX.shape[1]
output_dim=trainy.ndim
# create model
model = Sequential()
model.add(Dense(3,  input_shape=(input_dim,),  activation='sigmoid'))
model.add(Dense(output_dim, activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Early stopping
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]
# Fit the model
history = model.fit(x = enc_trainX, y = trainy,
                  epochs=100, 
                  batch_size=256,
                  shuffle=True,
#                   callbacks=callbacks,
                  validation_data=(enc_testX, testy))

# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# evaluate the model
Tra_scores = model.evaluate(enc_trainX, trainy)
Val_scores = model.evaluate(enc_testX, testy)
tra_accuracy = Tra_scores[1]*100
val_accuracy = Val_scores[1]*100
accuracy_name = model.metrics_names[1]
print('\n Training', accuracy_name, ': ', tra_accuracy, '%')
print('\n Validation', accuracy_name, ': ', val_accuracy, '%')

Train on 79 samples, validate on 35 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100


Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100

 Training acc :  74.68354475649097 %

 Validation acc :  82.85714285714286 %


  % get_backend())


In [18]:
from sklearn.decomposition import PCA

pca = PCA(n_components=5)
trainX_pca = pca.fit_transform(trainX)
testX_pca = pca.transform(testX)

In [19]:
model.fit(x = trainX_pca, y = trainy,
                  epochs=100, 
                  batch_size=256,
                  shuffle=True,
#                   callbacks=callbacks,
                  validation_data=(testX_pca, testy))

Train on 79 samples, validate on 35 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100


Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x2a6dde37048>