# Machine-based Text Analytics of CyberSecurity Strategies
Uses machine learning to calssify sentences from CyberSecurity documents

**These labels come from the headers in the cyberwellness profiles linked above**

| Category               | Sub category |
|------------------------| -------------|
|LEGAL MEASURES          | CRIMINAL LEGISLATION, REGULATION AND COMPLIANCE|
|TECHNICAL MEASURES      | CIRT, STANDARDS, CERTIFICATION|
|ORGANIZATION MEASURES   | POLICY, ROADMAP FOR GOVERNANCE, RESPONSIBLE AGENCY, NATIONAL BENCHMARKING|
|CAPACITY BUILDING       | STANDARDISATION DEVELOPMENT, MANPOWER DEVELOPMENT, PROFESSIONAL CERTIFICATION, AGENCY CERTIFICATION|
|COOPERATION             | INTRA-STATE COOPERATION, INTRA-AGENCY COOPERATION, PUBLIC SECTOR PARTNERSHIP,  INTERNATIONAL COOPERATION|
|CHILD ONLINE PROTECTION | NATIONAL LEGISLATION,  UN CONVENTION AND PROTOCOL, INSTITUTIONAL SUPPORT, REPORTING MECHANISM|

In [1]:
# When using nltk for the first time, uncomment the following lines and run cell.
# nltk.download must only be downloaded once
import nltk
nltk.download()

showing info http://nltk.github.com/nltk_data/


True

In [2]:
# Python 3
%matplotlib inline
import numpy as np
import tensorflow as tf
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
import pickle
import random
import json
from collections import Counter
from pprint import pprint

## Preprocessing Data
The following cells read in training samples from a json file, create a lexicon from it, create arrays that store the number of occurences of each word in the lexicon, and serialize the generated list of features.

In [3]:
# Opens training data stored as Json and converts to Python list
with open('results_concatenated.json') as f:    
    data = json.load(f)

pprint(data[:3])

[{u'Country': u'Jordan',
  u'sentence': u'However, these approaches: are generally basic; not systematic; subjective; have no clear definition or boundaries, are not thorough; do not meet international standards; and do not deal effectively with threats emerging from cyberspace.',
  u'sentence_id,': u'ff30d97ab4',
  u'tag': [{u'category': u'technical measures',
            u'subcategory': [u'standards']}]},
 {u'Country': u'Jordan',
  u'sentence': u'Strategies and policies developed by the private sector should augment, comply, and be consistent with this strategy.',
  u'sentence_id,': u'e50e3676b6',
  u'tag': [{u'category': u'organization measures',
            u'subcategory': [u'policy']}]},
 {u'Country': u'Jordan',
  u'sentence': u'security policy and role-based security responsibilities will have a higher rate of success in protecting critical information.',
  u'sentence_id,': u'ddd832b614',
  u'tag': [{u'category': u'organization measures',
            u'subcategory': [u'policy']}]

In [4]:
# Splits data into 3 parts, IDs, sentences, and tags

# For testing purposes
sentence_ids = []

# Lexicons created from sentences will be inputs
sentences  = []

# Tags will be outputs
tags = []

for input_val in data:
    sentence_ids.append(input_val[u'sentence_id,'])
    sentences.append(input_val[u'sentence'])
    tags.append(input_val[u'tag'])

print("Number of training examples is {} \n".format(len(sentences)))
print("First example is \nX: {} \n\n y: {}".format(sentences[0], tags[0]))

Number of training examples is 2045 

First example is 
X: However, these approaches: are generally basic; not systematic; subjective; have no clear definition or boundaries, are not thorough; do not meet international standards; and do not deal effectively with threats emerging from cyberspace. 

 y: [{u'category': u'technical measures', u'subcategory': [u'standards']}]


In [5]:
# Creates lexicon (list of unique words) from all training samples
def create_lexicon(sentences):
    lexicon = []
    for sentence in sentences:
        for word in word_tokenize(sentence):
            root = lemmatizer.lemmatize(word.lower()).encode('utf-8')
            if len(root) > 1 and root not in stop and root not in lexicon:
                lexicon.append(root)
    
    return lexicon

In [6]:
# Creates 2D array containing number of occurences of each word in lexicon in each sample
def produce_X(sentences, lexicon):
    X = []
    for sentence in sentences:
        X_sample = [0 for _ in lexicon]
        for word in word_tokenize(sentence):
            root = lemmatizer.lemmatize(word.lower()).encode('utf-8')
            if root in lexicon:
                X_sample[lexicon.index(root)] += 1
        
        X.append(X_sample)
    
    return np.array(X)

In [7]:
sample_lexicon = create_lexicon(sentences)
pprint(sample_lexicon[:6])

['however', 'approach', 'generally', 'basic', 'systematic', 'subjective']


In [8]:
X = produce_X(sentences, sample_lexicon)

print(X)

[[1 1 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 1 1 1]]


In [9]:
# Pickles features generated for reuse

with open('sample_X.npy', 'wb') as f:
    np.save(f, X)

### Categories (index 0-5)
0. LEGAL MEASURES
1. TECHNICAL MEASURES
2. ORGANIZATION MEASURES
3. CAPACITY BUILDING
4. COOPERATION
5. CHILD ONLINE PROTECTION

> Categories will be stored as a 1D array with each number corresponding to a category listed above. 

In [10]:
# Dictionary stores label names and corresponding index to be turned on in the one hot vector.
category_dict = {
    u'LEGAL MEASURES' : 0,
    u'TECHNICAL MEASURES' : 1,
    u'ORGANIZATION MEASURES' : 2,
    u'CAPACITY BUILDING' : 3,
    u'COOPERATION' : 4,
    u'CHILD ONLINE PROTECTION' : 5
}

In [11]:
def produce_y(tags):
    return np.array([category_dict[tag[0][u'category'].upper()] for tag in tags])

In [12]:
y = produce_y(tags)

print(y)

[1 2 2 ..., 2 1 2]


In [13]:
with open('tags.npy', 'wb') as f:
    np.save(f, y)

## Tensorflow Boilerplate
To simplify the Tensorflow code, we will define a set of functions to delare variables.

In [14]:
def weight_variable(shape):
  initial = tf.truncated_normal(shape, stddev=0.1)
  return tf.Variable(initial)

def bias_variable(shape):
  initial = tf.constant(0.1, shape=shape)
  return tf.Variable(initial)

In [15]:
def fc_layer(X, W, b, name='fc'):
    with tf.name_scope(name):
        return tf.nn.relu(tf.matmul(X, W) + b)

## Loading the Data
Now that the data has been processed it is now time to load the data and fit a model to it

In [16]:
with open("sample_X.npy","rb") as f:
    X = np.load(f)

print(X)

[[1 1 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 1 1 1]]


In [91]:
with open("tags.npy","rb") as f:
    y = np.load(f)

    print(y)

[1 2 2 ..., 2 1 2]


In [70]:
number_of_options = 6

In [92]:
def one_hot(vector):
    def hot_or_not(i, j):
        return 1 if i == j else 0
    return np.array([[int(hot_or_not(i, j)) for j in range(number_of_options)] for i in vector])

In [68]:
def next_batch(X, y, batch_size=100):
    n_batches = len(X) / batch_size
    
    for batch in range(n_batches):
        start = (batch * batch_size) % len(X)
        end = start + batch_size if start + batch_size < len(X) else len(X) - 1
        yield X[start:end], y[start:end]

## Constructing the Model
Now we can create a neural network to fit the data.

In [66]:
lexicon_size = len(X[0])

In [96]:
X_placeholder = tf.placeholder(tf.float32, [None, lexicon_size])
y_placeholder = tf.placeholder(tf.float32, [None, number_of_options])

In [97]:
w1 = weight_variable([lexicon_size, 10])
b1 = bias_variable([10])

model = fc_layer(X_placeholder, w1, b1, name='fc1')

In [98]:
w2 = weight_variable([10, 10])
b2 = bias_variable([10])

model = fc_layer(model, w2, b2, name='fc2')

In [99]:
w3 = weight_variable([10, number_of_options])
b3 = bias_variable([number_of_options])

y_predicted = tf.matmul(model, w3) + b3

In [100]:
cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=y_placeholder, logits=y_predicted))

optimizer = tf.train.AdamOptimizer(0.5).minimize(cross_entropy)

## Splitting up the Data
Now we have to split up the data into train and test 

In [101]:
TEST_SIZE = 0.9
y_hot = one_hot(y)
print(y_hot)

X_split_index = int(len(X)*TEST_SIZE)
y_split_index = int(len(y_hot)*TEST_SIZE)

X_train, X_test = X[:X_split_index], X[X_split_index:]
y_train, y_test = y_hot[:y_split_index], y_hot[y_split_index:]

print(len(X_train))
print(len(y_train))

[[0 1 0 0 0 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 ..., 
 [0 0 1 0 0 0]
 [0 1 0 0 0 0]
 [0 0 1 0 0 0]]
1840
1840


## Running the model

In [None]:
    with tf.Session() as sess:
        print("Session starting")
        
        def accuracy(Xt, yt):
            correct_prediction = tf.equal(tf.argmax(y_predicted,1), tf.argmax(y_placeholder,1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
            return sess.run(accuracy, feed_dict={X_placeholder: Xt, y_placeholder: yt})

        sess.run(tf.global_variables_initializer())

        for epoch in range(100):
            epoch_loss = 0
            avg_cost = 0.0
            for i, (batch_X, batch_y) in enumerate(next_batch(X_train, y_train)):
                print('batch number: {}'.format(i))
                # print("\nBatch {}\n".format(batch_y))
                sess.run(optimizer, feed_dict={X_placeholder: batch_X, y_placeholder: batch_y})
                print("Train accuracy {}".format(accuracy(X_train, y_train)))

        print("Final Train accuracy {}".format(accuracy(X_train, y_train)))

Session starting
batch number: 0
Train accuracy 0.429891318083
batch number: 1
Train accuracy 0.483695656061
batch number: 2
Train accuracy 0.483695656061
batch number: 3
Train accuracy 0.483695656061
batch number: 4
Train accuracy 0.541304349899
batch number: 5
Train accuracy 0.526630461216
batch number: 6
Train accuracy 0.4375
batch number: 7
