In [1]:
import numpy as np
import keras
import pickle
import sys

from keras.preprocessing.text import Tokenizer

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.models import model_from_json

from itertools import product

In [20]:
def bigram_generator(x):
    """Yield every pair of adjacent items of ``x`` as a length-2 slice, left to right.

    ``x`` can be any sliceable sequence with a length (``str``, ``bytes``,
    ``list``, ...). Sequences with fewer than two items yield nothing.
    """
    for start in range(len(x) - 1):
        yield x[start:start + 2]


def string2matrix(in_str):
    """Encode one string as a single multi-hot row of bigram features.

    Parameters
    ----------
    in_str : str or bytes
        Text to encode. ``str`` input is UTF-8 encoded first, because the
        bigrams stored in ``bigram2ind`` are ``bytes`` objects; ``bytes``
        input is used unchanged.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, len(vocab))`` holding 1 in the column of every
        bigram that occurs in the input and 0 elsewhere.

    Raises
    ------
    KeyError
        If the input contains a bigram that is not a key of ``bigram2ind``.
    """
    # The lookup table is keyed by bytes, so a str bigram would never match.
    if isinstance(in_str, str):
        in_str = in_str.encode('utf-8')

    arr = np.zeros((1, len(vocab)))
    for bigram in bigram_generator(in_str):
        arr[0, bigram2ind[bigram]] = 1
    return arr


def accuracy(test_set, true_labels):
    """Return the fraction of ``test_set`` that the global ``model`` labels correctly.

    Parameters
    ----------
    test_set : sequence of str or bytes
        Strings to classify. ``str`` items are UTF-8 encoded before the bigram
        lookup, because the keys of ``bigram2ind`` are ``bytes``.
    true_labels : sequence of numbers
        Expected class index for each item of ``test_set``, in the same order.

    Returns
    -------
    numpy.float64
        Share of items whose predicted class index equals the expected one.

    Raises
    ------
    ValueError
        If ``test_set`` is empty or its length differs from ``true_labels``.
    KeyError
        If an item contains a bigram that is not a key of ``bigram2ind``.
    """
    n_samples = len(test_set)
    if n_samples == 0:
        raise ValueError("accuracy() needs at least one test string")
    if len(true_labels) != n_samples:
        raise ValueError(
            "test_set and true_labels differ in length: {} != {}".format(
                n_samples, len(true_labels)))

    # Multi-hot encode: row i has a 1 in the column of each bigram of item i.
    matrix = np.zeros((n_samples, len(vocab)))
    for i, x in enumerate(test_set):
        if isinstance(x, str):
            x = x.encode('utf-8')
        for bigram in bigram_generator(x):
            matrix[i, bigram2ind[bigram]] = 1

    pred = np.argmax(model.predict(matrix), axis=1)

    # Compare by equality rather than bitwise XOR, so the count is right for
    # any label values and dtypes, not only for 0/1 integers.
    return np.mean(pred == np.asarray(true_labels))


def eval_list(strings, ignore_invalid=False, warnings=False):
    """Classify several strings with the global ``model`` in one batch.

    Parameters
    ----------
    strings : sequence of str or bytes
        Items to classify. ``str`` items are UTF-8 encoded before the bigram
        lookup; ``bytes`` items are used unchanged.
    ignore_invalid : bool, optional
        If true, bigrams that are not keys of ``bigram2ind`` are skipped.
        If false (default), the first such bigram raises ``KeyError``.
    warnings : bool, optional
        Only used when ``ignore_invalid`` is true: print one message to
        ``sys.stderr`` for every skipped bigram.

    Returns
    -------
    zip
        Single-pass iterator of ``(item, predicted_class_index)`` pairs in
        input order.

    Raises
    ------
    KeyError
        If an unknown bigram is met and ``ignore_invalid`` is false.
    """
    matrix = np.zeros((len(strings), len(vocab)))
    for i, x in enumerate(strings):
        # bytes(x, 'utf-8') rejects bytes input, so encode only real str items.
        raw = x.encode('utf-8') if isinstance(x, str) else x
        for bigram in bigram_generator(raw):
            try:
                j = bigram2ind[bigram]
            except KeyError:
                # Only a failed lookup is expected here; any other exception
                # is a real bug and must propagate.
                if not ignore_invalid:
                    raise
                if warnings:
                    print("Invalid character in string {}".format(x),
                        file=sys.stderr)
                continue
            matrix[i, j] = 1

    pred = np.argmax(model.predict(matrix), axis=1)

    return zip(strings, pred)


def eval_string(string):
    """Print the label the global ``model`` predicts for ``string`` and its confidence.

    The input is turned into features by ``string2matrix``. Class index 0 is
    reported as 'non-random' and index 1 as 'random'; the confidence is the
    model's score for the winning class, shown as a percentage. Returns None.
    """
    class_names = ['non-random', 'random']
    scores = model.predict(string2matrix(string))
    best = np.argmax(scores)
    confidence = scores[0][best] * 100
    print("label: %s; confidence: %f%%" % (class_names[best], confidence))


In [2]:
# Read the first two tab-separated columns of train.tsv. dtype=None lets NumPy
# infer each column's type; comments=None turns off comment-character handling.
training_data = np.genfromtxt(
    'train.tsv',
    delimiter='\t',
    usecols=(0, 1),
    dtype=None,
    comments=None,
)

# Column 0 becomes the model input, column 1 the label array.
train_x = [row[0] for row in training_data]
train_y = np.asarray([row[1] for row in training_data])

def bigram_generator(x):
    """Yield every length-2 slice of ``x``, from left to right.

    Works on any sliceable sequence with a length; inputs shorter than two
    items yield nothing. This repeats the ``bigram_generator`` defined earlier
    in the notebook so that this cell can run on its own.
    """
    for i in range(2, len(x) + 1):
        yield x[i-2:i]


In [4]:
# Alphabet: the printable ASCII characters from space through '~'.
letters = ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~'

bigram2ind = {}   # bigram (bytes) -> column index
vocab = []        # column index   -> bigram (bytes)
vocab_size = 0

# Number every ordered pair of alphabet characters in iteration order,
# registering each distinct bigram exactly once.
for first, second in product(letters, repeat=2):
    bigram = (first + second).encode('utf-8')
    if bigram not in bigram2ind:
        bigram2ind[bigram] = vocab_size
        vocab.append(bigram)
        vocab_size += 1

In [5]:
# Multi-hot encode the training strings: row i gets a 1 in the column of
# every bigram that occurs in train_x[i]; all other entries stay 0.
matrix = np.zeros((len(train_x), vocab_size))
for i, x in enumerate(train_x):
    for bigram in bigram_generator(x):
        j = bigram2ind[bigram]
        matrix[i][j] = 1

# NOTE(review): train_x and train_y are rebound here to their encoded forms
# (feature matrix / 2-class one-hot labels), so this cell presumably cannot be
# re-run without re-running the data-loading cell first - confirm.
train_x = matrix
train_y = keras.utils.to_categorical(train_y, 2)


In [6]:
# Feed-forward classifier over the bigram features: two hidden layers
# (512 ReLU units, then 256 sigmoid units), each followed by 50% dropout,
# and a 2-way softmax output.
model = Sequential([
    Dense(512, input_shape=(vocab_size,), activation='relu'),
    Dropout(0.5),
    Dense(256, activation='sigmoid'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [7]:
# Train for 4 epochs in mini-batches of 32, holding out 10% of the samples
# for validation and printing per-epoch progress.
# NOTE(review): per the Keras docs, validation_split takes the held-out
# samples from the end of the arrays before any shuffling; if train.tsv is
# ordered by label the validation set may be unbalanced - confirm.
model.fit(train_x, train_y,
  batch_size=32,
  epochs=4,
  verbose=1,
  validation_split=0.1,
  shuffle=True)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f2b4b098fd0>

In [8]:
# Persist the trained model as two files: the architecture as JSON text in
# model.json, and the weights in model.h5 via save_weights.
model_json = model.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)

model.save_weights('model.h5')

In [9]:
# Persist the index -> bigram list. The context manager closes (and flushes)
# the file handle, which a bare open() inside the call never did explicitly.
with open('vocab.pickle', 'wb') as vocab_file:
    pickle.dump(vocab, vocab_file)

In [10]:
# Reload the index -> bigram list and rebuild the reverse lookup from it.
# NOTE: unpickling can execute arbitrary code; only load a vocab.pickle that
# comes from a trusted source.
with open('vocab.pickle', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)
bigram2ind = {bigram: i for i, bigram in enumerate(vocab)}

In [11]:
# Rebuild the network from its JSON architecture, then restore the weights.
with open('model.json', 'r') as model_file:
    loaded_model_json = model_file.read()

model = model_from_json(loaded_model_json)
model.load_weights('model.h5')

In [13]:
# Test
#
# NOTE(review): this reads 'train.tsv', the same file the training cell
# loads, so the printed figure is accuracy on the training data rather than
# on held-out data - confirm whether a separate test file was intended.

if __name__ == '__main__':
    test_data = np.genfromtxt(
        'train.tsv',
        delimiter='\t',
        usecols=(0, 1),
        dtype=None,
        comments=None,
    )

    # Column 0 holds the strings, column 1 the expected labels.
    test_strings = [row[0] for row in test_data]
    true_labels = [row[1] for row in test_data]

    print(accuracy(test_strings, true_labels))

  


0.9931985294117647


In [21]:
# Run the standalone ./classifier script in a subshell, passing it the single
# argument "list_to_classify".
# NOTE(review): presumably that argument names a file of strings to classify
# and the script reuses the saved model/vocab files - confirm against the script.
!python3 ./classifier "list_to_classify"

2020-08-12 20:04:42.449761: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2020-08-12 20:04:44.406948: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2020-08-12 20:04:44.410155: E tensorflow/stream_executor/cuda/cuda_driver.cc:314] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2020-08-12 20:04:44.410216: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (3b9f04d29f39): /proc/driver/nvidia/version does not exist
2020-08-12 20:04:44.418743: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2300000000 Hz
2020-08-12 20:04:44.419042: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x2958bc0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-08-12 20:04:44.419085: I tensorflow/com