/
lodbrok.py
99 lines (74 loc) · 3.41 KB
/
lodbrok.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import argparse

import numpy as np
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, Reshape, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
# Fix: take to_categorical from tensorflow.keras, not standalone Keras.
# `keras.utils.np_utils` has been removed in modern Keras releases, and mixing
# standalone Keras with the tensorflow.keras objects used everywhere else in
# this file breaks under TF2.
from tensorflow.keras.utils import to_categorical

description = """ This script assumes to be run at 'models' directory.
It requires 'spambrainz_dataset.pickle'. The purpose of
the script is train the LodBrok model to classify spam and
non_spam editor accounts.The output of this script is
'lodbrok1.h5'."""
parser = argparse.ArgumentParser(description=description)
def get_model() -> Model:
    """Build and compile the LodBrok spam-classification model.

    Inputs (by layer name):
        main_input:    shape (9,)   -- numeric account features.
        email_input:   shape (1,)   -- int32 email-domain token id (vocab <= 1025).
        website_input: shape (1,)   -- int32 website-domain token id (vocab <= 1025).
        bio_input:     shape (512,) -- encoded biography vector.

    Returns:
        A compiled Model whose softmax output has 2 classes
        (non-spam / spam), matching the one-hot targets produced by
        ``to_categorical(..., 2)`` in train/evaluate below.
    """
    main_input = Input(shape=(9,), name="main_input")
    email_input = Input(shape=(1,), dtype="int32", name="email_input")
    website_input = Input(shape=(1,), dtype="int32", name="website_input")
    bio_input = Input(shape=(512,), name="bio_input")

    # Embed each categorical domain token, then feed it through its own LSTM.
    email_embedding = Embedding(output_dim=256, input_dim=1025, input_length=1, name="email_embedding")(email_input)
    website_embedding = Embedding(output_dim=256, input_dim=1025, input_length=1, name="website_embedding")(website_input)
    # LSTM expects a (timesteps, features) sequence, so lift the flat bio
    # vector into a single-timestep sequence.
    bio_reshape = Reshape((1, 512), input_shape=(512,), name="bio_reshape")(bio_input)
    email_lstm = LSTM(32, name="email_lstm")(email_embedding)
    website_lstm = LSTM(32, name="website_lstm")(website_embedding)
    bio_lstm = LSTM(64, name="bio_lstm")(bio_reshape)

    # Merge the three sequence branches with the raw numeric features and
    # classify through two dropout-regularized dense layers.
    merge = concatenate([website_lstm, email_lstm, bio_lstm, main_input], name="merge")
    dropout1 = Dropout(0.5, name="dropout_1")(merge)
    dense1 = Dense(64, activation="tanh", name="dense_1")(dropout1)
    dropout2 = Dropout(0.5, name="dropout_2")(dense1)
    dense2 = Dense(64, activation="tanh", name="dense_2")(dropout2)
    output = Dense(2, activation="softmax", name="output")(dense2)

    model = Model(inputs=[main_input, website_input, email_input, bio_input], outputs=[output])
    # Fix: use cross-entropy for one-hot softmax targets. MSE on softmax
    # probabilities yields weak, poorly scaled gradients for classification.
    model.compile(optimizer=Adam(), loss="categorical_crossentropy", metrics=["acc"])
    return model
def train_model(model: Model, dataset: np.ndarray, callbacks: list = None) -> None:
    """Train *model* in place on a SpamBrainz dataset matrix.

    Expected column layout of ``dataset`` (one row per editor account):
        col 0      label (fed through ``to_categorical(..., 2)``)
        cols 1-9   numeric features        -> "main_input"
        col 10     email-domain token      -> "email_input"
        col 11     website-domain token    -> "website_input"
        cols 12+   encoded bio vector      -> "bio_input"

    Trains for 3 epochs, batch size 32, holding out 20% for validation.
    """
    features = {
        "main_input": dataset[:, 1:10],
        "email_input": dataset[:, 10],
        "website_input": dataset[:, 11],
        "bio_input": dataset[:, 12:],
    }
    labels = to_categorical(dataset[:, 0], 2)
    model.fit(
        features,
        labels,
        epochs=3,
        batch_size=32,
        callbacks=callbacks,
        validation_split=0.2,
    )
def evaluate_model(model: Model, dataset: np.ndarray) -> None:
    """Evaluate *model* on *dataset* and print the scores and metric names.

    ``dataset`` uses the same column layout as ``train_model``: label in
    col 0, then main/email/website/bio feature columns.
    """
    features = {
        "main_input": dataset[:, 1:10],
        "email_input": dataset[:, 10],
        "website_input": dataset[:, 11],
        "bio_input": dataset[:, 12:],
    }
    targets = to_categorical(dataset[:, 0], 2)
    scores = model.evaluate(features, targets, batch_size=32)
    print(scores)
    print(model.metrics_names)
def load_model(path: str) -> Model:
    """Rebuild the LodBrok architecture and load trained weights from *path*."""
    lodbrok = get_model()
    lodbrok.load_weights(path)
    return lodbrok
if __name__ == "__main__":
    import pickle
    import datetime  # used by the optional snapshot line at the bottom
    # Fix: import the TensorBoard callback from tensorflow.keras, matching the
    # tensorflow.keras Model built above; mixing standalone Keras callbacks
    # with a tf.keras model fails under TF2.
    from tensorflow.keras.callbacks import TensorBoard

    # Fix: actually invoke the module-level parser so `--help` shows the
    # description and unexpected arguments are rejected; previously the
    # parser was constructed but never used.
    parser.parse_args()

    # Load the pre-built training matrix (see train_model for column layout).
    with open("../data/spambrainz_dataset.pickle", "rb") as f:
        training_data = pickle.load(f)

    tensorboard = TensorBoard(log_dir="./logs", write_graph=True, histogram_freq=0)
    m = get_model()
    train_model(m, training_data, [tensorboard])
    m.save("weights/lodbrok1.h5")
    # m.save_weights("snapshots/lodbrok-{}.h5py".format(datetime.datetime.now().isoformat()))