In [130]:
import pprint
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
import os.path
import sys
import re
import time

nltk.download('words')
nltk.download('wordnet')

[nltk_data] Downloading package words to /home/nefarion/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nefarion/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [131]:
os.chdir("/mnt/f/Linda/Work work/Categorization")

input_file_relative = "Datasets/Dark_web_dataset.csv"
output_file_relative = "Datasets/hopefully_the_end.csv"

english_vocab = set(w.lower() for w in nltk.corpus.words.words())
top = 1000
MAX_SEQUENCE_LENGTH = top
MAX_NB_WORDS = top

### Read new generated data set file and set en_tokens to ''

In [132]:
df = pd.read_csv(input_file_relative)[['url', 'content', 'category']]
df['tokens_en', 'confidence'] = ''
# Shuffle the rows and reset the index
df = df.sample(frac=1).reset_index(drop=True)
print(f"Loaded {df.shape[0]} rows in {df.shape[1]} columns")

Loaded 4334 rows in 4 columns


### Take only English documents - above a certain threshold

In [133]:
def remove_non_english_documents(data_frame, english_tolerance = 20):
    removed = 0
    english_confidence = []
    tokens_en = []
    for i, document in data_frame.iterrows():
        english_words = 0
        text = document['content']
        
        # Remove long base-64 encoded strings, e.g. images
        text = re.sub("(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=|[A-Za-z0-9+/]{4}){24,}","", text)
        
        wordies = nltk.word_tokenize(text)

        tokens = []
        for w in wordies:
            lower = w.lower()
            if lower in english_vocab:
                tokens.append(lower)
                english_words += 1
        tokens_en.append(tokens)
        doc_english_confidence = english_words / len(wordies) * 100
        english_confidence.append(doc_english_confidence)
        if doc_english_confidence <= english_tolerance:
            removed += 1
        
        if i % 100 == 0:
            print("done {}".format(i))
        sys.stdout.flush()
        time.sleep(0.05)
        
    data_frame['english:confidence'] = english_confidence
    data_frame['tokens_en'] = tokens_en
    print(f"Removed {removed} documents considered non-english.")
    return data_frame[data_frame['english:confidence'] > english_tolerance]

In [None]:
df = remove_non_english_documents(df)

done 0
done 100
done 200
done 300
done 400
done 500
done 600
done 700
done 800
done 900
done 1000
done 1100
done 1200
done 1300
done 1400
done 1500
done 1600
done 1700
done 1800
done 1900
done 2000
done 2100
done 2200
done 2300
done 2400
done 2500
done 2600
done 2700
done 2800
done 2900
done 3000
done 3100
done 3200
done 3300


### Get labels
#### The labels array is a lookup for label (category) names

In [None]:
labels_index = {}  # dictionary mapping label name to numeric id
def get_labels(d_frame):
    data_frame = d_frame.copy(deep = True)
    df_categories = data_frame.drop_duplicates(subset = 'category')
    df_categories = df_categories['category']
    
    index = 0
    for category in df_categories:
        labels_index[category] = index
        index += 1

get_labels(df)

# Preprocess Data
#### Fill the texts array with the raw content of every page
#### Fill the labels_index array with the category id on the index position of the page

In [None]:
texts = []  # list of text samples
labels = []  # list of label ids
def prepare_text_data(data_frame):
    for i, document in data_frame.iterrows():
        text = document['content']
        texts.append(text)
        category = document['category']
        label = labels_index[category]
        labels.append(label)

prepare_text_data(df)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant

BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

### Indexing word vectors

In [None]:
# Build an index of words mapping in the embeddings set to their embedding vector

embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

print(f'Found {len(embeddings_index)} word vectors.')

### Prepare tokenizer

In [None]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

### Split the data into a training set and a validation set

In [None]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

# Prepare embedding matrix

In [None]:
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
print(num_words)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    #else:
    #    print("could not find the word {} in the embeddings dictionary".format(word))

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
use_glove_embeddings = True #it seems that using glove embeddings doesn't bring up any noticeable improvement
if use_glove_embeddings:
    input_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
else:
    input_layer = Embedding(num_words,EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)

# Prepare and train the CNN

#### prepare a 1D convnet with global maxpooling

In [None]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = input_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(labels.shape[1], activation='softmax')(x)

model = Model(sequence_input, preds)
model.summary()
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

#### train the 1D convnet with global maxpooling

In [None]:
model.fit(x_train, y_train,
          batch_size=128,
          epochs=20,
          validation_data=(x_val, y_val))

#### Evaluate the model on the test data using "evaluate"

In [None]:
print('\n# Evaluate on test data of {} examples'.format(len(x_val)))
results = model.evaluate(x_val, y_val, batch_size=128)
run_accuracy = results[1]
print('test loss, test acc:', results)

category_id_to_name_lookup = {v: k for k, v in labels_index.items()}

print('\n# Detailed results for training data set')
predictions = model.predict(x_val)
predictions_copy = predictions.copy()
for idx, val in enumerate(predictions):
    category_index = predictions[idx].argmax(axis=0)
    original_category_index = y_val[idx].argmax(axis=0)
    if original_category_index != category_index:
        print("Predicted {} should have been {}"
              .format(category_id_to_name_lookup[category_index], category_id_to_name_lookup[original_category_index]))

# Dump reusable files: tokenizer.pickle, model.json, model.h5 

In [None]:
print('\n# Dump tokenizer so it can be used for tokenizing on other data with the same word dictionary.')
import pickle
with open('Models/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
print("\n# Saving model to disk")
model_json = model.to_json()
with open("Models/DarkWebCategoryModel.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("Models/DarkWebCategoryModel.h5")

# Create a confusion matrix
#### Prepare headers

In [None]:
sorted_labels = [cat_key for cat_key in labels_index]
sorted_labels.sort()
# Initiate confusion matrix
matrix = {cat_key: {} for cat_key in sorted_labels}
for row in matrix:
    matrix[row] = {cat_key: 0 for cat_key in sorted_labels}

len_of_longest = 0
for label in sorted_labels:
    label_len = len(label)
    len_of_longest = label_len if label_len > len_of_longest else len_of_longest

#### Create the confusion matrix

In [None]:
for idx, val in enumerate(predictions_copy):
    category_index = predictions_copy[idx].argmax(axis=0)
    original_category_index = y_val[idx].argmax(axis=0)
    category = category_id_to_name_lookup[category_index]
    original_category = category_id_to_name_lookup[original_category_index]
    matrix[original_category][category] += 1

#### Simplify the confusion matrix

In [None]:
def get_percentage(res, values_sum):
    return f'{res/values_sum*100:.2f}' if values_sum else '0.00'

simple_matrix = matrix.copy()
for row in simple_matrix:
    values = simple_matrix[row].values()
    values_sum = sum(values)
    results_row = [ '{:11}'.format(f'{res}: {get_percentage(res, values_sum)}%') for res in list(values) ]
    beau_results = [res.replace('"', '') for res in results_row]
    simple_matrix[row] = beau_results

#### Print the confusion matrix

In [None]:
print(f'accuracy: {run_accuracy}')
short_labels = ['OnlnMarket', 'WebCat', 'SexContent', 'FinFraud', 'Other', 'Social', 'HacProgr', 'Gambling', 'IllegalServ']
short_labels.sort()
formatted_short_labels = ['{:11}'.format(lab) for lab in short_labels]
print("{:{len_of_longest}}, {}".format(' ', formatted_short_labels, len_of_longest=len_of_longest))
for row in simple_matrix:
    print("\n{:{len_of_longest}}, {}".format(row, simple_matrix[row], len_of_longest=len_of_longest))

print('category_lookup: ')
pprint.pprint(category_id_to_name_lookup)