In [462]:
import json
import pprint
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
import ast
import os.path
from datetime import datetime
from collections import Counter

# Fetch the NLTK data packages. 'stopwords' and 'words' are read by the
# configuration cell below.
# NOTE(review): nothing visible in this notebook reads 'wordnet', while
# clean_up_sentence() calls nltk.word_tokenize, which presumably needs a
# tokenizer model that is not fetched here -- confirm it is installed.
nltk.download('stopwords')
nltk.download('words')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nefarion/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/nefarion/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nefarion/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [463]:
# Work from the project root so that the relative dataset / model paths resolve.
# NOTE(review): this absolute path is machine-specific.
os.chdir("/mnt/f/Linda/Work work/Categorization")

input_file_relative = "Datasets/Feature_dataset.csv"
output_file_relative = "Datasets/hopefully_the_end.csv"

# Every printable ASCII character (codes 32-126) that is not a letter A-Z / a-z.
char_blacklist = [
    chr(code)
    for code in range(32, 127)
    if not (65 <= code <= 90 or 97 <= code <= 122)
]

# NLTK's English stop words, extended with the single-character blacklist.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(char_blacklist)

# Lower-cased NLTK word list, used to decide whether a token is English.
english_vocab = {entry.lower() for entry in nltk.corpus.words.words()}

# Number of most frequent words kept per category / per document.
top = 100

toker = RegexpTokenizer(r'((?<=[^\w\s])\w(?=[^\w\s])|(\W))+', gaps=True)
stemmer = LancasterStemmer()

In [464]:
# Read the newly generated data set file; the path is relative to the working
# directory selected in the configuration cell.
df = pd.read_csv(input_file_relative)

# Take only English documents - above a certain threshold

In [465]:
def remove_non_english_documents(data_frame, english_tolerance = 20):
    """Keep only the rows whose tokens are mostly English.

    For every row, the 'tokens_en' cell (a string holding a Python list
    literal) is parsed and the percentage of tokens whose lower-cased form
    is in the module-level ``english_vocab`` is stored in a new
    'english:confidence' column.

    Args:
        data_frame: DataFrame with a 'tokens_en' column of list literals.
        english_tolerance: rows are kept only when their confidence is
            strictly greater than this percentage.

    Returns:
        A new DataFrame with the 'english:confidence' column added and the
        low-confidence rows removed. The input frame is left unmodified.
    """
    def english_share(raw_tokens):
        tokens = ast.literal_eval(raw_tokens)
        # A document without tokens has no evidence of being English;
        # score it 0 instead of dividing by zero.
        if not tokens:
            return 0.0
        hits = sum(1 for token in tokens if token.lower() in english_vocab)
        return hits / len(tokens) * 100

    # Work on a copy so the caller's frame does not silently gain a column.
    scored = data_frame.copy()
    scored['english:confidence'] = [english_share(raw) for raw in scored['tokens_en']]

    return scored[scored['english:confidence'] > english_tolerance]

# Drop the documents that are not confidently English.
df = remove_non_english_documents(df)
# NOTE(review): head(24) followed by tail(1) leaves at most one row, so every
# later cell (word lists, training) runs on a single document. This looks like
# a debugging leftover -- confirm before training on real data.
df = df.head(24)
df = df.tail(1)
pprint.pprint(df)

   main_category  main_category:confidence  \
23          Porn                         1   

                                                  url  \
23  http://jvoa2vvavjxsd6sa5tviw5iy254czhyxnavxvxd...   

                                              content  \
23  Tabu Child Porn CP # Tabu Child Porn cp    * H...   

                                            tokens_en  english:confidence  
23  ['tabu', 'child', 'porn', 'home', 'login', 'se...           57.017544  


# Make the most popular word list for each category

In [466]:
def most_popular_words_per_category(data_frame, top_number = top):
    """Return the most frequent stems of each category.

    Tokens are read from the 'tokens_en' column (a string holding a Python
    list literal) of every row in the category. A token is counted only if
    it has at least 3 characters, its lower-cased form is not a stop word
    and its first character is not in ``char_blacklist``. Counted tokens
    are lower-cased and stemmed with the module-level ``stemmer``.

    Args:
        data_frame: DataFrame with 'main_category' and 'tokens_en' columns.
        top_number: how many stems to keep per category.

    Returns:
        dict mapping each category to its stems, most frequent first.
        Categories appear in order of first occurrence in the frame, so the
        result is the same on every run.
    """
    # Set copies give O(1) membership tests instead of a list scan per token.
    stop_set = set(stopwords)
    blocked_first_chars = set(char_blacklist)

    words_of_category = {}
    # unique() keeps first-appearance order; iterating a set of strings
    # would give a different order on every interpreter start.
    for cat in data_frame['main_category'].unique():
        in_category = data_frame['main_category'] == cat
        stem_counts = nltk.FreqDist(
            stemmer.stem(token.lower())
            for raw_tokens in data_frame.loc[in_category, 'tokens_en']
            for token in ast.literal_eval(raw_tokens)
            # Compare the lower-cased token: the stop-word list is lower
            # case, so "The" must be rejected just like "the".
            if len(token) >= 3
            and token.lower() not in stop_set
            and token[0] not in blocked_first_chars
        )
        words_of_category[cat] = [stem for stem, _ in stem_counts.most_common(top_number)]

    return words_of_category

# Most frequent stems per category (at most `top` each, the function default).
words_frequency = most_popular_words_per_category(df)

# Dataset creation if it does not exist yet.
__The dataset is filtered by this set of rules:__
1. Main category != Not_working (exclude non-working URLs)
2. Main category:confidence > 0.5 (keep URLs whose category is likely known)
3. Non-English URLs are excluded.


### Remove most frequent words in all categories

In [467]:
def remove_clutter_words(words_per_category, top_n=15, min_categories=7):
    """Drop words that are prominent in many categories at once.

    A word is clutter when it appears among the first ``top_n`` words of
    the category lists at least ``min_categories`` times in total. Clutter
    words are removed from every category list, not only from the first
    ``top_n`` entries.

    Args:
        words_per_category: dict mapping category -> list of words, most
            frequent first. It is updated in place.
        top_n: how many leading words of each category are inspected.
        min_categories: minimum number of occurrences that makes a word
            clutter.

    Returns:
        The same dict object, with the clutter words removed.
    """
    head_counts = Counter(
        word
        for category_words in words_per_category.values()
        for word in category_words[:top_n]
    )
    # A set keeps the per-word membership test below O(1).
    clutter = {word for word, seen in head_counts.items() if seen >= min_categories}

    for cat in list(words_per_category):
        words_per_category[cat] = [w for w in words_per_category[cat] if w not in clutter]

    return words_per_category

# Strip the words that are prominent across many categories; the dict is
# updated in place and returned by the call.
words_frequency = remove_clutter_words(words_frequency)

### Get all top words of all categories combined
### Create a keyword processor for each category

In [468]:
from flashtext.keyword import KeywordProcessor
from collections import Counter
# Build one keyword matcher per category and collect every category keyword.
all_keywords = []
word_processors = {}
for category, category_words in words_frequency.items():
    all_keywords.extend(category_words)
    word_processor = KeywordProcessor()
    for word in category_words:
        word_processor.add_keyword(word)
    word_processors[category] = word_processor

# Remove duplicates. Sorting fixes the order of the keywords: they become the
# feature columns of the model, and the iteration order of a set of strings
# changes from one interpreter start to the next.
all_keywords = sorted(set(all_keywords))

# A single matcher covering the keywords of every category.
all_words_processor = KeywordProcessor()
for word in all_keywords:
    all_words_processor.add_keyword(word)

### Create a percentage function 

In [469]:
# def compute_percentage(dum0, dumx):
#     try:
#         ans=float(dumx)/float(dum0)
#         ans=ans*100
#     except:
#         return 0
#     else:
#         return ans
#    

### Create a function to find the most probable category 

In [470]:
# def guess_category(text, index):
#     x=str(text)
#     total_matches = len(all_words_processor.extract_keywords(x))
#     if total_matches == 0:
#         return 'Not working'
#     
#     matched_keywords_count = {}
#     for p_key in word_processors:
#         processor = word_processors[p_key]
#         matched_keywords_count[p_key] = len(processor.extract_keywords(x))
#         
#     match_per_category = {}
#     for tk_key in matched_keywords_count:
#         matched = matched_keywords_count[tk_key]
#         match_per_category[tk_key] = float(compute_percentage(total_matches, matched))
#         
#     max_prob_category = max(match_per_category, key=(lambda key: match_per_category[key]))
#     return max_prob_category 

# For each website use only its tokenized words and its category

In [471]:
# Pair every document's stemmed top words with the document's category.
words = []
classes = []
documents = []
counter = 0
for i, row in df.iterrows():
    token_counts = Counter(ast.literal_eval(row['tokens_en']))
    # Stems of the document's `top` most frequent tokens, most frequent first.
    stems = [stemmer.stem(token.lower()) for token, _ in token_counts.most_common(top)]
    documents.append((stems, row['main_category']))

# Model vocabulary (input features) and category labels (outputs).
words = all_keywords
classes = list(words_frequency)

# Convert each document's words into numeric vectors (bag of words and one-hot category)

In [472]:
# training: one bag-of-words row per document; output: matching one-hot rows.
training = []
output = []
output_empty = [0] * len(classes)
for doc in documents:
    pattern_words, doc_category = doc
    # One slot per vocabulary word: 1 when the document contains it, else 0.
    bag = [int(w in pattern_words) for w in words]
    training.append(bag)

    # All zeros except a 1 at the index of the document's category.
    output_row = list(output_empty)
    output_row[classes.index(doc_category)] = 1
    output.append(output_row)

# Spot check of a single vocabulary entry.
pprint.pprint(all_keywords[34])

'nud'


# Do the final pre-processing on the data and create some functions:
### Sigmoid Function
### A function for cleaning up sentences
### A function to create a Bag Of Words

In [473]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + e**-x) of ``x``, computed with NumPy."""
    return 1 / (1 + np.exp(-x))

# Slope of the sigmoid, expressed through the sigmoid's own output value.
def sigmoid_output_to_derivative(result):
    """Return ``result * (1 - result)`` for a value already produced by ``sigmoid``."""
    complement = 1 - result
    return result * complement

In [474]:
def clean_up_sentence(sentence):
    """Tokenize ``sentence`` and return the stems of its English tokens.

    The text is split with ``nltk.word_tokenize``. Tokens whose lower-cased
    form is not in the module-level ``english_vocab`` are discarded; the
    remaining ones are lower-cased and stemmed with a Lancaster stemmer.
    Order and duplicates are preserved.
    """
    lancaster = LancasterStemmer()
    return [
        lancaster.stem(token.lower())
        for token in nltk.word_tokenize(sentence)
        if token.lower() in english_vocab
    ]

In [475]:
def bow(sentence, wordies, show_details=False):
    """Encode ``sentence`` as a bag-of-words vector over ``wordies``.

    The sentence is reduced to stems with ``clean_up_sentence``; position
    ``i`` of the result is 1 when ``wordies[i]`` is one of those stems and
    0 otherwise.

    Args:
        sentence: raw text to encode.
        wordies: vocabulary; its order defines the vector layout.
        show_details: when true, print every vocabulary word that matched.

    Returns:
        numpy array with one 0/1 entry per vocabulary word.
    """
    # A set gives O(1) lookups and makes repeated stems irrelevant.
    sentence_words = set(clean_up_sentence(sentence))
    bag_of_words = [0]*len(wordies)
    # Walk the vocabulary itself. The previous version shared one
    # enumerate() iterator across all sentence words, so it was exhausted
    # after the first word and no later word could ever be marked.
    for i, w in enumerate(wordies):
        if w in sentence_words:
            bag_of_words[i] = 1
            if show_details:
                print ("found in bag_of_words: %s" % w)

    return np.array(bag_of_words)

### The training function

In [476]:
def train(X, y, hidden_neurons=10, alpha=1, epochs=50000, dropout=False, dropout_percent=0.5):
    """Train a network with one hidden sigmoid layer and save its weights.

    Runs full-batch training for ``epochs + 1`` iterations, using the
    module-level ``sigmoid`` and ``sigmoid_output_to_derivative``. Every
    10000 iterations the mean absolute output error is printed; training
    stops early when that error has not decreased since the previous check.

    The weights are written, together with a timestamp and the module-level
    ``words`` and ``classes`` lists, to ``Models/synapses.json`` relative to
    the current working directory. The directory is created when missing.

    Args:
        X: input matrix, one row per sample.
        y: target matrix, one row per sample and ``len(classes)`` columns.
        hidden_neurons: size of the hidden layer.
        alpha: step size applied to both weight updates.
        epochs: index of the last iteration.
        dropout: when true, a random mask is applied to the hidden layer.
        dropout_percent: probability of zeroing a hidden unit; only used
            when ``dropout`` is true.

    Returns:
        None. The result of training is the JSON file.
    """
    print (f"Training with {hidden_neurons} neurons, alpha: {str(alpha)}, dropout:{dropout} {dropout_percent if dropout else ''}")
    print (f"Input matrix: {len(X)}x{len(X[0])}    Output matrix: {1}x{ len(classes)}")
    # Fixed seed so the initial weights (and dropout masks) are repeatable.
    np.random.seed(1)

    last_mean_error = 1
    # randomly initialize our weights with mean 0
    synapse_0 = 2*np.random.random((len(X[0]), hidden_neurons)) - 1
    synapse_1 = 2*np.random.random((hidden_neurons, len(classes))) - 1

    for j in range(epochs+1):

        # Feed forward through layers 0, 1, and 2
        layer_0 = X
        layer_1 = sigmoid(np.dot(layer_0, synapse_0))

        if dropout:
            # Zero each hidden unit with probability dropout_percent and
            # rescale the survivors by 1/(1-dropout_percent).
            layer_1 *= np.random.binomial([np.ones((len(X),hidden_neurons))],1-dropout_percent)[0] * (1.0/(1-dropout_percent))

        layer_2 = sigmoid(np.dot(layer_1, synapse_1))

        # how much did we miss the target value?
        layer_2_error = y - layer_2

        if (j% 10000) == 0 and j > 5000:
            # stop when the error at this checkpoint is not below the one
            # recorded at the previous checkpoint
            mean_error = np.mean(np.abs(layer_2_error))
            if mean_error < last_mean_error:
                print ("delta after "+str(j)+" iterations:" + str(mean_error) )
                last_mean_error = mean_error
            else:
                print ("break:", mean_error, ">", last_mean_error )
                break

        # in what direction is the target value?
        # were we really sure? if so, don't change too much.
        layer_2_delta = layer_2_error * sigmoid_output_to_derivative(layer_2)

        # how much did each l1 value contribute to the l2 error (according to the weights)?
        layer_1_error = layer_2_delta.dot(synapse_1.T)

        # in what direction is the target l1?
        # were we really sure? if so, don't change too much.
        layer_1_delta = layer_1_error * sigmoid_output_to_derivative(layer_1)

        synapse_1_weight_update = (layer_1.T.dot(layer_2_delta))
        synapse_0_weight_update = (layer_0.T.dot(layer_1_delta))

        synapse_1 += alpha * synapse_1_weight_update
        synapse_0 += alpha * synapse_0_weight_update

    now = datetime.now()

    # persist synapses
    synapse = {'synapse0': synapse_0.tolist(), 'synapse1': synapse_1.tolist(),
               'datetime': now.strftime("%Y-%m-%d %H:%M"),
               'words': words,
               'classes': classes
              }
    synapse_file = "synapses.json"
    folder_path = "Models/"
    # Create the output folder on first use instead of failing in open().
    os.makedirs(folder_path, exist_ok=True)
    with open(folder_path+synapse_file, 'w') as outfile:
        json.dump(synapse, outfile, indent=4, sort_keys=True)
    print ("saved synapses to:", synapse_file)

# Train the model

In [477]:
import time
# Stack the bag-of-words rows and the one-hot category rows into matrices.
X = np.array(training)
y = np.array(output)

# Wall-clock timing of the training run.
start_time = time.time()

# dropout_percent is ignored here because dropout=False.
train(X, y, hidden_neurons=10, alpha=0.1, epochs=50000, dropout=False, dropout_percent=0.2)

elapsed_time = time.time() - start_time
print ("processing time:", elapsed_time, "seconds")

Training with 10 neurons, alpha: 0.1, dropout:False 
Input matrix: 1x100    Output matrix: 1x1
delta after 10000 iterations:0.011372703734532918
delta after 20000 iterations:0.00798591032204643
delta after 30000 iterations:0.006501037764156448
delta after 40000 iterations:0.005620483791714315
delta after 50000 iterations:0.005021498395935131
saved synapses to: synapses.json
processing time: 3.391897678375244 seconds


### The helper function used in the neural network testing

In [478]:
def think(sentence, synapses, show_details=False):
    """Run one forward pass of the network for ``sentence``.

    The lower-cased sentence is encoded with ``bow`` against the
    module-level ``words`` vocabulary and pushed through two sigmoid layers.

    Args:
        sentence: text to score.
        synapses: indexable pair; item 0 holds the input-to-hidden weights,
            item 1 the hidden-to-output weights.
        show_details: forwarded to ``bow``; also prints the sentence and
            its bag-of-words vector.

    Returns:
        The output layer activations.
    """
    bag = bow(sentence.lower(), words, show_details)
    if show_details:
        print ("sentence:", sentence, "\n bow:", bag)

    hidden_weights, output_weights = synapses[0], synapses[1]
    # input -> hidden
    hidden = sigmoid(np.dot(bag, hidden_weights))
    # hidden -> output
    return sigmoid(np.dot(hidden, output_weights))

# Test the data

In [484]:
# Categories scoring at or below this value are left out of the results.
ERROR_THRESHOLD = 0.2

# Read the weights that train() stored on disk.
syn_file = 'Models/synapses.json'
with open(syn_file) as data_file:
    syn = json.load(data_file)

# JSON holds nested lists; turn them back into numpy arrays.
syn_0 = np.asarray(syn['synapse0'])
syn_1 = np.asarray(syn['synapse1'])

def classify(sentence, show_details=False):
    """Return the categories whose score for ``sentence`` exceeds the threshold.

    Scores come from ``think`` using the loaded ``syn_0`` / ``syn_1``
    weights. Only scores strictly greater than ``ERROR_THRESHOLD`` are kept.

    Returns:
        List of ``[category, score]`` pairs, highest score first.
    """
    scores = think(sentence, (syn_0, syn_1), show_details)

    kept = [[idx, score] for idx, score in enumerate(scores) if score > ERROR_THRESHOLD]
    ranked = sorted(kept, key=lambda pair: pair[1], reverse=True)
    return [[classes[idx], score] for idx, score in ranked]

# Smoke test: score a short sample string with the synapses loaded above.
test_text = '56'
classify(test_text)

[['Porn', 0.8395964685686156]]