In [47]:
import pandas as pd
import numpy as np

from tokenizers import ByteLevelBPETokenizer

import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from transformers import TFBertModel,  BertConfig, BertTokenizer, AutoModel, PreTrainedTokenizerFast
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout, LSTM, Flatten
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.optimizers import Adam
from keras.losses import CategoricalCrossentropy
from keras.metrics import CategoricalAccuracy
import pickle
import warnings

In [33]:
dataset = pd.read_csv('data_cleaned.csv',
                   converters={"preprocessedTags": lambda x: x.strip("[]").replace("'","").split(", ")})

dataset = dataset[['unstemmed_desc']].dropna()
dataset.head()

Unnamed: 0,unstemmed_desc
0,vs cakephp vs zend vs cakephp vs zend cakephp ...
1,tools generating mock data tools generating mo...
2,laravel use statement non name cache effect la...
3,add client authentication add client authentic...
4,variable error variable error system namespace...


In [34]:
dataset.to_csv(r'pandas.txt', header=None, index=None, sep=' ', mode='a')

In [35]:
path = "pandas.txt"

# Tokenizer from scratch

https://arxiv.org/abs/1508.07909 <br/>
https://huggingface.co/blog/how-to-train

In [36]:
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

In [37]:
tokenizer.train(files=path, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

In [48]:
# Save the tokenizer you trained
tokenizer.save("byte-level-BPE.tokenizer.json")

# Load it using transformers
tokenizer = PreTrainedTokenizerFast(tokenizer_file="byte-level-BPE.tokenizer.json")

## Training a standard Neural Network

In [49]:
# Freeze the randomness
from numpy.random import seed
seed(1337)
import tensorflow as tf
tf.random.set_seed(42)

In [50]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
config = tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [51]:
globalStrategy = "retrain" # Either retrain or keep

In [52]:
def scoring(data, predict):
    f1 = []
    jaccard = []
    threshold = []
    for i in np.arange(0.30, 0.99, 0.01):
        predict_ = np.where(predict >= i, 1, 0)
        f1_micro = metrics.f1_score(data, predict_, average = 'micro')
        jaccard_micro = metrics.jaccard_score(data, predict_, average = 'micro')
        f1.append(f1_micro)
        jaccard.append(jaccard_micro)
        threshold.append(i)
    
    results = pd.DataFrame()
    results['Threshold'] = threshold
    results['F1_micro'] = f1
    results['Jaccard_micro'] = jaccard
    results = results[results['F1_micro'] == results['F1_micro'].max()]
    return results

In [53]:
data = pd.read_csv('data_cleaned.csv',
                   converters={"preprocessedTags": lambda x: x.strip("[]").replace("'","").split(", ")})
#data = pd.read_csv('data_cleaned2.csv',
#                   converters={"preprocessedTags": lambda x: x.strip("[]").replace("'","").split(", ")})

data = data[['desc', 'unstemmed_desc', 'preprocessedTags', 'Tag1']].dropna()
data.head()

Unnamed: 0,desc,unstemmed_desc,preprocessedTags,Tag1
0,vs cakephp vs zend vs cakephp vs zend cakephp ...,vs cakephp vs zend vs cakephp vs zend cakephp ...,[php],php
1,tool generat mock data tool generat mock data ...,tools generating mock data tools generating mo...,[testing],testing
2,laravel use statement non name cach effect lar...,laravel use statement non name cache effect la...,"[php, laravel]",php
3,add client authent add client authent server r...,add client authentication add client authentic...,[java],java
4,variabl error variabl error system namespac cl...,variable error variable error system namespace...,[c#],c#


In [54]:
x = data['desc']
x_unstemmed = data['unstemmed_desc']
y = data['preprocessedTags']
# y_tag = data['Tag1']
mb = MultiLabelBinarizer()
y_encoded = mb.fit_transform(y)
# y_encoded = mb.fit_transform(y_tag)

In [55]:
# x_train, x_test split
x_train, x_test, y_train, y_test = train_test_split(x_unstemmed,
                                                    y_encoded,
                                                    test_size=0.2,
                                                    random_state=42)

In [57]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [58]:
# Ready output data for the model
max_length=250

x_train_toke = tokenizer(text=x_train.to_list(),
                         add_special_tokens=True,
                         max_length=max_length,
                         truncation=True,
                         padding=True, 
                         return_tensors='tf',
                         return_token_type_ids=False,
                         return_attention_mask=False,
                         verbose=True)

x_test_toke = tokenizer(text=x_test.to_list(),
                        add_special_tokens=True,
                        max_length=max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='tf',
                        return_token_type_ids=False,
                        return_attention_mask=False,
                        verbose=True)

In [61]:
x_train_toke = x_train_toke['input_ids']
x_test_toke = x_test_toke['input_ids']

In [98]:
# Define the model
xInput = Input(shape=(max_length))
x_ = Dense(500, activation='relu')(xInput)
x_ = Dense(500, activation='relu')(x_)
output = Dense(len(y_encoded[0]), activation='sigmoid')(x_)

personalizedTokenizer = Model(inputs=xInput, outputs=output, name='Baseline')

# Compile the model
personalizedTokenizer.compile(loss=CategoricalCrossentropy(from_logits=True, label_smoothing=0.1),
                              optimizer=Adam(learning_rate=0.0000001),
                              metrics=['acc'])

In [99]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_acc', patience=10,
                                            mode='max',
                                            restore_best_weights=True)

In [100]:
personalizedTokenizer.summary()

Model: "Baseline"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 250)]             0         
_________________________________________________________________
dense_11 (Dense)             (None, 500)               125500    
_________________________________________________________________
dense_12 (Dense)             (None, 500)               250500    
_________________________________________________________________
dense_13 (Dense)             (None, 206)               103206    
Total params: 479,206
Trainable params: 479,206
Non-trainable params: 0
_________________________________________________________________


In [101]:
epochs = 200
batch_size = 128

In [102]:
# Load the baseline, if does not exist then train one
if globalStrategy == 'retrain' or globalStrategy == 'retrainPersonalizedTokenizer':
    epochs = epochs
    batch_size=batch_size
    history = personalizedTokenizer.fit(x_train_toke, y_train,
                                        epochs=epochs,
                                        validation_split=0.1,
                                        callbacks=[callback],
                                        verbose=1)

    personalizedTokenizer.save('/home/mlmaster/Code/Ing_ml_P7/personalizedTokenizer/')
    personalizedTokenizer = tf.keras.models.load_model('/home/mlmaster/Code/Ing_ml_P7/personalizedTokenizer/')
else:
    try:
        personalizedTokenizer = tf.keras.models.load_model('/home/mlmaster/Code/Ing_ml_P7/personalizedTokenizer/')
    except OSError:
        epochs = epochs
        batch_size=batch_size
        history = personalizedTokenizer.fit(x_train_toke, y_train,
                                            epochs=epochs,
                                            validation_split=0.1,
                                            callbacks=[callback],
                                            verbose=1)

        personalizedTokenizer.save('/home/mlmaster/Code/Ing_ml_P7/personalizedTokenizer/')
        personalizedTokenizer = tf.keras.models.load_model('/home/mlmaster/Code/Ing_ml_P7/personalizedTokenizer/')

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
INFO:tensorflow:Assets written to: /home/mlmaster/Code/Ing_ml_P7/personalizedTokenizer/assets


In [103]:
personalizedTokenizer.evaluate(x_test_toke, y_test, verbose=1)



[7043.7265625, 0.03995324671268463]

In [104]:
predict = personalizedTokenizer.predict(x_test_toke)

In [105]:
#predict_ = np.where(predict>0.55, 1, 0)
scoring(y_test, predict)

Unnamed: 0,Threshold,F1_micro,Jaccard_micro
68,0.98,0.021899,0.011071


In [106]:
predict[12]

array([1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 0., 1.,
       1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
       1., 1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 0., 1.,
       1., 1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1.,
       1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1.,
       1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.,
       1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0., 1.,
       0., 0.], dtype=float32)

In [107]:
print(x_train[15])
print(tokenizer.tokenize(x_train[15]))
print(x_train_toke[15])

read msg files read msg files need read outlook msg file net without com api outlook cos installed machines app run free 3rd party libraries want extract cc fields sent receive date fields would good also stored msg files
['read', 'Ġmsg', 'Ġfiles', 'Ġread', 'Ġmsg', 'Ġfiles', 'Ġneed', 'Ġread', 'Ġoutlook', 'Ġmsg', 'Ġfile', 'Ġnet', 'Ġwithout', 'Ġcom', 'Ġapi', 'Ġoutlook', 'Ġcos', 'Ġinstalled', 'Ġmachines', 'Ġapp', 'Ġrun', 'Ġfree', 'Ġ3', 'rd', 'Ġparty', 'Ġlibraries', 'Ġwant', 'Ġextract', 'Ġcc', 'Ġfields', 'Ġsent', 'Ġreceive', 'Ġdate', 'Ġfields', 'Ġwould', 'Ġgood', 'Ġalso', 'Ġstored', 'Ġmsg', 'Ġfiles']
tf.Tensor(
[10192  1663  1574   479   479   568  3844  1663  1574   479   479   568
  1121  9203  1574  1643   568  2474   584   417   454  1663  1574   479
   479  1967  1574   576  1168  2147  1035   289   526   289   526   289
   526  1039  8172   289  1942  1942   454  1092   575  1574   636   723
  1092  1092  1092   636   636   723   408  1092  1574   636   408  5890
  1043  1574   636  

# Retrain Partially a Tokenizer