In [1]:
import re
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


nltk.download('stopwords')
stopwords_set = list(set(stopwords.words('english')))

df = pd.read_csv("data.csv", sep=";")

X = df["title"].values
Y = df["label"].values.reshape(-1, 1)

print("We are going to use only the titles from the database")
print("[o] X.shape =", X.shape)
print("[o] Y.shape =", Y.shape)

We are going to use only the titles from the database
[o] X.shape = (6334,)
[o] Y.shape = (6334, 1)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
maxlen = max([len(x) for x in X])
print("[o] The longest word is", maxlen, "characters long")

[o] The longest word is 289 characters long


In [3]:
init_model = lambda _vocab_size, _maxlen: keras.Sequential([
    layers.Embedding(input_dim=_vocab_size, output_dim=2*_maxlen, input_length=_maxlen),
    layers.Conv1D(256, kernel_size=2, activation="relu"),
    layers.GlobalMaxPool1D(),
    layers.Dense(512, activation="relu"),
    layers.Dropout(0.5),
    layers.Dense(1, activation="sigmoid")
])
model = init_model(X.shape[0], maxlen)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 289, 578)          3661052   
                                                                 
 conv1d (Conv1D)             (None, 288, 256)          296192    
                                                                 
 global_max_pooling1d (Globa  (None, 256)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 512)               131584    
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 513       
                                                        

In [4]:
from test import tokenization, tokenization_with_prep, tokenization_without_stopwords, prep_text_with_regex


prep_methods = [
    tokenization,
    tokenization_with_prep,
    tokenization_without_stopwords,
    prep_text_with_regex
]

for prep_method in prep_methods:
    print(f"[o] Results for {prep_method}:")

    seed = 219
    epoch = 15

    np.random.seed(seed)
    tf.random.set_seed(seed)

    maxlen = max([len(x) for x in X])
    new_X = prep_method(X)
    new_X = pad_sequences(new_X, padding="post", maxlen=maxlen)
    model = init_model(np.max(new_X)+1, maxlen)

    print("\t[i] Size of X:", new_X.shape)
    print("\t[i] Max length from X:", maxlen)
    print("\t[i] Number of DL params:", model.count_params())

    X_train, X_test, y_train, y_test = train_test_split(new_X, Y, test_size=0.1, shuffle=True)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, shuffle=True)

    model.compile(metrics=["accuracy"], optimizer="adam", loss="binary_crossentropy")

    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=2,
        restore_best_weights=True)
    history = model.fit(
        X_train, y_train,
        epochs=epoch,
        verbose=True,
        validation_data=(X_val, y_val,),
        #validation_split=0.1,
        callbacks=[early_stop]
    )
        
    loss, acc = model.evaluate(X_test, y_test, verbose=False)
    print(f"[o] [{prep_method}] \t acc =", acc)
    print()

    del X_train, X_test, y_train, y_test, X_val, y_val, model, history

[o] Results for <function tokenization at 0x7f7594687200>:
	[i] Size of X: (6334, 289)
	[i] Max length from X: 289
	[i] Number of DL params: 9274579
Epoch 1/15
Epoch 2/15
Epoch 3/15
[o] [<function tokenization at 0x7f7594687200>] 	 acc = 0.8533123135566711

[o] Results for <function tokenization_with_prep at 0x7f75946873b0>:
	[i] Size of X: (6334, 289)
	[i] Max length from X: 289
	[i] Number of DL params: 7204183
Epoch 1/15
Epoch 2/15
Epoch 3/15
[o] [<function tokenization_with_prep at 0x7f75946873b0>] 	 acc = 0.8217665553092957

[o] Results for <function tokenization_without_stopwords at 0x7f7594687680>:
	[i] Size of X: (6334, 289)
	[i] Max length from X: 289
	[i] Number of DL params: 7163723
Epoch 1/15
Epoch 2/15
Epoch 3/15
[o] [<function tokenization_without_stopwords at 0x7f7594687680>] 	 acc = 0.7917981147766113

[o] Results for <function prep_text_with_regex at 0x7f7594687950>:
	[i] Size of X: (6334, 289)
	[i] Max length from X: 289
	[i] Number of DL params: 6377065
Epoch 1/15
Ep