In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


df = pd.read_csv("data/data.csv", sep=";")

X = df["title"].values
Y = df["label"].values.reshape(-1, 1)

print("We are going to use only the titles from the database")
print("[o] X.shape =", X.shape)
print("[o] Y.shape =", Y.shape)

We are going to use only the titles from the database
[o] X.shape = (6334,)
[o] Y.shape = (6334, 1)


In [5]:
maxlen = max([len(x) for x in X])
print("[o] The longest word is", maxlen, "characters long")

[o] The longest word is 289 characters long


In [6]:
init_model = lambda _maxlen=289: keras.Sequential([
    layers.Embedding(input_dim=X.shape[0], output_dim=2*_maxlen, input_length=_maxlen),
    layers.Conv1D(256, kernel_size=2, activation="relu"),
    layers.GlobalMaxPool1D(),
    layers.Dense(512, activation="relu"),
    layers.Dropout(0.5),
    layers.Dense(1, activation="sigmoid")
])
model = init_model(maxlen)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 289, 578)          3661052   
                                                                 
 conv1d_1 (Conv1D)           (None, 288, 256)          296192    
                                                                 
 global_max_pooling1d_1 (Glo  (None, 256)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 512)               131584    
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_3 (Dense)             (None, 1)                 513       
                                                      

In [7]:
from test import tokenization


prep_methods = [
    tokenization
]

early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=2,
        restore_best_weights=True)


for prep_method in prep_methods:
    print(f"[o] Results for {prep_method}:")

    seed = 7
    epoch = 15

    np.random.seed(seed)
    tf.random.set_seed(seed)

    maxlen = max([len(x) for x in X])
    new_X = prep_method(X)
    new_X = pad_sequences(new_X, padding="post", maxlen=maxlen)
    model = init_model(maxlen)

    print("\t[i] Size of X:", new_X.shape)
    print("\t[i] Max length from X:", maxlen)
    print("\t[i] Number of DL params:", model.count_params())

    X_train, X_test, y_train, y_test = train_test_split(new_X, Y, test_size=0.1, shuffle=True)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, shuffle=True)

    model.compile(metrics=["accuracy"], optimizer="adam", loss="binary_crossentropy")
    history = model.fit(
        X_train, y_train,
        epochs=epoch,
        verbose=True,
        validation_data=(X_val, y_val,),
        #validation_split=0.1,
        callbacks=[
            early_stop
        ]
    )
        
    loss, acc = model.evaluate(X_test, y_test, verbose=False)
    print("[o] acc =", acc)
    print()

    del X_train, X_test, y_train, y_test, X_val, y_val, model, history
    break

[o] Results for <function tokenization at 0x17fe955e0>:
	[i] Size of X: (6334, 289)
	[i] Max length from X: 289
	[i] Number of DL params: 4089341
Epoch 1/15
Epoch 2/15
Epoch 3/15
[o] acc = 0.8533123135566711

