In [1]:
import tensorflow as tf
import tensorflow_io
from tensorflow import keras
import tensorboard
import pandas as pd
import nltk
import re
import numpy as np
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer 
import gensim
import spacy
import datetime

In [2]:
# Folder holding the disaster-tweets CSV and parquet files read by load_data().
# The path is relative, so it resolves against the notebook's working directory.
data_dir = "../../Datasets/disaster_tweets"

In [2]:
def load_data(preproc = 1):
    """Read the tweet datasets from ``data_dir``.

    Parameters
    ----------
    preproc : truthy/falsy, default 1
        When truthy, read the cached ``*_preprocessed.parquet`` files.
        When falsy, read the raw CSV files instead.

    Returns
    -------
    (train_data, test_data)
        With ``preproc`` truthy both items are DataFrames.
        With ``preproc`` falsy ``train_data`` is a two-element list
        ``[train.csv frame, train2.csv frame]`` (the second restricted to the
        ``keyword``, ``location``, ``text`` and ``choose_one`` columns) and
        ``test_data`` is the ``test.csv`` frame.
    """
    if not preproc:
        # Raw mode: the two training sources are returned separately because
        # their label columns differ and still need to be reconciled.
        primary = pd.read_csv(f"{data_dir}/train.csv", index_col = 0)
        secondary = pd.read_csv(f"{data_dir}/train2.csv")[["keyword","location","text","choose_one"]]
        raw_test = pd.read_csv(f"{data_dir}/test.csv", index_col = 0)
        return [primary, secondary], raw_test

    cached_train = pd.read_parquet(f"{data_dir}/train_preprocessed.parquet")
    cached_test = pd.read_parquet(f"{data_dir}/test_preprocessed.parquet")
    return cached_train, cached_test

def remove_URL(sentence):
    """Return ``sentence`` with every ``http...`` run of non-whitespace characters removed."""
    url_pattern = re.compile(r"http\S+", flags=re.MULTILINE)
    return url_pattern.sub("", sentence)

def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens that become empty.

    Every character that is neither a word character nor whitespace is
    removed from each token; tokens left as the empty string are discarded.
    """
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped_tokens if token != '']

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Parameters
    ----------
    words : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list, in their
        original order.
    """
    # Build the lookup once per call. Previously stopwords.words('english')
    # was evaluated for every token and membership was tested against a list,
    # which made the filter needlessly slow on large corpora.
    english_stopwords = frozenset(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def lemmatize_verbs(words):
    """Return the verb lemma of every token in ``words``, preserving order."""
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def preprocess(sentence):
    """Turn a raw sentence into a list of cleaned tokens.

    Steps, in order: strip URLs, lowercase, tokenize with NLTK, remove
    punctuation, remove stop words, lemmatize verbs.
    """
    tokens = nltk.word_tokenize(remove_URL(sentence).lower())

    # Each cleaning step takes and returns a list of tokens.
    for clean_step in (remove_punctuation, remove_stopwords, lemmatize_verbs):
        tokens = clean_step(tokens)

    return tokens

def bow2(documents, vocab):
    """Build a bag-of-words count matrix restricted to ``vocab``.

    Parameters
    ----------
    documents : iterable of iterables of hashable tokens
        One tokenized document per entry.
    vocab : iterable of hashable tokens
        Words to count. Duplicates are collapsed, keeping first-seen order.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary word; each cell is
        the number of times the word occurs in the document. Words outside the
        vocabulary are ignored. The vocabulary columns are present even when
        ``documents`` is empty.
    """
    # dict.fromkeys de-duplicates while keeping order, matching the column
    # order the original dict-based template produced.
    columns = list(dict.fromkeys(vocab))

    rows = []
    for doc in documents:
        counts = dict.fromkeys(columns, 0)
        for word in doc:
            if word in counts:
                counts[word] += 1
        rows.append(counts)

    # Passing columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns = columns)

def preprocess_and_save():
    """Merge the raw training sources, fill missing values and cache to parquet.

    Loads the raw data via ``load_data(preproc = 0)``, renames the second
    training frame's ``choose_one`` column to ``target`` and encodes it as
    1 for ``"Relevant"`` and 0 otherwise, concatenates both training frames
    row-wise, replaces missing values with the string ``"0"`` and writes the
    results to ``train_preprocessed.parquet`` / ``test_preprocessed.parquet``
    under ``data_dir``.

    NOTE: despite the name, this function does not apply ``preprocess()`` to
    the text columns.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
        The frames exactly as written to disk.
    """
    (train_primary, train_secondary), test_data = load_data(preproc = 0)

    # Work on new frames instead of mutating the column-sliced frame in place,
    # which avoids pandas' chained-assignment warning.
    train_secondary = train_secondary.rename(columns = {"choose_one": "target"})
    train_secondary = train_secondary.assign(
        target = (train_secondary["target"] == "Relevant").astype("int")
    )

    train_data = pd.concat([train_primary, train_secondary], axis = 0)

    train_data = train_data.fillna("0")
    test_data = test_data.fillna("0")

    # Write next to the raw files, using the same data_dir that load_data()
    # reads from, so the cache location cannot drift from the loader.
    train_data.to_parquet(f"{data_dir}/train_preprocessed.parquet")
    test_data.to_parquet(f"{data_dir}/test_preprocessed.parquet")

    return train_data, test_data

@keras.saving.register_keras_serializable()
class PositionalEmbedding(keras.layers.Layer):
    """Adds a learned position embedding to a learned token embedding.

    Constructor arguments:
        sentence_length: number of distinct positions the position table holds.
        input_dim: size of the token vocabulary.
        output_dim: width of both embedding tables.

    ``compute_mask`` marks every position whose token id is not 0 as valid.
    """

    def __init__(self, sentence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)

        # Stored so get_config() can rebuild the layer.
        self.sentence_length = sentence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

        self.token_embeddings = keras.layers.Embedding(
            input_dim = input_dim,
            output_dim = output_dim,
        )
        self.position_embeddings = keras.layers.Embedding(
            input_dim = sentence_length,
            output_dim = output_dim,
        )

    def call(self, inputs):
        """Return token embeddings plus the embeddings of positions 0..length-1."""
        seq_len = tf.shape(inputs)[-1]
        position_ids = tf.range(0, seq_len, 1)
        return self.token_embeddings(inputs) + self.position_embeddings(position_ids)

    def compute_mask(self, inputs, mask=None):
        """Boolean mask that is True wherever the token id differs from 0."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Base layer config extended with this layer's constructor arguments."""
        return {
            **super().get_config(),
            "output_dim": self.output_dim,
            "sentence_length": self.sentence_length,
            "input_dim": self.input_dim,
        }

@keras.saving.register_keras_serializable()
class TransformerEncoder(keras.layers.Layer):
    """Self-attention block followed by a two-layer feed-forward block.

    Each sub-block is wrapped in a residual connection and a layer
    normalization.

    Constructor arguments:
        embed_dim: width of the feed-forward output; also used as the
            attention ``key_dim``.
        ff_dim: width of the hidden feed-forward layer.
        num_heads: number of attention heads.
    """

    def __init__(self, embed_dim, ff_dim, num_heads, **kwargs):
        super().__init__(**kwargs)

        # Stored so get_config() can rebuild the layer.
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.num_heads = num_heads

        self.attention = keras.layers.MultiHeadAttention(
            num_heads = num_heads,
            key_dim = embed_dim,
        )
        self.feed_forward = keras.Sequential([
            keras.layers.Dense(ff_dim, activation = "relu"),
            keras.layers.Dense(embed_dim),
        ])
        self.norm_1 = keras.layers.LayerNormalization()
        self.norm_2 = keras.layers.LayerNormalization()

    def call(self, inputs, mask = None):
        """Apply self-attention and the feed-forward block to ``inputs``."""
        # Insert an axis in the middle of the incoming mask so it can be
        # passed as the attention mask.
        attention_mask = None if mask is None else mask[:, tf.newaxis, :]

        attended = self.attention(inputs, inputs, attention_mask = attention_mask)
        hidden = self.norm_1(inputs + attended)
        return self.norm_2(self.feed_forward(hidden) + hidden)

    def get_config(self):
        """Base layer config extended with this layer's constructor arguments."""
        return {
            **super().get_config(),
            "embed_dim": self.embed_dim,
            "ff_dim": self.ff_dim,
            "num_heads": self.num_heads,
        }

In [4]:
# Alternatives kept for reference. Uncomment the first line to rebuild the
# parquet cache from the raw CSVs, or the second to load the raw data as-is
# (in that mode train_data is a list of two frames, not a single DataFrame).
# train_data, test_data = preprocess_and_save()
# train_data, test_data = load_data(preproc = 0)
# Default: read the cached parquet files written by preprocess_and_save().
train_data, test_data = load_data(preproc = 1)

In [5]:
# Join the three text fields with a space. Plain `+` glued the last word of
# one field to the first word of the next, producing tokens that exist in no
# vocabulary. NOTE: this changes the tokens seen by TextVectorization, so any
# previously saved model must be retrained.
train_text = train_data["keyword"] + " " + train_data["location"] + " " + train_data["text"]
test_text = test_data["keyword"] + " " + test_data["location"] + " " + test_data["text"]

# shuffle()'s first argument is the buffer size, not a seed. Use a buffer that
# covers the whole dataset, fix the seed, and disable per-iteration
# reshuffling so a later take()/skip() split sees the same order every epoch
# and validation batches cannot leak into training.
train_data_tf = (
    tf.data.Dataset.from_tensor_slices((train_text, train_data["target"]))
    .shuffle(buffer_size = len(train_data), seed = 42, reshuffle_each_iteration = False)
    .batch(32)
)
# The test set is deliberately not shuffled so predictions stay aligned with
# the row order of test_data.
test_data_tf = tf.data.Dataset.from_tensor_slices(test_text).batch(32)

text_vect = keras.layers.TextVectorization(
    max_tokens = 20_000,
    output_mode = "int",
    output_sequence_length = 165
)
# Learn the vocabulary from the training text only (labels are dropped).
text_vect.adapt(train_data_tf.map(lambda txt, trgt: txt))

train_data_tf_vec = train_data_tf.map(lambda txt, trgt: (text_vect(txt), trgt), num_parallel_calls=tf.data.AUTOTUNE)

test_data_tf_vec = test_data_tf.map(lambda txt: text_vect(txt), num_parallel_calls=tf.data.AUTOTUNE)

In [6]:
# model = keras.Sequential([
#     keras.layers.Input(shape = (None,165), name = "Input"),
#     keras.layers.Dense(150, activation = keras.activations.relu, name = "Dense_200_1"),
#     # keras.layers.Dropout(rate = 0.4, name = "Dropout_1"),
#     keras.layers.Dense(80, activation = keras.activations.relu, name = "Dense_200_2"),
#     # keras.layers.Dropout(rate = 0.4, name = "Dropout_2"),
#     keras.layers.Dense(1, activation = keras.activations.sigmoid, name = "Output")
# ])
# model.compile(
#     optimizer = keras.optimizers.SGD(learning_rate = 0.001, momentum = 0.8),
#     loss = keras.losses.BinaryCrossentropy(),
#     metrics=[keras.metrics.Precision(), keras.metrics.AUC()])

# Named `inputs` rather than `input` so the builtin input() is not shadowed
# for the rest of the kernel session. The layer name "Input" is unchanged.
inputs = keras.layers.Input(shape = (None,), name = "Input", dtype = "int64")

# 165 and 20_000 mirror output_sequence_length and max_tokens of the
# TextVectorization layer that produces the token ids.
positional = PositionalEmbedding(sentence_length = 165, input_dim = 20_000, output_dim = 256)(inputs)
encoder = TransformerEncoder(embed_dim = 256, ff_dim = 32, num_heads = 8)(positional)
pooling = keras.layers.GlobalMaxPooling1D()(encoder)
dropout = keras.layers.Dropout(0.5)(pooling)
# Single sigmoid unit: binary classification output.
output = keras.layers.Dense(1, activation = "sigmoid")(dropout)

model = keras.Model(inputs = inputs, outputs = output)

model.compile(
    optimizer = keras.optimizers.Adam(learning_rate = 1e-5, beta_1 = 0.9, beta_2 = 0.98, epsilon = 1e-9),
    loss = keras.losses.BinaryCrossentropy(),
    metrics=[keras.metrics.Precision(), keras.metrics.AUC()]
)

In [7]:
# Hold out the first 20% of the batches for validation and train on the rest.
# val_size counts batches, not rows, because the dataset is already batched.
# NOTE(review): take()/skip() only give disjoint sets if the upstream dataset
# yields the same order on every iteration - confirm its shuffle settings.
n_batches = len(train_data_tf_vec)
val_size = int(0.2 * n_batches)
train = train_data_tf_vec.skip(val_size)
validation = train_data_tf_vec.take(val_size)

In [None]:
# One TensorBoard log folder per run, named by the start timestamp.
log_dir = f"logs/fit/{datetime.datetime.now():%Y%m%d-%H%M%S}"

tensorboard_callback = keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)
# Keep only the best model seen so far on disk.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    "tweets_classifier.tf",
    save_best_only=True,
)
# Stop once the monitored metric has not improved for 4 epochs.
early_stopping_callback = keras.callbacks.EarlyStopping(patience = 4)

history = model.fit(
    train,
    validation_data = validation,
    epochs = 120,
    callbacks = [
        tensorboard_callback,
        checkpoint_callback,
        early_stopping_callback,
    ],
)

In [3]:
# Reload the best checkpoint written during training and keep a handle to it
# so it can be used for evaluation or prediction; the bare expression on the
# last line still displays the loaded model as before. The custom layers are
# registered with register_keras_serializable where they are defined.
loaded_model = keras.models.load_model("tweets_classifier.tf")
loaded_model

<keras.src.engine.functional.Functional at 0x1ac6eef7b90>