In [2]:
import tensorflow as tf
import tensorflow_io
from tensorflow import keras
import tensorboard
import pandas as pd
import nltk
import re
import numpy as np
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer 
import gensim
import spacy
import datetime
from custom_layers import *

In [18]:
# Relative path of the directory that holds the disaster-tweets CSV and parquet
# files read by load_data().
data_dir = "../../Datasets/disaster_tweets"

In [19]:
def load_data(preproc = 1):
    """Read the train and test sets from ``data_dir``.

    Parameters
    ----------
    preproc : truthy or falsy, default 1
        Truthy: read the two ``*_preprocessed.parquet`` files.
        Falsy: read the raw CSV files instead.

    Returns
    -------
    tuple
        ``(train_data, test_data)``. When ``preproc`` is truthy both are
        DataFrames. Otherwise ``train_data`` is a two-element list: the frame
        from ``train.csv`` followed by the frame from ``train2.csv`` restricted
        to the keyword/location/text/choose_one columns.
    """
    if not preproc:
        # Raw path: the two training files are returned separately, un-merged.
        primary_train = pd.read_csv(f"{data_dir}/train.csv", index_col = 0)
        kept_columns = ["keyword", "location", "text", "choose_one"]
        secondary_train = pd.read_csv(f"{data_dir}/train2.csv")[kept_columns]
        raw_test = pd.read_csv(f"{data_dir}/test.csv", index_col = 0)
        return [primary_train, secondary_train], raw_test

    preprocessed_train = pd.read_parquet(f"{data_dir}/train_preprocessed.parquet")
    preprocessed_test = pd.read_parquet(f"{data_dir}/test_preprocessed.parquet")
    return preprocessed_train, preprocessed_test

def preprocess_and_save():
    """Merge the raw training CSVs, fill missing values and write parquet files.

    Steps:
      1. Load the raw frames with ``load_data(preproc = 0)``.
      2. In the second training frame, rename ``choose_one`` to ``target`` and
         encode it as 1 where the value is ``"Relevant"`` and 0 otherwise.
      3. Stack both training frames row-wise.
      4. Replace missing values with the string ``"0"`` in train and test.
      5. Write both frames to ``data_dir`` as ``train_preprocessed.parquet`` and
         ``test_preprocessed.parquet`` (the same files ``load_data`` reads back).

    Returns
    -------
    tuple
        ``(train_data, test_data)`` as written to disk.
    """
    raw_train, test_data = load_data(preproc = 0)
    primary_train, secondary_train = raw_train

    # rename() returns a new frame, so the label column is assigned on an
    # independent copy rather than on a column slice of the CSV frame.
    secondary_train = secondary_train.rename(columns = {"choose_one": "target"})
    secondary_train["target"] = (secondary_train["target"] == "Relevant").astype("int")

    train_data = pd.concat([primary_train, secondary_train], axis = 0)

    train_data = train_data.fillna("0")
    test_data = test_data.fillna("0")

    # Write next to the raw files via data_dir so that load_data(preproc = 1)
    # always reads exactly what was saved here, even if data_dir is changed.
    train_data.to_parquet(f"{data_dir}/train_preprocessed.parquet")
    test_data.to_parquet(f"{data_dir}/test_preprocessed.parquet")

    return train_data, test_data

In [20]:
# Alternative entry points, left disabled:
#   preprocess_and_save() rebuilds the parquet files from the raw CSVs;
#   load_data(preproc = 0) returns the raw, un-merged CSV frames.
# train_data, test_data = preprocess_and_save()
# train_data, test_data = load_data(preproc = 0)
# Active path: read the already preprocessed parquet files.
train_data, test_data = load_data(preproc = 1)

In [21]:
SHUFFLE_SEED = 42
BATCH_SIZE = 32

# Join the three text fields with spaces. Plain concatenation would fuse the
# last token of one field with the first token of the next (e.g. keyword and
# location becoming a single unknown word for the vectorizer).
train_text = train_data["keyword"] + " " + train_data["location"] + " " + train_data["text"]
test_text = test_data["keyword"] + " " + test_data["location"] + " " + test_data["text"]

# Shuffle once over the whole training set with a fixed seed.
# - buffer_size covers every row, so the shuffle is uniform (a buffer of 42
#   only perturbs the order locally).
# - reshuffle_each_iteration = False keeps the batch order identical on every
#   pass, which a later take()/skip() validation split relies on to stay
#   disjoint. The training part can be reshuffled per epoch after the split.
train_data_tf = (
    tf.data.Dataset.from_tensor_slices((train_text, train_data["target"]))
    .shuffle(buffer_size = len(train_text), seed = SHUFFLE_SEED, reshuffle_each_iteration = False)
    .batch(BATCH_SIZE)
)

# The test set is not shuffled so predictions stay aligned with test_data rows.
test_data_tf = tf.data.Dataset.from_tensor_slices(test_text).batch(BATCH_SIZE)

# Maps each string to a fixed-length sequence of 165 integer token ids, with a
# vocabulary capped at 20_000 entries learned from the training text.
text_vect = keras.layers.TextVectorization(
    max_tokens = 20_000,
    output_mode = "int",
    output_sequence_length = 165
)
text_vect.adapt(train_data_tf.map(lambda txt, trgt: txt))

# Vectorized (token ids, target) batches for training.
train_data_tf_vec = train_data_tf.map(lambda txt, trgt: (text_vect(txt), trgt), num_parallel_calls=tf.data.AUTOTUNE)

# Vectorized token-id batches for inference.
test_data_tf_vec = test_data_tf.map(lambda txt: text_vect(txt), num_parallel_calls=tf.data.AUTOTUNE)

In [6]:
# Disabled dense baseline (not executed); kept for reference.
# model = keras.Sequential([
#     keras.layers.Input(shape = (None,165), name = "Input"),
#     keras.layers.Dense(150, activation = keras.activations.relu, name = "Dense_200_1"),
#     # keras.layers.Dropout(rate = 0.4, name = "Dropout_1"),
#     keras.layers.Dense(80, activation = keras.activations.relu, name = "Dense_200_2"),
#     # keras.layers.Dropout(rate = 0.4, name = "Dropout_2"),
#     keras.layers.Dense(1, activation = keras.activations.sigmoid, name = "Output")
# ])
# model.compile(
#     optimizer = keras.optimizers.SGD(learning_rate = 0.001, momentum = 0.8),
#     loss = keras.losses.BinaryCrossentropy(),
#     metrics=[keras.metrics.Precision(), keras.metrics.AUC()])

# Sizes that must agree with the text pipeline: 165 is text_vect's
# output_sequence_length and 20_000 its max_tokens.
SEQUENCE_LENGTH = 165
MAX_TOKENS = 20_000
# 256 is passed to both custom layers; presumably the embedding width.
# NOTE(review): argument order/meaning is defined in custom_layers — confirm.
EMBED_DIM = 256

# Named token_ids rather than `input` so the builtin input() is not shadowed
# for the rest of the kernel session.
token_ids = keras.layers.Input(shape = (None,), name = "Input", dtype = "int64")
positional = PositionalEmbedding(SEQUENCE_LENGTH, MAX_TOKENS, EMBED_DIM)(token_ids)
# NOTE(review): 32 and 8 are positional arguments whose meaning is defined in
# custom_layers.TransformerEncoder — confirm before tuning.
encoder = TransformerEncoder(EMBED_DIM, 32, 8)(positional)
# Collapse the sequence axis, regularize, then emit one sigmoid probability.
pooling = keras.layers.GlobalMaxPooling1D()(encoder)
dropout = keras.layers.Dropout(0.5)(pooling)
output = keras.layers.Dense(1, activation = "sigmoid")(dropout)

model = keras.Model(inputs = token_ids, outputs = output)

# Binary classification: cross-entropy loss, tracked with precision and AUC.
model.compile(
    optimizer = keras.optimizers.Adam(learning_rate = 1e-5, beta_1 = 0.9, beta_2 = 0.98, epsilon = 1e-9),
    loss = keras.losses.BinaryCrossentropy(),
    metrics=[keras.metrics.Precision(), keras.metrics.AUC()]
)

In [7]:
# Hold out the first 20% of the batches for validation and train on the rest.
# train_data_tf_vec is already batched, so len() counts batches, not rows.
# NOTE(review): take()/skip() only give disjoint splits if train_data_tf_vec
# yields its batches in the same order on every iteration — confirm that the
# upstream shuffle does not reshuffle each epoch (reshuffle_each_iteration).
val_size = int(0.2 * len(train_data_tf_vec))
validation = train_data_tf_vec.take(val_size)
train = train_data_tf_vec.skip(val_size)

In [None]:
# Every run logs to its own timestamped TensorBoard directory.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_stamp

tensorboard_callback = keras.callbacks.TensorBoard(
    log_dir = log_dir,
    histogram_freq = 1,
)
# Checkpoint file that the final cell loads back.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    "tweets_classifier.tf",
    save_best_only = True,
)
early_stopping_callback = keras.callbacks.EarlyStopping(patience = 4)

# Up to 120 epochs; early stopping may end the run sooner.
history = model.fit(
    train,
    validation_data = validation,
    epochs = 120,
    callbacks = [tensorboard_callback, checkpoint_callback, early_stopping_callback],
)

In [54]:
# Reload the checkpoint file written by ModelCheckpoint in the training cell.
# The result is not bound to a name; it is only shown as the cell output.
keras.models.load_model("tweets_classifier.tf")

<keras.src.engine.functional.Functional at 0x23c5b5c5d10>