In [1]:
import tensorflow as tf
import tensorflow_io
from tensorflow import keras
import tensorboard
import pandas as pd
import numpy as np
import datetime
import tweets_classifier

2024-01-05 19:17:49.846999: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-05 19:17:49.889437: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-05 19:17:49.889497: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-05 19:17:49.889535: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-05 19:17:49.898771: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-05 19:17:49.899340: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

### Define constants

In [17]:
# Directory that holds the raw CSV files and the cached preprocessed parquet files.
DATA_DIR = "../../Datasets/disaster_tweets"
# Directory the ModelCheckpoint callback writes to during training.
CHECKPOINT_DIR = "./tweets_classifier/checkpoints"
# Batch size applied to both the train and test tf.data pipelines.
BATCH = 32
# Value handed to Dataset.shuffle() when the training pipeline is built.
SHUFFLE_SEED = 42

### Define functions

In [18]:
def load_data(preproc: bool = True, data_dir = None):
    """Load the disaster-tweets train and test sets.

    Args:
        preproc: When truthy, read the cached ``train_preprocessed.parquet`` /
            ``test_preprocessed.parquet`` files; otherwise read the raw CSVs.
        data_dir: Directory that contains the files. When omitted, the
            notebook-level ``DATA_DIR`` constant is used.

    Returns:
        A ``(train_data, test_data)`` tuple. NOTE: the shape of ``train_data``
        depends on ``preproc``:

        * ``preproc`` truthy  -> a single DataFrame.
        * ``preproc`` falsy   -> a two-element list ``[train.csv frame,
          train2.csv frame]``; the second frame is restricted to the
          ``keyword``, ``location``, ``text`` and ``choose_one`` columns.

        ``test_data`` is always a single DataFrame.
    """
    # Resolve the default at call time so the function follows later changes
    # to DATA_DIR and can be pointed at another directory explicitly.
    if data_dir is None:
        data_dir = DATA_DIR

    if preproc:
        # Load preprocessed data from parquet.
        train_data = pd.read_parquet(f"{data_dir}/train_preprocessed.parquet")
        test_data = pd.read_parquet(f"{data_dir}/test_preprocessed.parquet")
    else:
        # Load unpreprocessed data from CSVs. The first column of train.csv
        # and test.csv is used as the index; train2.csv keeps a default index.
        train_data = [
            pd.read_csv(f"{data_dir}/train.csv", index_col = 0),
            pd.read_csv(f"{data_dir}/train2.csv")[["keyword", "location", "text", "choose_one"]],
        ]
        test_data = pd.read_csv(f"{data_dir}/test.csv", index_col = 0)
    return train_data, test_data

def preprocess_and_save():
    """Rebuild the preprocessed train/test frames from the raw CSVs and cache them.

    Steps:
        1. Load the raw frames via ``load_data(preproc = 0)``.
        2. In the second training frame, rename ``choose_one`` to ``target``
           and encode it as 1 where the label equals ``"Relevant"``, else 0.
        3. Stack both training frames row-wise (each keeps its own index).
        4. Replace every missing value with the string ``"0"``.
        5. Write both frames to ``*_preprocessed.parquet`` under ``DATA_DIR``.

    Returns:
        ``(train_data, test_data)`` — the frames that were written to disk.
    """
    raw_parts, raw_test = load_data(preproc = 0)

    # Bring the second source onto the same schema as the first one.
    extra = raw_parts[1].rename(columns = {"choose_one": "target"})
    extra = extra.assign(target = (extra["target"] == "Relevant").astype("int"))

    merged_train = pd.concat([raw_parts[0], extra], axis = 0).fillna("0")
    filled_test = raw_test.fillna("0")

    merged_train.to_parquet(f"{DATA_DIR}/train_preprocessed.parquet")
    filled_test.to_parquet(f"{DATA_DIR}/test_preprocessed.parquet")

    return merged_train, filled_test

### Load data

In [19]:
# Alternative: rebuild the parquet cache from the raw CSVs and use the result.
# train_data, test_data = preprocess_and_save()
# Alternative: load the raw CSV frames (train_data is then a list of two frames).
# train_data, test_data = load_data(preproc = 0)
# Default: read the cached, already preprocessed parquet files.
train_data, test_data = load_data(preproc = 1)

### Vectorize processed data

In [20]:
# Build one text feature per tweet. The explicit space keeps keyword, location
# and tweet body as separate tokens; plain `+` glued them together (for example
# the "0" fill values fused with the first word of the tweet).
train_text = train_data["keyword"] + " " + train_data["location"] + " " + train_data["text"]
test_text = test_data["keyword"] + " " + test_data["location"] + " " + test_data["text"]

# Shuffle the examples ONCE with a fixed seed:
#   * buffer_size covers the whole dataset, so the shuffle is uniform
#     (previously SHUFFLE_SEED was passed positionally as the buffer size);
#   * reshuffle_each_iteration=False freezes the order, so the take()/skip()
#     split below yields the same disjoint validation/train sets every epoch
#     instead of leaking validation examples into training.
train_data_tf = (
    tf.data.Dataset.from_tensor_slices((train_text, train_data["target"]))
    .shuffle(
        buffer_size = max(1, len(train_data)),
        seed = SHUFFLE_SEED,
        reshuffle_each_iteration = False,
    )
    .batch(BATCH)
)
test_data_tf = tf.data.Dataset.from_tensor_slices(test_text).batch(BATCH)

text_vect = keras.layers.TextVectorization(
    max_tokens = 20_000,
    output_mode = "int",
    output_sequence_length = 165
)
# Learn the vocabulary from the training text only (labels are dropped).
text_vect.adapt(train_data_tf.map(lambda txt, trgt: txt))

train_data_tf_vec = train_data_tf.map(lambda txt, trgt: (text_vect(txt), trgt), num_parallel_calls=tf.data.AUTOTUNE)

test_data_tf_vec = test_data_tf.map(lambda txt: text_vect(txt), num_parallel_calls=tf.data.AUTOTUNE)

# Hold out the first 20% of the (batched) dataset for validation.
val_size = int(0.2 * len(train_data_tf_vec))

validation = train_data_tf_vec.take(val_size)
# The base order is frozen above, so reshuffle the training batches per epoch
# here, after the split, where it cannot mix in validation batches.
train = train_data_tf_vec.skip(val_size).shuffle(
    buffer_size = max(1, len(train_data_tf_vec) - val_size),
    seed = SHUFFLE_SEED,
)

### Define callbacks and train model

In [None]:
# Timestamped folder so every run gets its own TensorBoard log directory.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
# Only overwrite the checkpoint when the monitored metric improves.
checkpoint_callback = keras.callbacks.ModelCheckpoint(f"{CHECKPOINT_DIR}/tweets_classifier.tf", save_best_only=True)
# Stop once the monitored metric has not improved for 4 consecutive epochs.
early_stopping_callback = keras.callbacks.EarlyStopping(patience = 4)

# Bind `model` in this cell. It used to be defined only in the later
# "Test model loading" cell, so on a fresh kernel the save_weights() call
# below raised NameError after the whole training run had finished.
model = tweets_classifier.model.model

history = model.fit(
    train,
    epochs = 120,
    validation_data = validation,
    callbacks = [tensorboard_callback, checkpoint_callback, early_stopping_callback],
)
model.save_weights("tweets_classifier/weights/weights.tf")

### Test model loading

In [6]:
# Take the model object exposed by the tweets_classifier package and restore
# the weights from the path the training cell saves them to.
model = tweets_classifier.model.model
model.load_weights("tweets_classifier/weights/weights.tf")
# Optional sanity check on the validation split:
# model.evaluate(validation)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7f01fc572560>