# Read Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Locations of the raw splits, relative to this notebook.
raw_train_csv = '../data/raw/train.csv'
raw_test_csv = '../data/raw/test.csv'

# Load both raw splits into DataFrames.
df_train = pd.read_csv(raw_train_csv)
df_test = pd.read_csv(raw_test_csv)

# Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Report the shape and in-memory footprint (in MB) of each split.
for split_name, frame in (('Training', df_train), ('Test', df_test)):
    footprint_mb = frame.memory_usage().sum() / 1024**2
    print(f'{split_name} Set Shape = {frame.shape}')
    print(f'{split_name} Set Memory Usage = {footprint_mb:.2f} MB')

In [None]:
# Display the (rows, columns) shape of the test DataFrame.
df_test.shape

In [None]:
# Display the (rows, columns) shape of the training DataFrame.
df_train.shape

In [None]:
# Compare how many distinct values each metadata column has in the two splits.
for column in ('keyword', 'location'):
    n_train = df_train[column].nunique()
    n_test = df_test[column].nunique()
    print(f'Number of unique values in {column} = {n_train} (Training) - {n_test} (Test)')

# Process to use Multiple Pre-trained Models

In [None]:
import os
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

In [None]:
# Display the training DataFrame's column labels before any are dropped.
df_train.columns

In [None]:
columns_to_drop = ["keyword", "location"]

# Remove the unused metadata columns from both splits in place, then
# renumber the rows so each frame has a clean 0..n-1 index.
for frame in (df_train, df_test):
    frame.drop(columns=columns_to_drop, inplace=True)
    frame.reset_index(drop=True, inplace=True)

In [None]:
# Expand `target` into one boolean column per class:
# 'disaster' is True where target == 1, 'no_disaster' where target == 0.
target_values = df_train['target']
df_train['disaster'] = target_values.eq(1).to_numpy()
df_train['no_disaster'] = target_values.eq(0).to_numpy()

# The two boolean columns now carry the label, so `target` is removed.
df_train.drop(columns=['target'], inplace=True)

In [None]:
# Hold out 20% of the labelled rows for validation. The seed is fixed so the
# split, and every metric computed on it, is reproducible across runs.
df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=42)

In [None]:
def save_datasets(datasets, *filepaths):
    """Save pandas DataFrames to csv files.

    Each DataFrame is written (without its index) to the path at the same
    position in ``filepaths``.

    Args:
        datasets: Iterable of DataFrames, in the same order as ``filepaths``.
        *filepaths: One destination per DataFrame.

    Raises:
        ValueError: If the number of DataFrames and destinations differ.
            Without this check ``zip`` would silently skip the extras.
    """
    datasets = list(datasets)
    if len(datasets) != len(filepaths):
        raise ValueError(
            f"Got {len(datasets)} dataset(s) but {len(filepaths)} filepath(s); "
            "they must match one-to-one."
        )

    for dataset, filepath in zip(datasets, filepaths):
        # Create the parent directory for path-like destinations so a fresh
        # checkout does not fail; file-like buffers are passed through as-is.
        if isinstance(filepath, (str, os.PathLike)):
            parent = os.path.dirname(filepath)
            if parent:
                os.makedirs(parent, exist_ok=True)
        dataset.to_csv(filepath, index=False)

def load_datasets(*filepaths):
    """Load CSV files as a dataset via the `datasets` library.

    The first path is registered as the 'train' split and the second as the
    'validation' split; any further paths are not used.
    """
    split_names = ['train', 'validation']
    data_files = dict(zip(split_names, filepaths))
    return load_dataset("csv", data_files=data_files)

In [None]:
PROCESSED_DATA_PATH = "../data/processed"

# Build each destination once so the save and load steps share the same paths.
train_csv = os.path.join(PROCESSED_DATA_PATH, "train.csv")
test_csv = os.path.join(PROCESSED_DATA_PATH, "test.csv")
val_csv = os.path.join(PROCESSED_DATA_PATH, "val.csv")

# Persist all three splits, keeping frames and paths in the same order.
save_datasets([df_train, df_test, df_val], train_csv, test_csv, val_csv)

# Reload only the train/validation splits for fine-tuning.
dataset = load_datasets(train_csv, val_csv)

In [None]:
# Display the object returned by load_datasets (train/validation splits).
dataset

In [None]:
# Every column that is neither an identifier nor the input text is a label.
# NOTE(review): the saved CSV presumably still carries the raw `id` column
# (it is never dropped above) - confirm. Unless excluded here it would be
# treated as a third label and its values used as training targets.
non_label_columns = {'index', 'id', 'text'}
labels = [label for label in dataset['train'].features.keys() if label not in non_label_columns]

# Two-way lookup between label names and their column position.
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in enumerate(labels)}

print(labels, id2label, label2id)

In [None]:
def preprocess_data(dataset, tokenizer, labels, max_length=512):
    """Preprocess the data for training.

    Tokenizes the "text" column and packs the label columns into one
    multi-hot float vector per example, stored under "labels".

    Args:
        dataset: Object exposing ``map`` and a ``'train'`` split with
            ``column_names`` (e.g. the result of ``load_datasets``).
        tokenizer: Callable tokenizer accepting ``padding``, ``truncation``
            and ``max_length`` keyword arguments.
        labels: Ordered label column names; position ``i`` in each output
            vector corresponds to ``labels[i]``.
        max_length: Length every sequence is padded/truncated to. Defaults
            to 512, the value that was previously hard-coded; a smaller
            value reduces padding for short texts.

    Returns:
        The result of ``dataset.map`` with the original columns removed.
    """
    def preprocess_batch(examples):
        text = examples["text"]
        encoding = tokenizer(text, padding="max_length", truncation=True, max_length=max_length)

        # One row per example, one column per label; boolean label values
        # are stored as 0.0 / 1.0 floats.
        labels_matrix = np.zeros((len(text), len(labels)))
        for idx, label in enumerate(labels):
            labels_matrix[:, idx] = examples[label]

        encoding["labels"] = labels_matrix.tolist()
        return encoding

    # Drop the raw columns so only tokenizer outputs and "labels" remain.
    return dataset.map(preprocess_batch, batched=True, remove_columns=dataset['train'].column_names)

In [None]:
# Tokenizer matching the checkpoint that is fine-tuned below.
checkpoint_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# Tokenize every split and attach the multi-hot label vectors.
encoded_dataset_dict = preprocess_data(dataset, tokenizer, labels)

# Have the encoded splits return torch tensors when indexed.
encoded_dataset_dict.set_format("torch")

# Train

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
from transformers import EarlyStoppingCallback
import torch

In [None]:
# Load the pretrained encoder with a classification head that has one output
# per label. `problem_type` is set explicitly so the multi-label loss is used
# regardless of the dtype the label tensors happen to have.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

In [None]:
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, ROC AUC and accuracy for multi-label logits.

    Args:
        predictions: Raw model logits; expected shape is
            (batch_size, num_labels).
        labels: Ground-truth indicator matrix with the same shape.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        Dict with keys "f1", "roc_auc" and "accuracy".
    """
    # Sigmoid maps each logit to an independent per-label probability.
    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()

    # Hard 0/1 decisions are only needed for F1 and accuracy.
    y_pred = (probs >= threshold).astype(float)
    y_true = labels

    return {
        "f1": f1_score(y_true=y_true, y_pred=y_pred, average="micro"),
        # ROC AUC is a ranking metric, so it is scored on the probabilities
        # rather than on the thresholded predictions.
        "roc_auc": roc_auc_score(y_true, probs, average="micro"),
        "accuracy": accuracy_score(y_true, y_pred),
    }

In [None]:
def compute_metrics(p: EvalPrediction):
    """Adapt an EvalPrediction to `multi_label_metrics`.

    When `p.predictions` is a tuple, only its first element is scored.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best "accuracy" when training finishes.
training_arguments = TrainingArguments(
    output_dir="../models/bert-fine-tuned-nlp-disaster-tweets-v1",
    # Optimisation
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [None]:
# Early stopping with a patience of 5 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

# Wire the model, data, tokenizer and metrics into the Trainer.
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=encoded_dataset_dict["train"],
    eval_dataset=encoded_dataset_dict["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Run evaluation; no dataset is passed, so presumably the Trainer's own
# eval_dataset (the validation split) is used - confirm against the Trainer docs.
trainer.evaluate()