# Read Data

In [13]:
import pandas as pd
import numpy as np

In [14]:
df_train = pd.read_csv('../data/raw/train.csv')
df_test = pd.read_csv('../data/raw/test.csv')

# Analysis

In [15]:
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
print('Training Set Shape = {}'.format(df_train.shape))
print('Training Set Memory Usage = {:.2f} MB'.format(df_train.memory_usage().sum() / 1024**2))
print('Test Set Shape = {}'.format(df_test.shape))
print('Test Set Memory Usage = {:.2f} MB'.format(df_test.memory_usage().sum() / 1024**2))

Training Set Shape = (7613, 5)
Training Set Memory Usage = 0.29 MB
Test Set Shape = (3263, 4)
Test Set Memory Usage = 0.10 MB


In [17]:
df_test.shape

(3263, 4)

In [18]:
df_train.shape

(7613, 5)

In [19]:
print(f'Number of unique values in keyword = {df_train["keyword"].nunique()} (Training) - {df_test["keyword"].nunique()} (Test)')
print(f'Number of unique values in location = {df_train["location"].nunique()} (Training) - {df_test["location"].nunique()} (Test)')

Number of unique values in keyword = 221 (Training) - 221 (Test)
Number of unique values in location = 3341 (Training) - 1602 (Test)


# Process to use Multple Pre trained models

In [20]:
import os
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

In [21]:
df_train.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [23]:
columns_to_drop = ["keyword", "location"]

# Drop columns in df_train and reset index
df_train.drop(columns=columns_to_drop, inplace=True)
df_train.reset_index(drop=True, inplace=True)

# Drop columns in df_test and reset index
df_test.drop(columns=columns_to_drop, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [25]:
# Create the 'disaster' column
df_train['disaster'] = np.where(df_train['target'] == 1, True, False)

# Create the 'no_disaster' column
df_train['no_disaster'] = np.where(df_train['target'] == 0, True, False)

# Drop the original 'target' column if needed
df_train.drop(columns=['target'], inplace=True)

In [28]:
df_train, df_val = train_test_split(df_train, test_size=0.2)

In [29]:
def save_datasets(datasets, *filepaths):
    """Save pandas DataFrames to csv files."""
    for dataset, filepath in zip(datasets, filepaths):
        dataset.to_csv(filepath, index=False)

def load_datasets(*filepaths):
    """Load datasets using the `datasets` library."""
    dataset_files = {name: path for name, path in zip(['train', 'validation'], filepaths)}
    return load_dataset("csv", data_files=dataset_files)

In [30]:
PROCESSED_DATA_PATH = "../data/processed"

save_datasets([df_train, df_test, df_val],
          os.path.join(PROCESSED_DATA_PATH, "train.csv"),
          os.path.join(PROCESSED_DATA_PATH, "test.csv"),
          os.path.join(PROCESSED_DATA_PATH, "val.csv"))


dataset = load_datasets(os.path.join(PROCESSED_DATA_PATH, "train.csv"), os.path.join(PROCESSED_DATA_PATH, "val.csv"))

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [31]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'disaster', 'no_disaster'],
        num_rows: 6090
    })
    validation: Dataset({
        features: ['id', 'text', 'disaster', 'no_disaster'],
        num_rows: 1523
    })
})

In [32]:
labels = [label for label in dataset['train'].features.keys() if label not in ['index', 'text']]
id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

print(labels, id2label, label2id)

['id', 'disaster', 'no_disaster'] {0: 'id', 1: 'disaster', 2: 'no_disaster'} {'id': 0, 'disaster': 1, 'no_disaster': 2}


In [33]:
def preprocess_data(dataset, tokenizer, labels):
    """Preprocess the data for training."""
    def preprocess_batch(examples):
        text = examples["text"]
        encoding = tokenizer(text, padding="max_length", truncation=True, max_length=512)

        labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
        labels_matrix = np.zeros((len(text), len(labels)))
        for idx, label in enumerate(labels):
            labels_matrix[:, idx] = labels_batch[label]

        encoding["labels"] = labels_matrix.tolist()
        return encoding

    return dataset.map(preprocess_batch, batched=True, remove_columns=dataset['train'].column_names)

In [34]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

encoded_dataset_dict = preprocess_data(dataset, tokenizer, labels)

encoded_dataset_dict.set_format("torch")

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

# Train

In [48]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
from transformers import EarlyStoppingCallback
import torch

In [37]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
model.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "id",
    "1": "disaster",
    "2": "no_disaster"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "disaster": 1,
    "id": 0,
    "no_disaster": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "multi_label_classification",
  "transformers_version": "4.31.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [60]:
def binary_metrics(predictions, labels, threshold=0.5):
    # Apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    
    # Use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    
    # For binary classification, make sure labels and predictions are in proper shape
    labels = labels.reshape(-1)
    y_pred = y_pred.reshape(-1)
    
    # Compute metrics for binary classification
    f1 = f1_score(y_true=labels, y_pred=y_pred)
    roc_auc = roc_auc_score(y_true=labels, y_score=probs)
    accuracy = accuracy_score(y_true=labels, y_pred=y_pred)
    
    # Return as a dictionary
    metrics = {
        "f1": f1,
        "roc_auc": roc_auc,
        "accuracy": accuracy
    }
    return metrics

In [61]:
def compute_metrics(p: EvalPrediction):
    predictions = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(
        predictions=predictions,
        labels=p.label_ids)
    return result

In [66]:
training_arguments = TrainingArguments(
    f"../models/bert-fine-tuned-nlp-disaster-tweets",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [67]:
trainer = Trainer(
    model,
    training_arguments,
    train_dataset=encoded_dataset_dict["train"],
    eval_dataset=encoded_dataset_dict["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)])

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss


In [51]:
trainer.evaluate()

ValueError: Classification metrics can't handle a mix of multiclass-multioutput and multilabel-indicator targets