# Twitter Disaster Tweet Classifier with BERT and Custom Head

The purpose of this model is to determine whether a given Tweet is about a real disaster (war, flood, famine, etc.) or is benign. For example, the Tweet "the sky looks beautifully ablaze tonight" likely does not refer to a real fire.

### Load and Format

In [None]:
import pandas as pd

# Load the training CSV and keep only its first 100 rows
# (presumably to keep experimentation fast -- confirm before a full run).
dataset = pd.read_csv("./data/train.csv").head(100)

# Discard the columns that are not used below. They may carry useful signal,
# but they are left out for now to avoid dealing with missing data.
dataset = dataset.drop(columns=["id", "keyword", "location"])
dataset.head(20)

## Fine-Tune Pretrained Model for Inference 

Below, I use the HuggingFace `transformers` library to fine-tune a pretrained BERT encoder (`bert-base-cased`) on the tweets dataset.

### Load Imports

In [None]:
from transformers import AutoModel, PreTrainedModel, TrainingArguments, Trainer
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import Dataset
from torch import nn
import torch

### Create Dataset

In [None]:
# Init tokenizer for converting text to token ids
model_path = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# In order to add padding on a batch level rather than a dataset level, add dynamic padding using a data
# collator. This pads to the longest input in each batch rather than the longest input in the entire
# data set, which saves computation.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Convert the DataFrame to a Dataset, tokenize the text, and split into train and test.
raw_dataset = Dataset.from_pandas(dataset)
raw_dataset = raw_dataset.rename_column("target", "labels")
# truncation=True caps every example at the tokenizer's maximum length so an
# unusually long text cannot exceed what the encoder accepts.
raw_dataset = raw_dataset.map(
    lambda example: tokenizer(example["text"], truncation=True),
    batched=True,
)
raw_dataset = raw_dataset.with_format("torch")
# Hold out 20% of the rows for evaluation.
formatted_datasets = raw_dataset.train_test_split(test_size=0.2)

# Show Output
formatted_datasets

In [None]:
class TraitDetectionModel(PreTrainedModel):
    """Encoder with a single linear classification layer on top.

    Wraps an already-instantiated encoder (for example the result of
    ``AutoModel.from_pretrained``) and feeds the encoder's second output
    (index 1) into an ``nn.Linear`` layer that produces ``num_labels`` logits.
    """

    def __init__(self, encoding_model, num_labels=2):
        """Build the classifier around ``encoding_model``.

        Args:
            encoding_model: Encoder module. Its ``config`` is reused as this
                model's config and must expose ``hidden_size``.
            num_labels: Number of classes the linear head predicts.
        """
        super().__init__(config=encoding_model.config)
        self.num_labels = num_labels
        self.encoder = encoding_model
        input_dimension = encoding_model.config.hidden_size
        self.classifier = nn.Linear(input_dimension, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, labels=None):
        """Run the encoder and classify its second output.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids, head_mask:
                Forwarded unchanged to the wrapped encoder.
            labels: Optional target class indices. When given, a
                cross-entropy loss is computed against the logits.

        Returns:
            SequenceClassifierOutput with ``loss`` (``None`` when no labels
            are given), ``logits``, and the encoder's ``hidden_states`` and
            ``attentions``.
        """
        encoding = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask
        )
        # Index 1 of the encoder output is used as the sentence representation
        # (presumably the pooled output of a BERT-style encoder -- confirm if
        # a different encoder family is plugged in).
        cls_tensor = encoding[1]
        logits = self.classifier(cls_tensor)

        loss = None
        if labels is not None:
            loss_function = nn.CrossEntropyLoss()
            loss = loss_function(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoding.hidden_states,
            attentions=encoding.attentions
        )

    @staticmethod
    def _weights_file_name(encoder):
        """Return the file name under which the combined state dict is stored."""
        return "TraitDetectionModel-" + encoder.base_model.base_model_prefix + ".pt"

    def save_pretrained(self, save_directory, state_dict=None, **kwargs):
        """Save the encoder and the full state dict into ``save_directory``.

        The encoder is written with its own ``save_pretrained``; the state
        dict of the whole model (encoder plus classifier) is written next to
        it as a ``.pt`` file.

        Args:
            save_directory: Target directory (string or path-like).
            state_dict: Accepted for interface compatibility; not used, the
                model's current ``state_dict()`` is always saved.
            **kwargs: Accepted and ignored so callers that pass additional
                save options do not fail with an unexpected-keyword error.
        """
        import os  # local import keeps the class independent of other cells

        self.encoder.save_pretrained(save_directory)
        # os.path.join copes with a trailing slash and with path-like objects.
        weights_path = os.path.join(save_directory, self._weights_file_name(self.encoder))
        torch.save(self.state_dict(), weights_path)

    @classmethod
    def from_pretrained(cls, model_path):
        """Rebuild a model from a directory written by ``save_pretrained``.

        Args:
            model_path: Directory holding the saved encoder and the ``.pt``
                state dict file.

        Returns:
            A new instance with the saved weights loaded.
        """
        import os  # local import keeps the class independent of other cells

        encoder = AutoModel.from_pretrained(model_path)
        weights_path = os.path.join(model_path, cls._weights_file_name(encoder))
        # map_location="cpu" lets a state dict saved from a GPU load on a
        # machine without one; callers move the model afterwards if needed.
        state_dictionary = torch.load(weights_path, map_location="cpu")
        # Size the head from the saved weights so a model trained with a
        # non-default num_labels reloads without a shape mismatch.
        num_labels = state_dictionary["classifier.weight"].shape[0]
        new_model = cls(encoder, num_labels=num_labels)
        new_model.load_state_dict(state_dictionary)
        return new_model
   

# Prefer the GPU when PyTorch reports one as available; otherwise use the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Wrap a freshly loaded encoder in the classifier and place it on that device.
model = TraitDetectionModel(AutoModel.from_pretrained(model_path)).to(device)

print(f"Running on Device Type: {device.type}")


In [None]:
# Train for a single epoch; outputs go to the "test-trainer" directory.
training_arguments = TrainingArguments(output_dir="test-trainer", num_train_epochs=1)

# Hand the model, both dataset splits, the padding collator and the tokenizer to the Trainer.
trainer = Trainer(
    model=model,
    args=training_arguments,
    data_collator=data_collator,
    train_dataset=formatted_datasets["train"],
    eval_dataset=formatted_datasets["test"],
    tokenizer=tokenizer,
)

In [None]:
# Run the fine-tuning loop using the Trainer configured in the previous cell.
trainer.train()

In [None]:
import os

# Save Combined Method: encoder files plus a state dict of the whole model
model.save_pretrained("./models/custom-method/")

# Save PyTorch Method: pickle the entire model object.
# torch.save does not create missing parent directories, so make sure it exists.
os.makedirs("./models/pytorch", exist_ok=True)
torch.save(model, "./models/pytorch/full_model.pt")

In [None]:
# Load Pytorch Model. This unpickles a complete model object, so only load files you trust.
# map_location places the weights on the current device even if they were saved from another one.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which rejects
# fully pickled modules -- pass weights_only=False there for this trusted, locally written file.
pytorch_loaded = torch.load("models/pytorch/full_model.pt", map_location=device)

# Load Combined Method, then move it to the same device as the other models.
combined_loaded = TraitDetectionModel.from_pretrained("models/custom-method").to(device)

In [None]:
# Put all three models on the same device and switch them to inference mode.
for candidate in (model, pytorch_loaded, combined_loaded):
    candidate.to(device)
    candidate.eval()

# The tokenized inputs must live on the same device as the model weights.
inputs = tokenizer("The sky is ablaze", return_tensors="pt").to(device)

# No gradients are needed for inference.
with torch.no_grad():
    print(model(**inputs)) # The original BERT model with custom classifier
    print(pytorch_loaded(**inputs)) # The model that was loaded only from the local PyTorch file
    print(combined_loaded(**inputs)) # The model that was loaded from the local PyTorch state dict and Hugging Face config using a combined approach

In [None]:
# Compare position 0 of the first encoder output (presumably the [CLS] token state)
# between the in-memory model and itself, the pickled model, and the combined-load model.
for other in (model, pytorch_loaded, combined_loaded):
    reference_state = model.encoder(**inputs)[0][:, 0, :]
    other_state = other.encoder(**inputs)[0][:, 0, :]
    print(torch.equal(reference_state, other_state))

# Compare the classifier weights of each reloaded model against the in-memory model.
for other in (pytorch_loaded, combined_loaded):
    print(torch.equal(model.classifier.weight, other.classifier.weight))

# Sanity check: two passes through the same encoder give the same second output.
first_pass = model.encoder(**inputs)[1]
second_pass = model.encoder(**inputs)[1]
print(torch.equal(first_pass, second_pass))

In [None]:
# Turn each model's logits into class probabilities. dim=-1 normalizes across the
# label axis explicitly instead of relying on the deprecated implicit dimension.
print(torch.nn.functional.softmax(model(**inputs).logits, dim=-1))
print(torch.nn.functional.softmax(pytorch_loaded(**inputs).logits, dim=-1))
print(torch.nn.functional.softmax(combined_loaded(**inputs).logits, dim=-1))