In [1]:
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from transformers import AutoTokenizer
import torch
import numpy as np

In [2]:
df = pd.read_csv('data/train.csv')

# Brief look at the data

In [3]:
df.sample(20)

Unnamed: 0,id,keyword,location,text,target
6478,9265,sunk,{GoT | Modern AU | Lizz},@UntamedDirewolf 'I... Wow. Alright.' Sansa sh...,0
3727,5298,fear,Athens - Nicosia,Couples having less sex... for fear it'll be a...,0
4968,7080,meltdown,,Currently: Uncontrollable meltdown number 2,0
6335,9057,structural%20failure,Stateless Global Citizen,The result of failure in correcting structural...,1
6498,9290,sunk,,The Seven Seas - Wreck of the Giannis D. sunk ...,1
1444,2082,casualty,LIVERPOOL,You can't watch 'Home Alone 2' without telling...,0
5920,8452,screamed,,I heard the steven universe theme song from up...,0
7037,10083,typhoon,,Obama Declares Disaster for Typhoon-Devastated...,1
1292,1865,burned,,Watch how bad that fool get burned in coverage...,1
6521,9325,survive,? icon by @Hashiren_3 ?,@mochichiiiii @hikagezero IT'S IMPOSSIBLE FOR ...,0


# Clean the text up a little bit

In [4]:
df['text'] = df['text'].apply(lambda x: x.lower()) # convert to lowercase
df['text'] = df['text'].apply(lambda x: x.replace("#", "")) # remove 

In [5]:
df['text'] = df['text'].apply(lambda x: " ".join([x for x in x.split(" ") if "http" not in x])) # remove hypterlinks
df['text'] = df['text'].apply(lambda x: " ".join([x for x in x.split(" ") if "@" not in x])) # remove tags
df['text'] = df['text'].apply(lambda x: " ".join([x for x in x.split(" ") if "a" != x])) # remove a
df['text'] = df['text'].apply(lambda x: " ".join([x for x in x.split(" ") if "the" != x])) # remove the
df['text'] = df['text'].apply(lambda x: " ".join([x for x in x.split(" ") if "an" != x])) # remove an

In [6]:
df.sample(25)

Unnamed: 0,id,keyword,location,text,target
5793,8266,rioting,,people aren't rioting because justice has been...,1
5028,7169,mudslide,the burrow,dorette thats name of mudslide cake maker,0
4107,5836,hailstorm,USA,hailstorm hey there is secret trick to get 375...,0
6637,9504,terrorist,,me pulling over and fighting hoes that called ...,1
3030,4351,earthquake,Sydney,3novices : renison mine sees seismic event sma...,1
3155,4529,emergency,,eyes of nation &amp; broader conservation comm...,0
6398,9144,suicide%20bomber,,six people were killed thursday when taliban s...,1
618,891,bioterrorism,,creation of ai\nclimate change\nbioterrorism\n...,0
7145,10239,volcano,"cleveland, oh",volcano bowl drink,1
5088,7255,nuclear%20disaster,Marbella. Spain,disaster waiting to happen and your allowing i...,1


# We must turn text into mathematical representation

There are a number of ways to do this, the most simple of which is a count vectorizor, where we simply count the number of times a word shows up in the tweet.


For our transformer example, we use a pre-trained tokenizer. This has the vocabulary step already taken care of (the training), and all it has to do is to map our text documents into encoded vectors.

In [7]:
list(df["text"][0:2].values)

['our deeds are reason of this earthquake may allah forgive us all',
 'forest fire near la ronge sask. canada']

In [8]:
# count_vectorizer = feature_extraction.text.CountVectorizer()
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoding = tokenizer(list(df["text"][0:3].values),
                     padding="max_length",
                     truncation=True,
                     max_length=df["text"].apply(len).max())
## let's get counts for the first single tweet in the data
# example_train_vectors = count_vectorizer.fit_transform(df["text"][0:1])

In [9]:
encoding

{'input_ids': [[101, 2256, 15616, 2024, 3114, 1997, 2023, 8372, 2089, 16455, 9641, 2149, 2035, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 3224, 2543, 2379, 2474, 6902, 3351, 21871, 2243, 1012, 2710, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [10]:
tokenizer.decode(encoding['input_ids'][1])

'[CLS] forest fire near la ronge sask. canada [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

## Notice some things about this new encoding.
There is a preceeding 101 and trailing 102 for all rows, these are special start and seperator tokens. There is an additional token type id that is used to signify special sequences (https://huggingface.co/docs/transformers/v4.26.0/en/model_doc/bert#transformers.BertTokenizer). Also there is an attention mask that tells the model which columns to look at. This is because the BERT model requires a square tensor as input, so all input columns must be the same length, and we pad the end with zeros.

In [11]:
df.shape

(7613, 5)

In [12]:
df = df.sample(frac=1).reset_index(drop=True)

# I am now going to make train and test encoded sets.

In [15]:
n_training_samples = 6000
## let's get counts for the first 5 tweets in the data
encoded_train_dataset = tokenizer(list(df["text"][:n_training_samples].values),
                     padding="max_length",
                     truncation=True,
                     max_length=df["text"].apply(len).max())

encoded_test_dataset = tokenizer(list(df["text"][n_training_samples:].values),
                     padding="max_length",
                     truncation=True,
                     max_length=df["text"].apply(len).max())


## This particular model uses "labels" as a keyword for prediction labels.

In [16]:
encoded_train_dataset['labels'] = df[:n_training_samples]['target'].values.astype(np.int64)
encoded_test_dataset['labels'] = df[n_training_samples:]['target'].values.astype(np.int64)

## I use the huggingface "Dataset" object to make training a little easier.

There is actually quite a bit of difficulty that can be had here to get the data the right "type", a Pytorch tensor object of type Float/Long. Using some of this huggingface infrastructure is supposed to help with that... supposed to.

In [19]:
from datasets import Dataset, Features, Value

train_dataset = Dataset.from_dict(encoded_train_dataset)
test_dataset = Dataset.from_dict(encoded_test_dataset)


In [20]:
train_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 6000
})

In [22]:
train_dataset.set_format("torch")
test_dataset.set_format("torch")

In [23]:
train_dataset['input_ids'].type()

'torch.LongTensor'

## Our "sequences" are just a list of tokens - a sentence.

In [24]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="single_label_classification")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Model training

I will use some of huggingface's model training infrastructure. Things are a bit easier on things like Sagemaker.

In [31]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-binary-class",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
#     metric_for_best_model=metric_name,
    #push_to_hub=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [32]:
train_dataset.features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Value(dtype='int64', id=None)}

In [33]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
#     compute_metrics=compute_metrics
)

# This is a test to see if the model can compute with no errors

In [158]:
outputs = model(input_ids=train_dataset['input_ids'][0].unsqueeze(0),
                labels=train_dataset[0]['labels'].unsqueeze(0))
outputs

SequenceClassifierOutput(loss=tensor(0.8988, grad_fn=<NllLossBackward0>), logits=tensor([[0.4084, 0.0323]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

# We can either train a model or load a pretrained one

I have trained a model and saved it (automatically). For demo purposes it is quicker to just load the model I had trained.

In [None]:
# loaded_model = AutoModelForSequenceClassification.from_pretrained("bert-finetuned-binary-class/checkpoint-750")

In [36]:
# trainer.train()

***** Running training *****
  Num examples = 6000
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750
  Number of trainable parameters = 109483778


Epoch,Training Loss,Validation Loss
1,0.4715,0.416091


***** Running Evaluation *****
  Num examples = 1613
  Batch size = 8
Saving model checkpoint to bert-finetuned-binary-class/checkpoint-750
Configuration saved in bert-finetuned-binary-class/checkpoint-750/config.json
Model weights saved in bert-finetuned-binary-class/checkpoint-750/pytorch_model.bin
tokenizer config file saved in bert-finetuned-binary-class/checkpoint-750/tokenizer_config.json
Special tokens file saved in bert-finetuned-binary-class/checkpoint-750/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from bert-finetuned-binary-class/checkpoint-750 (score: 0.4160912334918976).


TrainOutput(global_step=750, training_loss=0.44321844482421874, metrics={'train_runtime': 6473.0368, 'train_samples_per_second': 0.927, 'train_steps_per_second': 0.116, 'total_flos': 465583234680000.0, 'train_loss': 0.44321844482421874, 'epoch': 1.0})

In [61]:
outputs = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 1613
  Batch size = 8


## Notice the outputs are given
as the output of two functions that are arguments ($\beta$ ) of logistic sigmoid functions. This is because BERT uses the BCEwithLogits loss function and the output of the model is actual not a prediction like standard cross-entropy loss functions.

Sigmoid:     $\sigma = \frac{1}{1-e^{\beta x}}$

![](sigmoid.svg)

In [62]:
outputs

PredictionOutput(predictions=array([[-1.1438643 ,  1.1053957 ],
       [ 1.0622495 , -0.94238526],
       [ 0.8910343 , -0.83420867],
       ...,
       [-1.8952737 ,  1.948417  ],
       [ 0.91456366, -1.2549511 ],
       [ 0.77768123, -0.8434248 ]], dtype=float32), label_ids=array([0, 0, 1, ..., 0, 0, 0]), metrics={'test_loss': 0.4160912334918976, 'test_runtime': 464.8494, 'test_samples_per_second': 3.47, 'test_steps_per_second': 0.435})

loading configuration file bert-finetuned-binary-class/checkpoint-750/config.json
Model config BertConfig {
  "_name_or_path": "bert-finetuned-binary-class/checkpoint-750",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.26.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file bert-finetuned-binary-class/checkpoint-750/pytorch_model.bin
All model checkpoint weights were used when initializing Be

# We can calculate predictions on the test set

In [139]:
predictions = []

for i, tweet in enumerate(df["text"][n_training_samples:].values):
    print("Calculating")
    inputs = tokenizer(tweet, return_tensors="pt")
    outputs = loaded_model(**inputs)
    logits = outputs.logits
    probs = sigmoid(logits.squeeze().cpu())
    predictions.append(probs)
    print(i)
    

Calculating
0
Calculating
1
Calculating
2
Calculating
3
Calculating
4
Calculating
5
Calculating
6
Calculating
7
Calculating
8
Calculating
9
Calculating
10
Calculating
11
Calculating
12
Calculating
13
Calculating
14
Calculating
15
Calculating
16
Calculating
17
Calculating
18
Calculating
19
Calculating
20
Calculating
21
Calculating
22
Calculating
23
Calculating
24
Calculating
25
Calculating
26
Calculating
27
Calculating
28
Calculating
29
Calculating
30
Calculating
31
Calculating
32
Calculating
33
Calculating
34
Calculating
35
Calculating
36
Calculating
37
Calculating
38
Calculating
39
Calculating
40
Calculating
41
Calculating
42
Calculating
43
Calculating
44
Calculating
45
Calculating
46
Calculating
47
Calculating
48
Calculating
49
Calculating
50
Calculating
51
Calculating
52
Calculating
53
Calculating
54
Calculating
55
Calculating
56
Calculating
57
Calculating
58
Calculating
59
Calculating
60
Calculating
61
Calculating
62
Calculating
63
Calculating
64
Calculating
65
Calculating
66
Calcu

519
Calculating
520
Calculating
521
Calculating
522
Calculating
523
Calculating
524
Calculating
525
Calculating
526
Calculating
527
Calculating
528
Calculating
529
Calculating
530
Calculating
531
Calculating
532
Calculating
533
Calculating
534
Calculating
535
Calculating
536
Calculating
537
Calculating
538
Calculating
539
Calculating
540
Calculating
541
Calculating
542
Calculating
543
Calculating
544
Calculating
545
Calculating
546
Calculating
547
Calculating
548
Calculating
549
Calculating
550
Calculating
551
Calculating
552
Calculating
553
Calculating
554
Calculating
555
Calculating
556
Calculating
557
Calculating
558
Calculating
559
Calculating
560
Calculating
561
Calculating
562
Calculating
563
Calculating
564
Calculating
565
Calculating
566
Calculating
567
Calculating
568
Calculating
569
Calculating
570
Calculating
571
Calculating
572
Calculating
573
Calculating
574
Calculating
575
Calculating
576
Calculating
577
Calculating
578
Calculating
579
Calculating
580
Calculating
581
Calc

1031
Calculating
1032
Calculating
1033
Calculating
1034
Calculating
1035
Calculating
1036
Calculating
1037
Calculating
1038
Calculating
1039
Calculating
1040
Calculating
1041
Calculating
1042
Calculating
1043
Calculating
1044
Calculating
1045
Calculating
1046
Calculating
1047
Calculating
1048
Calculating
1049
Calculating
1050
Calculating
1051
Calculating
1052
Calculating
1053
Calculating
1054
Calculating
1055
Calculating
1056
Calculating
1057
Calculating
1058
Calculating
1059
Calculating
1060
Calculating
1061
Calculating
1062
Calculating
1063
Calculating
1064
Calculating
1065
Calculating
1066
Calculating
1067
Calculating
1068
Calculating
1069
Calculating
1070
Calculating
1071
Calculating
1072
Calculating
1073
Calculating
1074
Calculating
1075
Calculating
1076
Calculating
1077
Calculating
1078
Calculating
1079
Calculating
1080
Calculating
1081
Calculating
1082
Calculating
1083
Calculating
1084
Calculating
1085
Calculating
1086
Calculating
1087
Calculating
1088
Calculating
1089
Calculati

1514
Calculating
1515
Calculating
1516
Calculating
1517
Calculating
1518
Calculating
1519
Calculating
1520
Calculating
1521
Calculating
1522
Calculating
1523
Calculating
1524
Calculating
1525
Calculating
1526
Calculating
1527
Calculating
1528
Calculating
1529
Calculating
1530
Calculating
1531
Calculating
1532
Calculating
1533
Calculating
1534
Calculating
1535
Calculating
1536
Calculating
1537
Calculating
1538
Calculating
1539
Calculating
1540
Calculating
1541
Calculating
1542
Calculating
1543
Calculating
1544
Calculating
1545
Calculating
1546
Calculating
1547
Calculating
1548
Calculating
1549
Calculating
1550
Calculating
1551
Calculating
1552
Calculating
1553
Calculating
1554
Calculating
1555
Calculating
1556
Calculating
1557
Calculating
1558
Calculating
1559
Calculating
1560
Calculating
1561
Calculating
1562
Calculating
1563
Calculating
1564
Calculating
1565
Calculating
1566
Calculating
1567
Calculating
1568
Calculating
1569
Calculating
1570
Calculating
1571
Calculating
1572
Calculati

In [157]:
predictions[0:3]

[tensor([0.2416, 0.7513], grad_fn=<SigmoidBackward0>),
 tensor([0.7431, 0.2804], grad_fn=<SigmoidBackward0>),
 tensor([0.7091, 0.3028], grad_fn=<SigmoidBackward0>)]

# Check the accuracy
This is a little more difficult with the difference in output of the model. 

In [154]:
from sklearn.metrics import accuracy_score

is_dangerous = [1 if x < 0.5 else 0 for x in [x[0] for x in predictions] ]
test_labels = df[n_training_samples:]['target'].values.astype(np.int64)

accuracy = accuracy_score(is_dangerous, test_labels)
print("Accuracy:", accuracy)

Accuracy: 0.8338499690018599


# Finally let's play around with our newly trained model
And see how we can tweak words to understand what it has learned. Notice how the order of the words now matters.

In [218]:
text = "There is a very serious emergency and a fire at the lodge."
# text = "These chicken wings are fire."

# text = "mudslide on my mountain"
# text = "on my mudslide mountain"

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(loaded_model.device) for k,v in encoding.items()}

outputs = loaded_model(**encoding)

logits = outputs.logits
logits.shape

print("Logits:")
print(logits.squeeze().cpu())
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
print("Sigmoid:")
print(probs)
print("Prediction:")
print(probs[1]>0.5)

Logits:
tensor([-1.4906,  1.6541], grad_fn=<SqueezeBackward0>)
Sigmoid:
tensor([0.1838, 0.8394], grad_fn=<SigmoidBackward0>)
Prediction:
tensor(True)


# Notice the outputs
are still inpependent and non-globaly normalized.