In [1]:
!pip install -Uqqq transformers sentence-transformers datasets matplotlib 
!pip install -q transformers[torch]
!pip install -q wandb

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np 
import transformers 
import re 

from datasets import Dataset,load_dataset
from transformers import DistilBertTokenizer,DataCollatorWithPadding

import matplotlib
matplotlib.use('Agg')  # Non-interactive backend
import matplotlib.pyplot as plt

In [3]:
#Preprocessing Tweets
URL_REGEX = re.compile('http(s)?:\/\/t.co\/\w+')
MENTION_REGEX = re.compile('@\w+')
BERT_MODEL = 'distilbert-base-uncased'

def clean_tweet(tweet):
    # remove mentions, the pound sign, and replace urls with URL token
    tweet = re.sub(URL_REGEX, 'url', tweet)  # replace urls with url. Assumes that the mention of a url is significant
    tweet = re.sub(MENTION_REGEX, '', tweet)  # remove mentions entirely
    tweet = tweet.replace('#', '')  # remove pound signs
    
    return tweet.strip()

In [4]:
tweets = pd.read_csv('disaster.csv')

tweets = tweets.assign(
    text = tweets['text'].apply(clean_tweet)
)

del tweets['id']
del tweets['keyword']
del tweets['location']

tweets.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this earthquake Ma...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive wildfires evacuation ord...",1
4,Just got sent this photo from Ruby Alaska as s...,1


In [5]:
# the trainer is expecting a 'label' (see the forward method in the docs)
tweets['label'] = tweets['target']
del tweets['target']

tweet_dataset = Dataset.from_pandas(tweets)

In [6]:
tweet_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 7613
})

In [7]:
bert_tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL)

bert_tokenizer(['hi', 'hello there'], truncation=True)

{'input_ids': [[101, 7632, 102], [101, 7592, 2045, 102]], 'attention_mask': [[1, 1, 1], [1, 1, 1, 1]]}

In [8]:
# simple function to batch tokenize utterances with truncation
def preprocess_function(examples):
    return bert_tokenizer(examples["text"], truncation=True)

tweet_dataset = tweet_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/7613 [00:00<?, ? examples/s]

In [9]:
# Dataset has a built in train test split method
tweet_dataset = tweet_dataset.train_test_split(test_size=0.2)

In [10]:
tweet_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 6090
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1523
    })
})

In [11]:
tweet_dataset.remove_columns('text')  # remove the text column because we don't need to keep it in memory anymore
# this is not required but speeds things up a bit

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 6090
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1523
    })
})

In [12]:
# DataCollatorWithPadding creates batch of data. It also dynamically pads text to the 
#  length of the longest element in the batch, making them all the same length. 
#  It's possible to pad your text in the tokenizer function with padding=True, dynamic padding is more efficient.
data_collator = DataCollatorWithPadding(tokenizer=bert_tokenizer)

In [13]:
bert_tokenizer.decode(tweet_dataset['train'][0]['input_ids'])

'[CLS] * blight [SEP]'

In [14]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification

sequence_classification_model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL, num_labels=2,
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False # Whether the model returns all hidden-states.
)

sequence_classification_model.config.id2label = {0: 'NOT DISASTER', 1: 'DISASTER'}

sequence_classification_model

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [15]:
from datasets import load_metric
import numpy as np

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")


In [16]:
batch_size = 32
epochs = 2

training_args = TrainingArguments(
    output_dir='./clf/results',
    logging_dir='./clf/logs',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy='steps',
    logging_first_step=True,
    load_best_model_at_end=True,
    logging_steps=1,
    evaluation_strategy='epoch',
    eval_steps=1,
    save_strategy='epoch',
    report_to="wandb",  # enable logging to W&B
)

# Define the trainer: 

trainer = Trainer(
    model=sequence_classification_model,
    args=training_args,
    train_dataset=tweet_dataset['train'],
    eval_dataset=tweet_dataset['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

In [17]:
# Get initial metrics
trainer.evaluate()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


{'eval_loss': 0.69442218542099,
 'eval_accuracy': 0.556795797767564,
 'eval_runtime': 1.0113,
 'eval_samples_per_second': 1505.977,
 'eval_steps_per_second': 47.463}

In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6153,0.392246,0.839133
2,0.1681,0.412987,0.844386


TrainOutput(global_step=382, training_loss=0.3622003828823879, metrics={'train_runtime': 24.3141, 'train_samples_per_second': 500.945, 'train_steps_per_second': 15.711, 'total_flos': 127056933342144.0, 'train_loss': 0.3622003828823879, 'epoch': 2.0})

In [19]:
trainer.save_model()  # save our best model

In [20]:
from transformers import pipeline

# make a classification pipeline
pipe = pipeline("text-classification", './clf/results', tokenizer=BERT_MODEL)

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [21]:
pipe('this is awful. Such a terrible earthquake')

[{'label': 'DISASTER', 'score': 0.7884670495986938}]

In [22]:
# show scores for all classes
pipe('this is awful. Such a terrible earthquake', return_all_scores=True)



[[{'label': 'NOT DISASTER', 'score': 0.21153301000595093},
  {'label': 'DISASTER', 'score': 0.7884670495986938}]]

In [23]:
%timeit pipe('this is awful. Such a terrible earthquake', return_all_scores=True)

64.7 ms ± 4.06 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
