In [None]:
# Mount Google Drive at /content/drive so the files stored there can be read by path (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers evaluate #Installing the transfomers and evaluate libraries

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 15.3 MB/s 
[?25hCollecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 1.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 50.8 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 77.4 MB/s 
Collecting datasets>=2.0.0
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 74.1 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2

In [None]:
#Imports
import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset, load_metric
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline,
)

In [None]:
# Importing datasets
# Single place to change if the competition files live somewhere else on Drive.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/nlp-getting-started"
t_df = pd.read_csv(f"{DATA_DIR}/train.csv")  # labelled tweets (the 'target' column is used as the label below)
ts_df = pd.read_csv(f"{DATA_DIR}/test.csv")  # tweets we predict on at the end of the notebook

We load bert-base-cased from Hugging Face using AutoModelForSequenceClassification, because we want to classify each piece of text (encoded by the tokenizer) into one of two classes (hence num_labels=2).

We also load the tokenizer from the same model.

In [None]:
# Load the pretrained checkpoint configured for two labels,
# together with the tokenizer that belongs to the same checkpoint.
checkpoint = "bert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Preview the first rows of the training DataFrame.
t_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Creating a Dataset object from our DataFrame for input into our Hugging Face model, and then creating a tokenizer function that tokenizes each piece of text with padding=True. This means that the inputs will be padded until they are the same length as the longest sequence of tokens in the batch passed to the tokenizer.

In [None]:
# Keep only the model input ('text') and the target, renamed to 'label'
# (the label column name the Hugging Face Trainer looks for).
t_df1 = t_df[['text', 'target']].rename(columns={'target': 'label'})
# Hold out 30% of the rows for evaluation. Shuffle before splitting so the held-out rows are a
# random sample rather than simply the last 30% of the file (the CSV's row order is not
# guaranteed to be random); the fixed seed keeps the split reproducible across runs.
t_ds = Dataset.from_pandas(t_df1).train_test_split(test_size=0.3, shuffle=True, seed=42)

In [None]:
# Display the split to check its parts, columns and row counts.
t_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5329
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2284
    })
})

In [None]:
def tokenize_func(examples):
  """Tokenize the "text" field of a batch of examples with padding and truncation enabled."""
  texts = examples["text"]
  encoded = tokenizer(texts, truncation=True, padding=True)
  return encoded

Applying the tokenize function to our dataset.

In [None]:
# Tokenize both parts of the split; batched=True hands tokenize_func many rows per call.
tokenized_ds = t_ds.map(function=tokenize_func, batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
# Display the tokenized split to see the columns added by the tokenizer.
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5329
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2284
    })
})

Setting the training arguments for our fine-tuning; these will be passed as an argument to the trainer. These hyperparameters and more can be optimised using Population Based Training, though this may be overkill for our application.

In [None]:
# TrainingArguments
args = TrainingArguments(
    # Directory the trainer writes its outputs to
    output_dir="bert-finetune",
    # How often evaluation occurs during training
    evaluation_strategy="epoch",
    # Number of epochs the model will train for
    num_train_epochs=3,
)

In order to calculate any metrics with a Hugging Face model, we first need to specify the metric we want using the load_metric function. In our case, the accuracy metric fortunately already exists; had it not, we could simply write our own accuracy function.

Then we write a compute_metrics function, which will be an argument for our trainer. Because Hugging Face Transformers models output logits, we take the position of the larger of the two logits (one per class) for each example and select that class as our prediction.

In [None]:
# Load the accuracy metric through the `evaluate` library (imported at the top of the notebook);
# `datasets.load_metric` is deprecated in favour of `evaluate.load`.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
  """Compute accuracy from a (logits, labels) evaluation pair using the preloaded metric."""
  model_logits, true_labels = eval_pred  # unpack the model outputs and the reference labels
  predicted_classes = np.argmax(model_logits, axis=-1)  # index of the largest logit along the last axis
  accuracy = metric.compute(predictions=predicted_classes, references=true_labels)
  return accuracy


  """Entry point for launching an IPython kernel.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Here we are setting the arguments for the Trainer.

In [None]:
# Build the Trainer from the model, training arguments, tokenized splits and metric function.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds["train"],  # Telling the model what to train with
    eval_dataset=tokenized_ds["test"],  # And what to evaluate with
    # Re-pad every batch to its own longest sequence. The rows were padded per `map` chunk,
    # so rows coming from different chunks can have different lengths, which the default
    # collator cannot stack into a single tensor.
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,  # Our compute metrics function
)

Training the model.

In [None]:
# Run fine-tuning with the trainer configured above; the returned training summary is displayed as the cell output.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5329
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2001
  Number of trainable parameters = 108311810


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5328,0.48246,0.804729
2,0.4573,0.471881,0.806918
3,0.3278,0.53099,0.820928


Saving model checkpoint to bert-finetune/checkpoint-500
Configuration saved in bert-finetune/checkpoint-500/config.json
Model weights saved in bert-finetune/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2284
  Batch size = 8
Saving model checkpoint to bert-finetune/checkpoint-1000
Configuration saved in bert-finetune/checkpoint-1000/config.json
Model weights saved in bert-finetune/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****


TrainOutput(global_step=2001, training_loss=0.4308774352728993, metrics={'train_runtime': 1809.6609, 'train_samples_per_second': 8.834, 'train_steps_per_second': 1.106, 'total_flos': 4206356442040320.0, 'train_loss': 0.4308774352728993, 'epoch': 3.0})

Here we are converting the test data into a Dataset object so we can make predictions on it using our fine-tuned model.

In [None]:
# Select the columns to carry into the test Dataset, then wrap them as a Dataset.
ts_df1 = ts_df.loc[:, ['text', 'location']]
ts_ds = Dataset.from_pandas(ts_df1)

In [None]:
# Display the test Dataset to check its columns and row count.
ts_ds

Dataset({
    features: ['text', 'location'],
    num_rows: 3263
})

In [None]:
# Tokenize the test set with the same function that was applied to the training split.
tokenized_tds = ts_ds.map(function=tokenize_func, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
# Run the fine-tuned model over the tokenized test set.
predictions = trainer.predict(test_dataset=tokenized_tds)

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, location. If text, location are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3263
  Batch size = 8


In [None]:
# Inspect the raw model outputs: one row per test example, one logit per class.
predictions.predictions

array([[-2.2305498 ,  1.6319425 ],
       [-2.199341  ,  1.6803197 ],
       [-2.2218883 ,  1.7034279 ],
       ...,
       [-2.273435  ,  1.8698342 ],
       [-2.323105  ,  1.8679914 ],
       [ 0.11133587, -0.01283309]], dtype=float32)

In [None]:
# Predicted class for each test row = position of the larger logit in that row.
p1 = predictions.predictions.argmax(axis=1)

In [None]:
# Display the predicted class indices.
p1

array([1, 1, 1, ..., 1, 1, 0])

In [None]:
# Attach the predicted classes to the test DataFrame as a 'target' column (this modifies ts_df in place).
ts_df['target'] = p1

In [None]:
# Display the test DataFrame, now including the 'target' column.
ts_df

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",1
2,3,,,"there is a forest fire at spot pond, geese are...",1
3,9,,,Apocalypse lighting. #Spokane #wildfires,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1
...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0
3259,10865,,,Storm in RI worse than last hurricane. My city...,1
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,1
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,1


In [None]:
# Keep only the two columns that are written to the output file.
ts_df_final = ts_df.loc[:, ['id', 'target']]

In [None]:
# Preview the first rows of the frame that will be written out.
ts_df_final.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [None]:
# Write the frame as 'out.csv' inside the zip archive 'out.zip', leaving out the index column.
compression_opts = {'method': 'zip', 'archive_name': 'out.csv'}
ts_df_final.to_csv('out.zip', compression=compression_opts, index=False)