<a href="https://colab.research.google.com/github/darkknight314/NLP-Disaster-Tweet-Detection/blob/master/NLP_with_DistilBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [2]:
cd /content/drive/My Drive/NLP Disaster Tweet

/content/drive/My Drive/NLP Disaster Tweet


In [3]:
import pandas as pd
import numpy as np
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader

In [4]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(device)

cuda


In [5]:
# Read the training CSV; the `text` column supplies the model input and the
# `target` column is used as the label for each row.
train_data = pd.read_csv("train.csv")
train_text, train_labels = train_data.text.values, train_data.target.values
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode every sentence to a fixed width of 200 token ids plus an attention mask.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` makes the length cap explicit so that every encoded row has
# exactly the same width — a requirement for the `torch.cat` calls below.
input_ids = []
attention_masks = []
for sentence in train_text:
    encoded_dict = tokenizer.encode_plus(sentence,
                                         max_length=200,
                                         padding='max_length',
                                         truncation=True,
                                         return_attention_mask=True,
                                         return_tensors='pt')
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Each per-sentence result is a tensor with a leading batch dimension of one
# (return_tensors='pt'); concatenating stacks them into one tensor per field.
input_ids = torch.cat(input_ids)
attention_masks = torch.cat(attention_masks)
labels = torch.tensor(train_labels)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [6]:
# Bundle the three aligned tensors so each dataset item is an
# (input_ids, attention_mask, label) triple.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 90% of the examples go to training; the remainder is held out for validation.
train_size = int(0.9*len(dataset))
val_size = len(dataset) - train_size

# Seed the split through a dedicated generator so the same examples end up in
# the validation set on every run; without it the reported validation metrics
# are not comparable between runs.
SPLIT_SEED = 42
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
print("Train size: ", train_size)
print("Validation size: ", val_size)

Train size:  6851
Validation size:  762


In [7]:
# Mini-batch size shared by the training and validation loaders.
BATCH_SIZE = 16

# Training batches are reshuffled each epoch; validation order is kept fixed.
# Both loaders use two worker processes to load batches.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

In [8]:
# Number of full passes over the training loader.
EPOCHS = 2

# Load pretrained DistilBERT with a 2-label sequence-classification head and
# move it to the selected device. Hidden states and attention maps are not
# requested from the model.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
    output_hidden_states=False,
    output_attentions=False,
).to(device)

# NOTE(review): `AdamW` here is the class imported from `transformers`; later
# transformers releases deprecate it in favour of `torch.optim.AdamW` — confirm
# against the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5)

# The scheduler is stepped once per batch, so the schedule length is the total
# number of optimizer steps across all epochs.
total_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




In [9]:
def calculate_accuracy(logits, labels):
    """Count the predictions in a batch that match their labels.

    Despite the name, the result is the *number* of correct predictions, not a
    ratio; the caller accumulates these counts and divides by the dataset size.

    Args:
        logits: tensor of per-class scores; the predicted class of each row is
            the arg-max along axis 1.
        labels: tensor of reference class indices, one per row of ``logits``.

    Returns:
        NumPy integer scalar with the number of rows whose predicted class
        equals the label.
    """
    predicted = np.argmax(logits.detach().cpu().numpy(), axis=1)
    expected = labels.detach().cpu().numpy()
    return np.sum(predicted == expected)

In [11]:
# Fine-tune for EPOCHS passes; after each pass, evaluate on the validation set.
for epoch in range(EPOCHS):
    print("TRAINING EPOCH : ",epoch)

    # ---------------- training pass ----------------
    model.train()
    total_train_loss = 0.0
    for step, batch in enumerate(train_loader):
        # Every 10th batch, report the mean loss of the `step` batches
        # processed so far (the current batch has not been added yet).
        if step%10==0 and step != 0:
            print("Batch number: ",step, "Avg training loss: ", total_train_loss/step)
        b_input = batch[0].to(device)
        b_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        # Index the result instead of tuple-unpacking it: newer transformers
        # releases return a dict-like ModelOutput whose iteration yields the
        # field *names*, so `loss, logits = model(...)` would bind strings.
        # Positional indexing works for both plain tuples and ModelOutput.
        outputs = model(b_input,
                        attention_mask = b_mask,
                        labels = b_labels)
        loss = outputs[0]
        total_train_loss += loss.item()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_train_loss/len(train_loader)
    print("Avg training loss: ", avg_train_loss)

    # ---------------- validation pass ----------------
    model.eval()
    total_eval_loss = 0.0
    eval_running_accuracy = 0.0   # running count of correct predictions
    base_accuracy = 0.0           # running sum of the label values
    # Gradients are not needed for evaluation; disable them for the whole pass.
    with torch.no_grad():
        for step, batch in enumerate(val_loader):
            b_input = batch[0].to(device)
            b_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            base_accuracy += torch.sum(b_labels).item()
            outputs = model(b_input,
                            attention_mask = b_mask,
                            labels = b_labels)
            loss, logits = outputs[0], outputs[1]

            total_eval_loss +=loss.item()

            eval_running_accuracy += calculate_accuracy(logits, b_labels)
    print("Avg Eval Loss: ",total_eval_loss/len(val_loader))
    # ZeroR baseline: always predict the majority class. With 0/1 labels the
    # label sum divided by the set size is the share of class 1.
    print("Base Accuracy by ZeroR: ", max(base_accuracy/val_size,1-base_accuracy/val_size))
    print("Eval accuracy: ", eval_running_accuracy/val_size)



TRAINING EPOCH :  0
Batch number:  10 Avg training loss:  0.24492996782064438
Batch number:  20 Avg training loss:  0.24366549886763095
Batch number:  30 Avg training loss:  0.24696029697855312
Batch number:  40 Avg training loss:  0.25910961274057626
Batch number:  50 Avg training loss:  0.2553685140609741
Batch number:  60 Avg training loss:  0.2562888233611981
Batch number:  70 Avg training loss:  0.2598535322717258
Batch number:  80 Avg training loss:  0.26704302644357086
Batch number:  90 Avg training loss:  0.2595378298726347
Batch number:  100 Avg training loss:  0.2615086667239666
Batch number:  110 Avg training loss:  0.25761914469979025
Batch number:  120 Avg training loss:  0.2606455112497012
Batch number:  130 Avg training loss:  0.2587973081148588
Batch number:  140 Avg training loss:  0.2630460672080517
Batch number:  150 Avg training loss:  0.26332449227571486
Batch number:  160 Avg training loss:  0.2655961964279413
Batch number:  170 Avg training loss:  0.2674216949764