<a href="https://colab.research.google.com/github/MathBorgess/into_pytorch/blob/master/disaster_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle (1).json


{'kaggle (1).json': b'{"username":"matheusborgess","key":"c60158f16484abb1f41fbffa330bcc22"}'}

In [2]:
%pip install -q kaggle
%pip install transformers
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c nlp-getting-started

nlp-getting-started.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
import zipfile
with zipfile.ZipFile("nlp-getting-started.zip","r") as zip_ref:
    zip_ref.extractall("data")

In [4]:
import pandas as pd
train_set = pd.read_csv('data/train.csv')
train_set.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
import torch
from torch import nn
from torch import optim
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

In [7]:
class DisasterAnalysisDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = str(self.data.loc[index, 'text'])
        label = self.data.loc[index, 'target']

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True
        )

        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.float)
        }

batch_size = 128

dataset = DisasterAnalysisDataset(train_set, tokenizer, max_input_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [10]:
bert = BertModel.from_pretrained('bert-base-uncased')

class DisasterAnalysisModel(nn.Module):
    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(DisasterAnalysisModel,self).__init__()
        self.bert = bert
        embedding_dim = bert.config.to_dict()['hidden_size']

        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)

        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):

        #text = [batch size, sent len]

        with torch.no_grad():
            embedded = self.bert(text)[0]

        #embedded = [batch size, sent len, emb dim]

        _, hidden = self.rnn(embedded)

        #hidden = [n layers * n directions, batch size, emb dim]

        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])

        #hidden = [batch size, hid dim]

        output = self.out(hidden)

        #output = [batch size, out dim]

        return output

In [11]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25
model = DisasterAnalysisModel(bert,
                            HIDDEN_DIM,
                            OUTPUT_DIM,
                            N_LAYERS,
                            BIDIRECTIONAL,
                            DROPOUT).to(DEVICE)

In [12]:
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(DEVICE)
def binary_accuracy(preds, y):
    correct = (torch.round(torch.sigmoid(preds)) == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [15]:
def train(model, iterator, optimizer: optim.Adam, criterion: nn.BCEWithLogitsLoss, device):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch['input_ids'].to(device)).squeeze(1)

        loss = criterion(predictions, batch['labels'].to(device))

        acc = binary_accuracy(predictions, batch['labels'].to(device))

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion, device):
    model.eval()
    predictions = []
    with torch.no_grad():

        for batch in iterator:
            predictions.append(model(batch['input_ids'].to(device)).squeeze(1))

    return predictions

In [None]:
import time
EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, dataloader, optimizer, criterion, DEVICE)

    end_time = time.time()

    print(f'Epoch: {epoch+1:02} | Epoch Time: {end_time - start_time}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')

In [19]:
def predict_sentiment(model, tokenizer, max_length, sentence, device):
    model.eval()
    tokens = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length= max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True
        )['input_ids'].flatten()
    tensor = torch.LongTensor(tokens).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [34]:
predict_sentiment(model, tokenizer, max_input_length, "fire, earthquake, forest", DEVICE)

0.9675390124320984