In [1]:
import torchtext
from torchtext import data
from torchtext.data import Field
import pandas as pd
import numpy as np
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import preprocessor as p
import torch.optim as optim
from cnvrt_csv_to_json import csv_to_json
from transformers import BertTokenizer
from transformers import BertTokenizer, BertModel

In [2]:
nlp = spacy.load('en')

In [3]:
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED)

In [4]:
stopwords = set(stopwords.words('english'))

In [5]:
stopwords.add("'s")
stopwords.add("...")

In [6]:
data_set = pd.read_csv('train.csv')

In [7]:
data_set[60:75]

Unnamed: 0,id,keyword,location,text,target
60,85,ablaze,,Revel in yours wmv videos by means of mac fare...,0
61,86,ablaze,Inang Pamantasan,Progressive greetings!\n\nIn about a month stu...,0
62,89,ablaze,Twitter Lockout in progress,Rene Ablaze &amp; Jacinta - Secret 2k13 (Falle...,0
63,91,ablaze,"Concord, CA",@Navista7 Steve these fires out here are somet...,1
64,92,ablaze,"Calgary, AB",#NowPlaying: Rene Ablaze &amp; Ian Buff - Magn...,0
65,93,ablaze,Birmingham,@nxwestmidlands huge fire at Wholesale markets...,1
66,95,ablaze,San Francisco,@ablaze what time does your talk go until? I d...,0
67,96,accident,CLVLND,'I can't have kids cuz I got in a bicycle acci...,0
68,97,accident,"Nashville, TN",Accident on I-24 W #NashvilleTraffic. Traffic ...,1
69,98,accident,"Santa Clara, CA",Accident center lane blocked in #SantaClara on...,1


In [8]:
def impute_keyword(x):
    token_pos = {}
    doc = nlp(x)
    for token in doc:
        token_pos[token.pos_] = token
    #print(token_pos)
    if "NOUN" in token_pos.keys():
        if len(token_pos["NOUN"]) > 2:
            return token_pos["NOUN"]
    elif "ADJ" in token_pos.keys():
        if len(token_pos["ADJ"]) > 2:
            return token_pos["ADJ"]
    elif "PROPN" in token_pos.keys():
        if len(token_pos["PROPN"]) > 2:
            return token_pos["PROPN"]
    elif "VERB" in token_pos.keys():
        if len(token_pos["VERB"]) > 2:
            return token_pos["VERB"]
    elif "ADV" in token_pos.keys():
        if len(token_pos["ADV"]) > 2:
            return token_pos["ADV"]
    
    else:
        return "unknown"

In [9]:
def text_preprocessing(x):
    x = str(x)
    
    #cleaning url, @name, emojis, smiley etc
    text = p.clean(x)
    
    #punctation removal
    no_punct = [word for word in word_tokenize(text) if word not in string.punctuation]
    
    #stopword removal
    no_sw = set(wrd for wrd in no_punct if wrd not in stopwords)
    
    return " ".join(w.lower() for w in no_sw).strip()

In [10]:
def clean_keyword(x):
    if "%20" in str(x):
        x = x.replace("%20", " ")
    return x

In [11]:
data_set["keyword"] = data_set["keyword"].apply(clean_keyword)

In [13]:
for idx, text in zip(data_set[data_set.keyword.isna()]["text"].index, data_set[data_set.keyword.isna()]["text"]):
    text = text_preprocessing(text)
    keyword = impute_keyword(text)
    data_set["keyword"][idx] = keyword

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [14]:
data_set["text"] = data_set["text"].apply(text_preprocessing)

In [15]:
data_set["text"] = data_set["keyword"].astype(str) +" "+ data_set["text"]

In [16]:
data_set[:5500][["text","target"]].to_csv("data/train.csv",index=False)

In [17]:
data_set[5500:][["text","target"]].to_csv("data/valid.csv",index=False)

In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
len(tokenizer.vocab)

30522

In [19]:
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [20]:
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [21]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

print(max_input_length)

512


In [22]:
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    return tokens

In [23]:
#nlp = spacy.load('en')

In [24]:
#p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED)

In [26]:
#convert csv to json

In [27]:
csv_to_json("data/train.csv", "data/train.json", ("text", "target"))
csv_to_json("data/valid.csv", "data/valid.json", ("text", "target"))

In [66]:
test_csv = pd.read_csv("data/test.csv")

In [67]:
test_csv["text"] = test_csv["text"].apply(text_preprocessing)

In [68]:
for idx, text in zip(test_csv[test_csv.keyword.isna()]["text"].index, test_csv[test_csv.keyword.isna()]["text"]):
    #text = text_preprocessing(text)
    keyword = impute_keyword(text)
    test_csv["keyword"][idx] = keyword

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [69]:
test_csv["text"] = test_csv["keyword"].astype(str) +" "+ test_csv["text"]

In [71]:
test_csv[["text","id"]].to_csv("data/test.csv",index=False)

In [72]:
csv_to_json("data/test.csv", "data/test.json", ("text", "id"))

### Creating Dataset using TabularDataset from Torchtext

In [108]:
TEXT = Field(
    batch_first=True,
    tokenize = tokenize_and_cut,
    stop_words=stopwords,
    use_vocab=False,
    preprocessing=tokenizer.convert_tokens_to_ids,
    init_token = init_token_idx,
    eos_token = eos_token_idx,
    pad_token = pad_token_idx,
    unk_token = unk_token_idx,         
        )

LABEL = data.LabelField(dtype = torch.float)

train, valid = data.TabularDataset.splits(path='data/',
                                   train='train.json',
                                   validation='valid.json',
                                   skip_header = True,
                                   format='json',
                            fields={
                                'text': ('text', TEXT),
                                'target': ('labels', LABEL)
                                   }
                           )

In [109]:
TEXT.build_vocab(train)
LABEL.build_vocab(train)

In [110]:
#TEXT.build_vocab(tweet, max_size=10000)

In [111]:
#LABEL.build_vocab(tweet)

In [112]:
LABEL.vocab.freqs

Counter({'1': 2298, '0': 3202})

In [113]:
#TEXT.vocab.freqs.pop('') # remove empty string

In [114]:
#train, valid = tweet.split(split_ratio=0.8)

In [115]:
print(vars(train.examples[6]))

{'text': [5320, 7071, 9451, 4542, 2752, 4534, 5169, 7186, 3082, 23624, 24826, 5956, 6076, 5320], 'labels': '1'}


### Convert Dataset into Iterator 

In [43]:
device = torch.device('cuda')

### BucketIterator - 

>**Shuffles the data in iterator**

>**It is recommended that for test data we should use just itertor, since we are evaluating the model.**

In [44]:
train_iter, val_iter, test_iter = data.BucketIterator.splits((train, valid, None),batch_sizes=(128, 128, 0),
                                                            sort_key=lambda x: len(x.text), device=device)

In [45]:
type(train_iter.data()[0].text)

list

In [46]:
type(val_iter.data()[99].text)

list

In [99]:
%%time
test = data.TabularDataset(path='data/test.json', format='json',
                           skip_header=True,
                            fields={
                                'text': ('text', TEXT),
                                }
                           )

CPU times: user 772 ms, sys: 0 ns, total: 772 ms
Wall time: 771 ms


In [100]:
test_iter = data.Iterator(dataset=test, device=device, batch_size=128, sort=False)

## Model Building

In [79]:
bert = BertModel.from_pretrained('bert-base-uncased')

In [80]:
class BERTGRUClassification(nn.Module):
    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):
        
        super().__init__()
        
        self.bert = bert
        
        embedding_dim = bert.config.to_dict()['hidden_size']
        
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)
        
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [batch size, sent len]
                
        with torch.no_grad():
            embedded = self.bert(text)[0]
                
        #embedded = [batch size, sent len, emb dim]
        
        _, hidden = self.rnn(embedded)
        
        #hidden = [n layers * n directions, batch size, emb dim]
        
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
                
        #hidden = [batch size, hid dim]
        
        output = self.out(hidden)
        
        #output = [batch size, out dim]
        
        return output

In [81]:
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUClassification(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)

In [82]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 112,241,409 trainable parameters


In [83]:
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

In [84]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,759,169 trainable parameters


In [85]:
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


In [86]:
optimizer = optim.AdamW(model.parameters())

In [87]:
criterion = nn.BCEWithLogitsLoss()

In [88]:
model = model.to(device)
criterion = criterion.to(device)

In [89]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [90]:
model

BERTGRUClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [91]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.labels)
        
        acc = binary_accuracy(predictions, batch.labels)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [92]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.labels)
            
            acc = binary_accuracy(predictions, batch.labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [93]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [94]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, val_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 7s
	Train Loss: 0.564 | Train Acc: 71.42%
	 Val. Loss: 0.491 |  Val. Acc: 75.86%
Epoch: 02 | Epoch Time: 0m 8s
	Train Loss: 0.455 | Train Acc: 80.20%
	 Val. Loss: 0.479 |  Val. Acc: 77.37%
Epoch: 03 | Epoch Time: 0m 8s
	Train Loss: 0.433 | Train Acc: 81.31%
	 Val. Loss: 0.479 |  Val. Acc: 77.37%
Epoch: 04 | Epoch Time: 0m 8s
	Train Loss: 0.431 | Train Acc: 81.36%
	 Val. Loss: 0.502 |  Val. Acc: 77.18%
Epoch: 05 | Epoch Time: 0m 7s
	Train Loss: 0.409 | Train Acc: 82.11%
	 Val. Loss: 0.539 |  Val. Acc: 75.66%
Epoch: 06 | Epoch Time: 0m 8s
	Train Loss: 0.401 | Train Acc: 82.96%
	 Val. Loss: 0.502 |  Val. Acc: 77.41%
Epoch: 07 | Epoch Time: 0m 8s
	Train Loss: 0.376 | Train Acc: 83.98%
	 Val. Loss: 0.507 |  Val. Acc: 77.23%
Epoch: 08 | Epoch Time: 0m 8s
	Train Loss: 0.360 | Train Acc: 84.76%
	 Val. Loss: 0.588 |  Val. Acc: 73.69%
Epoch: 09 | Epoch Time: 0m 8s
	Train Loss: 0.343 | Train Acc: 85.58%
	 Val. Loss: 0.558 |  Val. Acc: 74.89%
Epoch: 10 | Epoch Time: 0m 8

In [95]:
model.load_state_dict(torch.load('tut1-model.pt'))

<All keys matched successfully>

In [101]:
test_preds = []

with torch.no_grad():
    for batch in test_iter:
        predictions = model(batch.text).squeeze(1)
        preds = predictions.data.cpu().numpy()
        #preds = preds.data.numpy()
        # the actual outputs of the model are logits, so we need to pass these values to the sigmoid function
        preds = 1 / (1 + np.exp(-preds))
        test_preds.append(preds)

In [104]:
predictions = np.where(np.concatenate(test_preds, axis=0) > 0.4, 1, 0)

In [105]:
test_data = pd.read_csv('data/test.csv')
test_data["target"] = predictions
submission_file = test_data[['id', 'target']]
submission_file.to_csv("bert_txt_Submission_file.csv", index=False)