In [1]:
import numpy as np
import pandas as pd
import random
import os 
import warnings
warnings.filterwarnings('ignore')


for dir_, _, file in os.walk(r'../input/nlp-getting-started'):
    for file_name in file:
        print(os.path.join(dir_, file_name))
        
        
train = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')
submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')


# --------------------------------- Text Preprocessing -------------------------------------------

import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from collections import Counter
import re

stop_words = set(stopwords.words('english'))


def remove_punctuation(text):
    text = ' '.join([word for word in text.split() if word not in string.punctuation])
    return text

def remove_stopwords(text):
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text
    



from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {'N': wordnet.NOUN, 'V':wordnet.VERB, 'J':wordnet.ADJ, 'R':wordnet.ADV}

def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return ' '.join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])


def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

def remove_urls(text):
#     url = re.compile(r'https?://\S+|www\.\S+')
    text = re.sub(r'https?://[\w]*[\.\\\/\w\d]*', '', text)
#     text = url.sub(r'', text)
    text = re.sub(r'[\x80-\xFF]+', '', text)
    return text

def remove_symbols(text):
    symbols = re.compile(r'[@#?\'\":\[\]]*')
    return re.sub(r'[@#?\'\":\[\]]*', '', text)

def create_vocab(data):
    vocab = []
    for words in data['lemmatized_text'].values:
        for word in words.split():
            vocab.append(word)
    vocab_dict = Counter(vocab)
    freq_words = [word for word, counts in vocab_dict.most_common(50)]
    rare_word = [word for word, counts in vocab_dict.most_common()[:-50:-1]]
    return {'vocab_dict':vocab_dict
           ,'freq_words':freq_words
           ,'rare_word': rare_word}

# freq_words = []
# for word, counts in vocab_dict.most_common(50):
#     freq_words.append(word)

# rare_word = []
# rare = vocab_dict.most_common()[:-50:-1]
# for word, counts in rare:
#     rare_word.append(word)


def frequent_words(sentence, freq):
    
    text = ' '.join([word for word in sentence.split() if word not in freq])
    return text
    
def rare_words_removal(sentence, rare):
    text = ' '.join([word for word in sentence.split() if word not in rare])
    return text


def text_preprocessing(train):
    data = train.copy()
    data['text'] = data['text'].str.lower()
    data['text_rm_punctuation'] = data['text'].apply(lambda x: remove_punctuation(x))
    data['text_rm_stopwords'] = data['text_rm_punctuation'].apply(lambda x: remove_stopwords(x))
    data['lemmatized_text'] = data['text_rm_stopwords'].apply(lambda x: lemmatize_words(x))
    data['lemmatized_text'] = data['lemmatized_text'].apply(lambda x: remove_html_tags(x))
    data['lemmatized_text'] = data['lemmatized_text'].apply(lambda x: remove_urls(x))
    data['lemmatized_text'] = data['lemmatized_text'].apply(lambda x: remove_symbols(x))
   
    vocab = []
    for words in data['lemmatized_text'].values:
        for word in words.split():
            vocab.append(word)
    vocab_dict = Counter(vocab)
    freq_words = [word for word, counts in vocab_dict.most_common(50)]
    rare_word = [word for word, counts in vocab_dict.most_common()[:-50:-1]]
    freq_rare = {'vocab_dict':vocab_dict
           ,'freq_words':freq_words
           ,'rare': rare_word}


    freq_words  = freq_rare['freq_words']
    rare_words =  freq_rare['rare']
    
    print(type(rare_words))
    
    data['lemmatized_text'] = data['lemmatized_text'].apply(lambda x: rare_words_removal(x, rare_words))
    data['lemmatized_text'] = data['lemmatized_text'].apply(lambda x: frequent_words(x, freq_words))
    
    return data
    
    
    
train_data  = text_preprocessing(train)
test_data = text_preprocessing(test)
train_data


../input/nlp-getting-started/sample_submission.csv
../input/nlp-getting-started/train.csv
../input/nlp-getting-started/test.csv
<class 'list'>
<class 'list'>


Unnamed: 0,id,keyword,location,text,target,text_rm_punctuation,text_rm_stopwords,lemmatized_text
0,1,,,our deeds are the reason of this #earthquake m...,1,our deeds are the reason of this #earthquake m...,deeds reason #earthquake may allah forgive us,deed reason earthquake may allah forgive
1,4,,,forest fire near la ronge sask. canada,1,forest fire near la ronge sask. canada,forest fire near la ronge sask. canada,forest near la ronge sask. canada
2,5,,,all residents asked to 'shelter in place' are ...,1,all residents asked to 'shelter in place' are ...,residents asked 'shelter place' notified offic...,resident ask shelter place notify officers. ev...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or...","13,000 people receive #wildfires evacuation or...","13,000 receive wildfires evacuation order"
4,7,,,just got sent this photo from ruby #alaska as ...,1,just got sent this photo from ruby #alaska as ...,got sent photo ruby #alaska smoke #wildfires p...,sent photo ruby alaska smoke wildfires pour sc...
...,...,...,...,...,...,...,...,...
7608,10869,,,two giant cranes holding a bridge collapse int...,1,two giant cranes holding a bridge collapse int...,two giant cranes holding bridge collapse nearb...,two giant crane hold bridge collapse nearby
7609,10870,,,@aria_ahrary @thetawniest the out of control w...,1,@aria_ahrary @thetawniest the out of control w...,@aria_ahrary @thetawniest control wild fires c...,aria_ahrary thetawniest control wild even nort...
7610,10871,,,m1.94 [01:04 utc]?5km s of volcano hawaii. htt...,1,m1.94 [01:04 utc]?5km s of volcano hawaii. htt...,m1.94 [01:04 utc]?5km volcano hawaii. http://t...,m1.94 0104 utc5km volcano hawaii.
7611,10872,,,police investigating after an e-bike collided ...,1,police investigating after an e-bike collided ...,police investigating e-bike collided car littl...,investigate e-bike collide car little portugal...


In [2]:
vocab = []
for words in train_data['lemmatized_text'].values:
    for word in words.split():
        vocab.append(word)
vocab_dict = Counter(vocab)

In [3]:
len(vocab_dict.most_common())

19046

In [4]:
unfrequent = []
for i in range(99):
    unfrequent.append(vocab_dict.most_common()[-100:-1][i][0])
# vocab_dict.most_common()[-100:-1]

In [5]:
unfrequent_strings = ' '.join([i for i in unfrequent])
unfrequent_strings

'metal) slower squeaver hangin watchin septic captainn_morgan friggin destiel (read description) gazette pedals. aqua... memenaar kindof restrospect _pokemoncards_ icequeenfroslas artectura pop2015 n36 florence 1979 gimp newave progressives. live-streaming girlthatsrio worse. georgefoster72 edmund fitzgerald blockage woodward northbound davison m.s. shoalstraffic blinker -))) misscharleywebb indeed! by! greer amazondeals skylanders 4.53% ($0.45) $9.49 $9.94 ralph titortau lynch hi-larious realtime. fact-checking ombudsmanship. raineishida lol...im nervous takeaway magnificent (vice news) victims janeenorman probability kuala lumpur washed plot! *rolling eyes* ajabrown ministersays buzzfeed first-ever 777 239 najibrazak malaysiaairlines yahoonewsdigest julian_lage grantgordy rossmartin7 pastie industrial stare (costing $100 apiece) liv oliviaapalmerr thatswhatfriendsarefor audi land. kunstler residualincome'

In [6]:
import torch
import torchtext
glove_emb = torchtext.vocab.GloVe(name = '6B', dim = 200)

print(glove_emb.vectors.size())
print(glove_emb.stoi['good'])
print(glove_emb.itos[219])


.vector_cache/glove.6B.zip: 862MB [02:43, 5.28MB/s]                           
100%|█████████▉| 399999/400000 [00:31<00:00, 12555.70it/s]


torch.Size([400000, 200])
219
good


In [7]:
seed_no = 33

os.environ['PYTHONHASHSEED'] = str(seed_no)
random.seed(seed_no)
np.random.seed(seed_no)
torch.manual_seed(seed_no)
torch.cuda.manual_seed_all(seed_no)
torch.backends.cudnn.deterministic = True

In [8]:
# get the indices of the lemmatized text column

def glove_index(text, glove_emb):
    idx = [glove_emb.stoi[word] for word in text.split() if word in glove_emb.stoi]
    return idx
    

In [9]:
train_data['glove_index'] = train_data['lemmatized_text'].apply(lambda x: glove_index(x, glove_emb))
test_data['glove_index'] = test_data['lemmatized_text'].apply(lambda x: glove_index(x, glove_emb))

In [10]:
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
from keras.preprocessing import text, sequence 
MAX_LEN = 150

class TextDatset(data.Dataset):
    def __init__(self, text, lens, text_id, y = None):
        self.text = text
        self.lens = lens
        self.text_id = text_id
        self.y = y
    
    def __len__(self):
        return len(self.lens)
    
    def __getitem__(self, idx):
        if self.y is None:
            return self.text[idx], self.lens[idx], self.text_id[idx]
        return self.text[idx], self.lens[idx], self.y[idx], self.text_id[idx]
    

class Collator(object):
    def __init__(self, test = False, percentile = 100):
        self.test = test
        self.percentile = percentile
        
    def __call__(self, batch):
        
        global MAX_LEN
        
        if self.test:
            texts, lens, text_id = zip(*batch)
        else:
            texts, lens, target, text_id = zip(*batch)
            
        lens = np.array(lens)
        
#         max_len = min(int(np.percentile(lens, self.percentile)), MAX_LEN)
        max_len = max(lens)
        
#         print(f'the sequence length is {max_len}')
        texts = torch.tensor(sequence.pad_sequences(texts, maxlen = max_len), dtype = torch.long)
        
        if self.test:
            return texts, text_id
        
        return texts, torch.tensor(target, dtype = torch.long), text_id
#         return 
    
            

In [11]:
input_size = 200
num_layers = 3
hidden_size = 256
num_classes = 2
batch_size = 64
num_epochs = 3

In [12]:
def accuracy(pred, target):
    pred = pred.max(1)[1]
    correct = (pred == target).long().sum().item()
    acc = correct / len(target)
#     return acc, correct, len(target)
    return acc

# accuracy(pred, target)

In [13]:
class Attention(nn.Module):
    def __init__(self, features, batch_size):
    
        # features and sequences are nothing but the input size and sequence length
        super(Attention, self).__init__()
        self.batch_size = batch_size
        self.features = features
#         self.sequence = sequence
#         self.bias = True
        
        self.W = torch.zeros(features, 1)
        
#         if bias:
#             self.b = torch.zeros(sequence , 1)
        
        
        
    def forward(self, x, sequence, bias = True, msg = False):
        
        
#         print(x.contiguous().view(-1, self.features).shape, self.W.shape)
        eij = torch.mm(x.contiguous().view(-1, self.features), self.W).view(-1, sequence)
        
        if msg:
            print(f'---The shape of the input given to attention layer {x.shape} \n', )
            print(f'---The shape of the input after multiplying it with weights {eij.shape} \n')
        if bias:
            eij = eij + torch.zeros(sequence)
            
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        
        if msg: 
            print(f'---The shape of the input after tanh and exponentiating {a.shape}')
        a = a / (torch.sum(a, 1).view(x.size(0), -1) + 1e-10)
        
        weighted_input = x * torch.unsqueeze(a, -1)
        if msg:
            print(f'---The shape of the attention {a.shape} \n')
            print(f'---The shape of the final output from the attention layer {weighted_input.shape} and after summing {torch.sum(weighted_input, 1).shape} \n')
        return torch.sum(weighted_input, 1)

In [14]:

class textRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, batch_size, model_type = 'RNN'):
        super(textRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.batch_size = batch_size
        self.model_type = model_type
        self.embedding = nn.Embedding.from_pretrained(glove_emb.vectors, freeze = True)
        
        if model_type == 'RNN':
            self.rnn = nn.RNN(input_size, hidden_size, num_layers, bidirectional = True, batch_first = True)
        else:
            self.rnn = nn.LSTM(input_size, hidden_size, num_layers, bidirectional = True, batch_first = True)
         
        self.attention = Attention(2*hidden_size, batch_size)
#         self.ff = nn.Linear(2*hidden_size, 50)
#         self.ff1 = nn.Linear(50, num_classes)
        self.ff = nn.Linear(2*hidden_size, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, x, msg = True):
        
        h0 = torch.zeros(2*num_layers, x.size(0), self.hidden_size)
        c0 = torch.zeros(2*num_layers, x.size(0), self.hidden_size)
        #x should be of shape (batch_size, L, input_size)
        
        if msg:
            print(f'The dimension of hidden layers and inputs before passing to the model \n')
            print(f'...... \n')
            print(f'---Input shape {x.shape} \n')
            print(f'---hidden layer shape {h0.shape} \n')
        
        x = self.embedding(x)
#         x = x.permute(1, 0, 2)
        if self.model_type == 'RNN':
            output, hidden_state = self.rnn(x, h0)
        else:
            output, (hidden_state, cell_state) = self.rnn(x, (h0, c0))
        
        #output is of shape (batch_size, L, hidden_size)
        #final hidden state of size (num_layers, batch_size, hidden_size)
        
       
        #out = self.ff(final_hidden_state.view(x.size(1), -1))
        att = self.attention(output, output.size(1), msg = False)
        out = self.ff(att)
#         out = self.dropout(self.ff(output[:, -1, :]))
        out = (self.dropout(out))
        
        
        
        if msg:
            print(f'The dimension of final hidden state and output state after passing to the model \n')
            print(f'....... \n')
            print(f'---Input shape {x.shape} \n')
            print(f'---Final hidden state {hidden_state.shape} \n')
            print(f'---output of the last layer at each time step {output.shape} \n')
            print(f'---Final output shape {out.shape} \n')

        
        return out 

In [15]:
train_lengths = train_data['glove_index'].apply(lambda x: len(x))
train_text = train_data['glove_index']
target_train = train_data['target']
train_id = train_data['id']


In [16]:
from sklearn.model_selection import StratifiedKFold

x_train = pd.DataFrame({'glove_index' : train_data['glove_index']
                            , 'train_id': train_id
                            , 'train_lengths': train_lengths
                            })
y_train = train_data['target']



In [17]:
sf = StratifiedKFold(n_splits = 5, random_state = 3, shuffle = True)

In [18]:
def training(model, train, train_target, valid, valid_target, num_epochs, validate = True, learning_rate = 0.005):

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
    
    print('Training...')
    
    #for fold, (train_ind, val_ind) in enumerate(sf.split(x_train, y_train)):
    
    train_text, train_lengths, train_id, target_train = train['glove_index'].reset_index(drop = True), train['train_lengths'].reset_index(drop = True), train['train_id'].reset_index(drop = True), train_target.reset_index(drop = True)
    
    train_collate = Collator(percentile = 100)
    train_dataset = TextDatset(train_text, train_lengths, train_id, target_train)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = 64, collate_fn = train_collate)
    
    valid_text, valid_lengths, valid_id, valid_target = valid['glove_index'].reset_index(drop = True), valid['train_lengths'].reset_index(drop = True), valid['train_id'].reset_index(drop = True), valid_target.reset_index(drop = True)


    valid_dataset = TextDatset(valid_text, valid_lengths, valid_id, valid_target)
    valid_collate = Collator(percentile = 100, test = False)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size = 64, collate_fn = valid_collate)


    for epoch in range(num_epochs):
        acc_train = 0
        loss_train = 0
        model.train()
        for batch, (data, target, data_id) in enumerate(train_loader):

            pred = model(data, msg = False)

            optimizer.zero_grad()
            loss = criterion(pred, target)



            loss.backward()

            optimizer.step()

            loss_train += criterion(pred, torch.tensor(target, dtype = torch.long))
            acc_train += accuracy(pred, target)

        
        print(f'Epoch {epoch+1}  |  loss {loss_train / batch}  |  accuracy {acc_train / batch} | {batch}')
        
#         model.eval()
        
    if validate:
#             print('Validating....')
        val_loss = 0
        val_acc = 0 

        for batch, (data, target, data_id) in enumerate(valid_loader):

            pred = model(data, msg = False).detach()
            val_loss += criterion(pred, torch.tensor(target, dtype = torch.long))
            val_acc += accuracy(pred, target)

        
    print(f'Validating...')
    print(f'validation loss {val_loss / batch}  |  accuracy {val_acc / batch} \n')

In [19]:
train_collate = Collator(percentile = 100)
train_dataset = TextDatset(train_text, train_lengths, train_id, target_train)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = 64, collate_fn = train_collate)
data, target, data_id = next(iter(train_loader))

model = textRNN(input_size, hidden_size, num_layers, num_classes, batch_size, model_type = 'LSTM')
model(data, msg = True)


The dimension of hidden layers and inputs before passing to the model 

...... 

---Input shape torch.Size([64, 13]) 

---hidden layer shape torch.Size([6, 64, 256]) 

The dimension of final hidden state and output state after passing to the model 

....... 

---Input shape torch.Size([64, 13, 200]) 

---Final hidden state torch.Size([6, 64, 256]) 

---output of the last layer at each time step torch.Size([64, 13, 512]) 

---Final output shape torch.Size([64, 2]) 



tensor([[-0.0078,  0.0374],
        [-0.0132,  0.0364],
        [-0.0040,  0.0349],
        [-0.0078,  0.0000],
        [-0.0078,  0.0291],
        [-0.0075,  0.0391],
        [-0.0000,  0.0362],
        [-0.0105,  0.0351],
        [-0.0101,  0.0345],
        [-0.0112,  0.0364],
        [-0.0091,  0.0384],
        [-0.0071,  0.0364],
        [-0.0079,  0.0406],
        [-0.0080,  0.0375],
        [-0.0091,  0.0398],
        [-0.0107,  0.0346],
        [-0.0105,  0.0377],
        [-0.0112,  0.0376],
        [-0.0112,  0.0371],
        [-0.0111,  0.0370],
        [-0.0111,  0.0370],
        [-0.0096,  0.0361],
        [-0.0100,  0.0365],
        [-0.0112,  0.0363],
        [-0.0000,  0.0370],
        [-0.0095,  0.0000],
        [-0.0105,  0.0000],
        [-0.0108,  0.0000],
        [-0.0110,  0.0369],
        [-0.0103,  0.0357],
        [-0.0111,  0.0000],
        [-0.0092,  0.0385],
        [-0.0089,  0.0357],
        [-0.0080,  0.0383],
        [-0.0000,  0.0000],
        [-0.0000,  0

In [20]:
# data_id = list(data_id)
# np.array([train_data[train_data['id'].isin(data_id)].glove_index.apply(lambda x: len(x))]).max()

In [21]:
# KFOLD validation 

for fold, (train_ind, val_ind) in enumerate(sf.split(x_train, y_train)):
    train, train_target = x_train.iloc[train_ind, :], y_train[train_ind]
    valid, valid_target = x_train.iloc[val_ind, :], y_train[val_ind]
    
    print(f'Fold{fold + 1} \n')
    
    
    model = textRNN(input_size, hidden_size, num_layers, num_classes, batch_size, model_type = 'LSTM')
    
    training(model, train, train_target, valid, valid_target, num_epochs, validate = True, learning_rate = 0.001)
                                    

    

Fold1 

Training...
Epoch 1  |  loss 0.6376305222511292  |  accuracy 0.6891776315789473 | 95
Epoch 2  |  loss 0.5305585265159607  |  accuracy 0.7728947368421052 | 95
Epoch 3  |  loss 0.4933263659477234  |  accuracy 0.7931907894736843 | 95
Validating...
validation loss 0.5136802792549133  |  accuracy 0.8141650682011935 

Fold2 

Training...
Epoch 1  |  loss 0.6294810771942139  |  accuracy 0.6954934210526317 | 95
Epoch 2  |  loss 0.549085259437561  |  accuracy 0.7626973684210526 | 95
Epoch 3  |  loss 0.4980575442314148  |  accuracy 0.7908881578947369 | 95
Validating...
validation loss 0.5315901637077332  |  accuracy 0.802949168797954 

Fold3 

Training...
Epoch 1  |  loss 0.6233391165733337  |  accuracy 0.7126973684210526 | 95
Epoch 2  |  loss 0.538622260093689  |  accuracy 0.765921052631579 | 95
Epoch 3  |  loss 0.49883612990379333  |  accuracy 0.7890789473684211 | 95
Validating...
validation loss 0.506109893321991  |  accuracy 0.816389599317988 

Fold4 

Training...
Epoch 1  |  loss 0.

In [22]:
#testing

test_lengths = test_data['glove_index'].apply(lambda x: len(x))
test_text = test_data['glove_index']
test_id = test_data['id']


test_dataset = TextDatset(test_text, test_lengths, test_id)
test_collate = Collator(percentile = 100, test = True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = 64, collate_fn = test_collate)



ids = []
out = []

model.eval()

for batch, (data, text_id) in enumerate(test_loader):
    test_out = model(data, msg = False)
    ids.extend(text_id)
    out.extend(test_out.max(dim = 1)[1].detach().numpy())
    

In [23]:
sub_file = pd.DataFrame({'id' : ids, 'target': out })
sub_file.to_csv('submission.csv', index = False)