### Creating Embedding Matrix

Sentence: The cat sat on the mat.

```python
embed = nn.Embedding(vocab_size, dimension_size) # Declaration of Embedding Matrix
cat_mat_embed = nn.Embedding(5, 2)  # lookup table: 5 rows, one 2-dimensional vector per index
# Embedding indices must be an integer tensor; a float tensor is rejected.
cat_tensor = torch.tensor([1], dtype=torch.long)
# Call the module itself rather than .forward() so registered hooks also run.
cat_mat_embed(cat_tensor)

> tensor([[ 1.7793, -0.3127]], grad_fn=<EmbeddingBackward>)
```

Note: pandas uses a C-based CSV parser by default, which may raise errors such as a UTF-8 decode error ("'utf-8' codec can't decode ..."). We can switch to the Python-based CSV parser by passing
engine='python' to pandas' read_csv.

In [11]:
import random

import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from nltk.corpus import stopwords
from torchtext import data

In [12]:
# English stop-word list from NLTK, stored as a set for constant-time membership tests.
stop_words = set(stopwords.words('english'))

### Data preparation

In [13]:
# LABEL processes the target class; TEXT processes the tweet text,
# tokenized with spaCy and lower-cased.
LABEL = data.LabelField()
TEXT = data.Field(tokenize='spacy', lower=True)

In [14]:
# Directory holding train.json, valid.json and test.json. The trailing slash is
# required because the file names are appended with plain string concatenation.
PATH = '/home/mayur/Desktop/Kaggle Notebooks/NLP-getting-started/data/'

In [15]:
# Maps each JSON key to (attribute name on the parsed example, Field that processes it):
# 'text' -> example.text via TEXT, 'target' -> example.label via LABEL.
fields = {'text': ('text', TEXT),'target': ('label', LABEL)}

In [16]:
# Training split, parsed from train.json with the text/target field mapping.
# NOTE: the name `train` is rebound to the training function further down,
# so this dataset has to be consumed before that cell is executed.
train = data.TabularDataset(
        path=PATH+"train.json",
        format="json",
        fields=fields,
        skip_header=False)

In [17]:
# Validation split, parsed from valid.json with the same text/target field mapping.
valid = data.TabularDataset(
        path=PATH+"valid.json",
        format="json",
        fields=fields,
        skip_header=False)

In [18]:
# The test set has no target; it carries an 'id' instead. ID is a Field with
# default settings, so the id ends up as a token list (e.g. ['7875']).
ID = data.Field()
test_fields = {'text': ('text', TEXT),'id': ('id', ID)}

In [19]:
# Test split, parsed from test.json with the text/id field mapping.
test = data.TabularDataset(
        path=PATH+"test.json",
        format="json",
        fields=test_fields,
        skip_header=False)

In [20]:
# Notebook inspection: show the attributes of one parsed test example as a dict.
vars(test.examples[2354])

{'text': ['ebola',
  'case',
  'possible',
  'home',
  'quarantined',
  'alabama',
  'officials'],
 'id': ['7875']}

In [21]:
# Build the token vocabulary from the training set only, capped at 15000 tokens.
# NOTE(review): the model below uses 15002 embedding rows; presumably the two
# extra rows are the field's special tokens (unknown / padding) - confirm.
TEXT.build_vocab(train,max_size = 15000)

# Build the label vocabulary (class name -> integer index) from the training set.
LABEL.build_vocab(train)


In [22]:
# is_available must be *called*: the bare function object is always truthy,
# which would select 'cuda' even on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [23]:
# Only the training and validation sets are bucketed by length here. The test
# iterator is built separately, so no placeholder dataset / zero batch size
# is passed in.
train_iter, val_iter = data.BucketIterator.splits(
    (train, valid),
    batch_sizes=(64, 128),
    sort_key=lambda x: len(x.text),
    device=device,
)

In [24]:
# TEXT's vocabulary has already been built from the training set. Rebuilding it
# on the test set here would replace that vocabulary (and drop the max_size
# cap), so only the ID field gets a vocabulary in this cell.
ID.build_vocab(test)
test_iter = data.Iterator(dataset=test, device=device, batch_size=128, sort=False)

### LSTM Model

In [45]:
class LSTM_NET(nn.Module):
    """Embedding -> single-layer LSTM -> linear head producing one raw logit per sequence."""

    def __init__(self, hidden_size, embedding_dim, vocab_size):
        """Create the three layers.

        Args:
            hidden_size: Size of the LSTM hidden state.
            embedding_dim: Size of each token embedding vector.
            vocab_size: Number of rows in the embedding table.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_size, num_layers=1)
        self.predictor = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, seq):
        """Return one logit per sequence.

        Args:
            seq: Integer token indices. The LSTM is not batch-first, so a
                batched input is laid out as (seq_len, batch).

        Returns:
            Tensor of shape (batch, 1) for a batched input; no activation is applied.
        """
        embedded = self.embedding(seq)
        # Only the final hidden state is used; per-step outputs and the cell state are dropped.
        _, (last_hidden, _) = self.encoder(embedded)
        # last_hidden has a leading num_layers dimension of size 1; remove it.
        return self.predictor(last_hidden.squeeze(0))
    
# 100 hidden units, 300-dimensional embeddings, 15002 embedding rows.
model = LSTM_NET(hidden_size=100, embedding_dim=300, vocab_size=15002)
# Move the parameters to the selected device; as the last expression of the
# cell this also displays the module summary.
model.to(device)

LSTM_NET(
  (embedding): Embedding(15002, 300)
  (encoder): LSTM(300, 100)
  (predictor): Linear(in_features=100, out_features=1, bias=True)
)

In [46]:
# Adam over all model parameters with a learning rate of 0.002.
optimizer = torch.optim.Adam(model.parameters(), lr = 0.002)
# BCEWithLogitsLoss applies the sigmoid itself, so the model has to output raw logits.
criterion = nn.BCEWithLogitsLoss()

In [47]:
def _mean_loss(model, criterion, batches, optimizer=None):
    """Return the per-example mean loss over ``batches``; update weights when ``optimizer`` is given."""
    total_loss = 0.0
    total_examples = 0
    for batch in batches:
        # Work on a float copy of the labels instead of overwriting batch.label.
        labels = batch.label.to(torch.float32)
        if optimizer is not None:
            optimizer.zero_grad()
        loss = criterion(model(batch.text).squeeze(1), labels)
        if optimizer is not None:
            loss.backward()
            optimizer.step()
        # Weight the batch loss by the number of examples in the batch. The
        # label tensor has one entry per example, whereas the first dimension
        # of batch.text need not be the batch dimension.
        n_examples = labels.size(0)
        total_loss += loss.item() * n_examples
        total_examples += n_examples
    return total_loss / total_examples if total_examples else 0.0


def train(epochs, model, optimizer, criterion, train_iter, val_iter):
    """Train ``model`` and print the mean losses on every 10th epoch.

    Args:
        epochs: Number of passes over ``train_iter``.
        model: Module called as ``model(batch.text)``; its output is squeezed
            on dim 1 before it is handed to ``criterion``.
        optimizer: Optimizer used to update the model after every training batch.
        criterion: Loss called as ``criterion(predictions, float_labels)``.
            It is assumed to return the batch mean (the default reduction),
            since batch losses are re-weighted by batch size.
        train_iter: Iterable of batches exposing ``.text`` and ``.label``.
        val_iter: Iterable of validation batches with the same attributes.

    Returns:
        None. Losses are reported on stdout as per-example means.
    """
    for epoch in range(1, epochs + 1):
        model.train()
        training_loss = _mean_loss(model, criterion, train_iter, optimizer)

        model.eval()
        # No gradients are needed for validation; skipping them saves memory and time.
        with torch.no_grad():
            validation_loss = _mean_loss(model, criterion, val_iter)

        if epoch % 10 == 0:
            print('Epoch: {}, Training Loss: {:.2f}, Validation Loss: {:.2f}'.format(epoch, training_loss, validation_loss))


In [48]:
# Run 100 epochs; train() prints the losses on every 10th epoch.
train(epochs=100, model=model, optimizer=optimizer, criterion=criterion, train_iter=train_iter, val_iter=val_iter)

In [49]:
# Persist only the learned parameters (the state_dict), not the model object itself.
torch.save(model.state_dict(), 'tweet_disaster.pt')

In [89]:
# Move the model to the CPU for single-tweet inference.
# NOTE(review): presumably TEXT.process returns CPU tensors by default - confirm.
model.to('cpu')
def classify_tweet(tweet):
    """Classify a single tweet string as "Negative" or "Positive".

    The model emits one raw logit per tweet (it is trained with
    BCEWithLogitsLoss), so the class is decided by the sign of that logit,
    which is equivalent to sigmoid(logit) > 0.5. Taking argmax over a single
    value would always yield index 0.

    Args:
        tweet: Raw tweet text.

    Returns:
        "Positive" if the logit is greater than zero, otherwise "Negative".
    """
    # NOTE(review): indices 0/1 follow LABEL's vocabulary order - confirm that
    # index 1 is the class meant by "Positive".
    categories = {0: "Negative", 1: "Positive"}
    processed = TEXT.process([TEXT.preprocess(tweet)])
    model.eval()
    with torch.no_grad():
        logit = model(processed)
    return categories[int(logit.item() > 0)]

### Data Augmentation

>**Random Insertion** - The random insertion technique looks at a sentence and then randomly inserts synonyms of existing non-stop-words into the sentence n times. Assuming you have a way of getting a synonym of a word and a way of eliminating stop-words (common words such as and, it, the, etc.) — represented, but not implemented, in this function by get_synonyms() and remove_stopwords() — an implementation of this would be as follows:

```python
def random_insertion(sentence, n):
    """Insert ``n`` synonyms of randomly chosen non-stop-words at random positions.

    Args:
        sentence: List of tokens. It is modified in place and also returned.
        n: Number of insertions to perform.

    Returns:
        The same list with the inserted words. If the sentence contains no
        non-stop-words there is nothing to draw synonyms from, and it is
        returned unchanged.

    Relies on remove_stopwords() and get_synonyms(), which are not
    implemented here; get_synonyms() is assumed to return a single word.
    """
    words = remove_stopwords(sentence)
    if not words:
        return sentence
    for _ in range(n):
        new_synonym = get_synonyms(random.choice(words))
        # len + 1 so the synonym can also be appended after the last token.
        sentence.insert(random.randrange(len(sentence) + 1), new_synonym)
    return sentence
         
```

    An example of this in practice, where a synonym of cat (feline) is one of the inserted words, could look like this:

    The cat sat on the mat
    The cat mat sat on feline the mat

>**Random Deletion** - As the name suggests, random deletion deletes words from a sentence. Given a probability parameter p, it will go through the sentence and decide whether to delete a word or not based on that random probability:

```python
 def random_deletion(words, p=0.5):
     """Randomly drop tokens from ``words``, each with probability ``p``.

     Args:
         words: List of tokens.
         p: Probability of deleting each individual token.

     Returns:
         ``words`` itself when it holds at most one token (nothing can be
         deleted); otherwise a new list with the surviving tokens. If every
         token was deleted, a list with one randomly chosen original token.
     """
     if len(words) <= 1:
         return words
     remaining = [word for word in words if random.uniform(0, 1) > p]
     if not remaining:
         # Never return an empty sentence: fall back to one random original token.
         return [random.choice(words)]
     return remaining
```
    The implementation deals with the edge cases—if there’s only one word, the technique returns it; and if we end up deleting all the words in the sentence, the technique samples a random word from the original set.

>**Random Swap** - The random swap augmentation takes a sentence and then swaps words within it n times, with each iteration working on the previously swapped sentence. Here’s an implementation:

```python
 def random_swap(sentence, n=5):
     """Swap two randomly chosen tokens of ``sentence``, ``n`` times in a row.

     Args:
         sentence: List of tokens. It is modified in place and also returned.
         n: Number of swaps; each swap works on the result of the previous one.

     Returns:
         The same list after the swaps. Sentences with fewer than two tokens
         are returned unchanged because there is no pair to swap.
     """
     if len(sentence) < 2:
         return sentence
     positions = range(len(sentence))
     for _ in range(n):
         # sample() picks two distinct positions, so every iteration is a real swap.
         idx1, idx2 = random.sample(positions, 2)
         sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1]
     return sentence

```
    We sample two distinct random indices within the length of the sentence, swap the words at those positions, and repeat this n times.

    The techniques in the EDA paper average about a 3% improvement in accuracy when used with small amounts of labeled examples (roughly 500). If you have more than 5,000 examples in your dataset, the paper suggests that this improvement may fall to 0.8% or lower, due to the model obtaining better generalization from the larger amounts of data available over the improvements that EDA can provide.
