In [2]:
import torch
from torchtext import data
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize='spacy',
                 tokenizer_language='en_core_web_sm')
LABEL = data.LabelField()

In [3]:
train_x_dir = "data/twitter_dataset/tweets_train_tokens.csv"

tweets_df = pd.read_csv(train_x_dir)
tweets_df['label'] = tweets_df['label'].astype('int')

tweets_df

Unnamed: 0,message,label
0,arirang simply kpop kim hyung jun cross ha yeo...,1
1,read politico article donald trump running mat...,1
2,type bazura project google image image photo d...,1
3,fast lerner subpoena tech guy work hillary pri...,1
4,sony reward app like lot female singer non ret...,0
...,...,...
49670,sleep think fuck jordan answer phone tomorrow ...,0
49671,yoga shannon tomorrow morning work day start u...,1
49672,bring dunkin iced coffee tomorrow hero,1
49673,currently holiday portugal come home tomorrow ...,1


In [4]:
tweets_df.dropna(inplace=True)
tweets_df.drop_duplicates(inplace=True)
tweets_df.loc[tweets_df['message'].str.contains('beyonce pearl')]

Unnamed: 0,message,label
9994,beyonce pearl jam ed sheeran coldplay play glo...,2
10770,rt kiss beyonce pearl bedd night wass wife,2


In [5]:
tweets_df['label'].value_counts(normalize=True)

1    0.447492
2    0.396289
0    0.156219
Name: label, dtype: float64

In [6]:
from imblearn.over_sampling import RandomOverSampler

oversampler = RandomOverSampler()
X_train_balanced, y_train_balanced = oversampler.fit_resample(np.array(tweets_df['message']).reshape(-1, 1),
                                                             np.array(tweets_df['label']).reshape(-1, 1))
tweets_df_balanced = pd.DataFrame(list(zip([x[0] for x in X_train_balanced], y_train_balanced)), columns=['message', 'label'])
tweets_df_balanced

Unnamed: 0,message,label
0,arirang simply kpop kim hyung jun cross ha yeo...,1
1,read politico article donald trump running mat...,1
2,type bazura project google image image photo d...,1
3,fast lerner subpoena tech guy work hillary pri...,1
4,sony reward app like lot female singer non ret...,0
...,...,...
66406,meet friday shakedown nicki probably,2
66407,omfgomfgomfgomfg black dahlia murder machine h...,2
66408,buy twilight bc lose old copy uncool lame enjo...,2
66409,season longmire premiere tomorrow september ne...,2


In [11]:
X_train, X_test, y_train, y_test = train_test_split(tweets_df_balanced['message'], tweets_df_balanced['label'], test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=0)

train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)
valid_data = pd.concat([X_valid, y_valid], axis=1)

train_data['label'].value_counts()
# X_train.shape, X_test.shape, X_valid.shape

1    15970
0    15951
2    15894
Name: label, dtype: int64

In [12]:
train_data.to_csv('data/twitter_dataset/split/train_tweets.csv', index=False)
test_data.to_csv('data/twitter_dataset/split/test_tweets.csv', index=False)
valid_data.to_csv('data/twitter_dataset/split/valid_tweets.csv', index=False)

In [13]:
train_data, valid_data, test_data = data.TabularDataset.splits(
                path = 'data/twitter_dataset/split/',
                train = 'train_tweets.csv',
                test = 'test_tweets.csv',
                validation = 'valid_tweets.csv',
                format='csv',
                skip_header=True,
                fields = [('message', TEXT), ('label', LABEL)]
            )

print(f'Length of training examples: {len(train_data)}')
print(f'Length of testing examples: {len(test_data)}')
print(f'Length of validation examples: {len(valid_data)}')

Length of training examples: 47815
Length of testing examples: 13283
Length of validation examples: 5313


In [14]:
print(vars(train_data.examples[0]))

{'message': ['fall', 'sick', 'day', 'eid', 'miss', 'fun', 'joy', 'eid', 'sad'], 'label': '0'}


The following builds the vocabulary, only keeping the most common max_size tokens.

In [15]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

Why do we only build the vocabulary on the training set? When testing any machine learning system you do not want to look at the test set in any way. We do not include the validation set as we want it to reflect the test set as much as possible.

In [16]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 3


In [17]:
LABEL.vocab.stoi

defaultdict(None, {'1': 0, '0': 1, '2': 2})

Why is the vocab size 25002 and not 25000? One of the addition tokens is the `<unk>` token and the other is a `<pad>` token.
<br>

When we feed sentences into our model, we feed a *batch* of them at a time, i.e. more than one at a time, and all sentences in the batch need to be the same size. Thus, to ensure each sentence in the batch is the same size, any shorter than the longest within the batch are padded.

We can also view the most common words in the vocabulary and their frequencies.



In [18]:
print(TEXT.vocab.freqs.most_common(20))

[('tomorrow', 7570), ('day', 4353), ('like', 3079), ('night', 2987), ('friday', 2796), ('time', 2762), ('sunday', 2677), ('good', 2466), ('come', 2406), ('watch', 2192), ('saturday', 2089), ('game', 1973), ('amp', 1911), ('want', 1911), ('think', 1880), ('monday', 1824), ('know', 1815), ('new', 1791), ('today', 1654), ('play', 1501)]


We can also see the vocabulary directly using either the `stoi` (string to int) or `itos` (int to string) method.

In [19]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'tomorrow', 'day', 'like', 'night', 'friday', 'time', 'sunday', 'good']


We can also check the labels, ensuring 0 is for negative and 1 is for positive.



In [20]:
print(LABEL.vocab.stoi)

defaultdict(None, {'1': 0, '0': 1, '2': 2})


The final step of preparing the data is creating the iterators. We iterate over these in the training/evaluation loop, and they return a batch of examples (indexed and converted into tensors) at each iteration.
<br>

We'll use a `BucketIterator` which is a special type of iterator that will return a batch of examples where each example is of a similar length, minimizing the amount of padding per example.
<br>

We also want to place the tensors returned by the iterator on the GPU (if you're using one). PyTorch handles this using `torch.device`, we then pass this device to the iterator.

In [34]:
BATCH_SIZE = 96

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
                            (train_data, valid_data, test_data),
                            batch_size=BATCH_SIZE,
                            device=device,
                            sort_key = lambda x: len(x.message),
                            sort_within_batch=True
                        )

## Build the Model

The next stage is building the model that we'll eventually train and evaluate.



In [35]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        self.rnn = nn.GRU(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        self.softmax = nn.Softmax()
        
    def forward(self, text):
        
        # text = [sent_len, batch_size]
        
        embedded = self.embedding(text)
        # embedded = [sent_lence, batch_size, emb_dim]
        
        output, hidden = self.rnn(embedded)
        # output = [sent_len, batch_size, hidden_dim]
        # hidden = [1, batch_size, hidden_dim]
        
        assert torch.equal(output[-1,:, :], hidden.squeeze(0))

#         output = self.fc(hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))
        

We now create an instance of our RNN class.
<br>

The input dimension is the dimension of the one-hot vectors, which is equal to the vocabulary size.
<br>

The embedding dimension is the size of the dense word vectors. This is usually around 50-250 dimensions, but depends on the size of the vocabulary.
<br>

The hidden dimension is the size of the hidden states. This is usually around 100-500 dimensions, but also depends on factors such as on the vocabulary size, the size of the dense vectors and the complexity of the task.
<br>

The output dimension is usually the number of classes, however in the case of only 2 classes the output value is between 0 and 1 and thus can be 1-dimensional, i.e. a single scalar real number.

In [36]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 3

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

Let's also create a function that will tell us how many trainable parameters our model has so we can compare the number of parameters across different models.



In [37]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters.')

The model has 2,775,915 trainable parameters.


## Train the Model

Now we'll set up the training and then train the model.
<br>

First, we'll create an optimizer. This is the algorithm we use to update the parameters of the module. Here, we'll use *stochastic gradient descent* (SGD). The first argument is the parameters will be updated by the optimizer, the second is the learning rate, i.e. how much we'll change the parameters by when we do a parameter update.

In [38]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr = 1e-3)

Next, we'll define our loss function. In PyTorch this is commonly called a criterion.
<br>

The loss function here is *cross entropy loss*.
<br>

Our model currently outputs an unbound real number. As our labels are either 0, 1, or 2, we want to restrict the predictions to a number between 0, 1, and 2. We do this using the *softmax* function.
<br>

We then use this this bound scalar to calculate the loss using cross entropy.
<br>

The `CrossEntropyLoss` criterion carries out both the softmax and the cross entropy steps.

In [39]:
criterion = nn.CrossEntropyLoss()

Using `.to`, we can place the model and the criterion on the GPU (if we have one).



In [40]:
model = model.to(device)
criterion = criterion.to(device)

Our criterion function calculates the loss, however we have to write our function to calculate the accuracy.

In [41]:
def multiclass_accuracy(y_pred, y_true):
    
    """
    Function to calculate accuracy
    -> param y_true: list of true values
    -> param y_pred: list of predicted values
    -> return: accuracy score
    
    """
    
    # Intitializing variable to store count of correctly predicted classes
    correct_predictions = 0
#     print(torch.round(torch.argmax(torch.softmax(y_pred))))
#     print(y_pred)
#     print(torch.round(torch.sigmoid(y_pred)))
    for yt, yp in zip(y_true, y_pred):
        yp = torch.argmax(yp)
        if yt == yp:
            
            correct_predictions += 1
    
    #returns accuracy
    return correct_predictions / len(y_true)

**Define Train Function** <br>
The train function iterates over all examples, one batch at a time.



In [42]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    loop = tqdm(iterator)
    for batch in loop:
        
        optimizer.zero_grad()
        
        predictions = model(batch.message).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = multiclass_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += (predictions.argmax(1) == batch.label).sum().item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

`evaluate` is similar to `train`, with a few modifications as you don't want to update the parameters when evaluating.

In [43]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
#     loop = tqdm(iterator)

    with torch.no_grad():
        
        for batch in iterator:
            
            predictions = model(batch.message).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = multiclass_accuracy(predictions, batch.label)
            
            epoch_loss += loss.item()
            epoch_acc += acc
            
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

We'll also create a function to tell us how long an epoch takes to compare training times between models.



In [44]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    
    return elapsed_mins, elapsed_secs

We then train the model through multiple epochs, an epoch being a complete pass through all examples in the training and validation sets.
<br>

At each epoch, if the validation loss is the best we have seen so far, we'll save the parameters of the model and then after training has finished we'll use that model on the test set.`

In [45]:
N_EPOCHS = 20

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'models/rnn-model.pt')
        
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:26<00:00, 18.73it/s]


Epoch: 01 | Epoch Time: 0m 27s
	Train Loss: 1.102 | Train Acc: 3325.45%
	 Val. Loss: 1.101 |  Val. Acc: 34.25%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:22<00:00, 22.15it/s]


Epoch: 02 | Epoch Time: 0m 23s
	Train Loss: 1.099 | Train Acc: 3397.60%
	 Val. Loss: 1.097 |  Val. Acc: 35.85%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:22<00:00, 22.10it/s]


Epoch: 03 | Epoch Time: 0m 23s
	Train Loss: 1.097 | Train Acc: 3449.70%
	 Val. Loss: 1.094 |  Val. Acc: 37.00%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:23<00:00, 21.64it/s]


Epoch: 04 | Epoch Time: 0m 23s
	Train Loss: 1.095 | Train Acc: 3491.98%
	 Val. Loss: 1.092 |  Val. Acc: 37.52%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:23<00:00, 21.57it/s]


Epoch: 05 | Epoch Time: 0m 23s
	Train Loss: 1.094 | Train Acc: 3541.88%
	 Val. Loss: 1.090 |  Val. Acc: 37.74%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:22<00:00, 21.80it/s]


Epoch: 06 | Epoch Time: 0m 23s
	Train Loss: 1.092 | Train Acc: 3587.17%
	 Val. Loss: 1.088 |  Val. Acc: 38.49%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:23<00:00, 21.37it/s]


Epoch: 07 | Epoch Time: 0m 24s
	Train Loss: 1.090 | Train Acc: 3638.28%
	 Val. Loss: 1.086 |  Val. Acc: 38.93%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:23<00:00, 21.35it/s]


Epoch: 08 | Epoch Time: 0m 24s
	Train Loss: 1.089 | Train Acc: 3660.32%
	 Val. Loss: 1.084 |  Val. Acc: 39.42%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:22<00:00, 21.76it/s]


Epoch: 09 | Epoch Time: 0m 23s
	Train Loss: 1.088 | Train Acc: 3694.19%
	 Val. Loss: 1.083 |  Val. Acc: 39.96%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:26<00:00, 19.14it/s]


Epoch: 10 | Epoch Time: 0m 27s
	Train Loss: 1.087 | Train Acc: 3709.62%
	 Val. Loss: 1.081 |  Val. Acc: 40.14%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:27<00:00, 18.45it/s]


Epoch: 11 | Epoch Time: 0m 28s
	Train Loss: 1.086 | Train Acc: 3734.67%
	 Val. Loss: 1.080 |  Val. Acc: 40.22%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:26<00:00, 19.16it/s]


Epoch: 12 | Epoch Time: 0m 27s
	Train Loss: 1.085 | Train Acc: 3743.49%
	 Val. Loss: 1.079 |  Val. Acc: 40.42%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:25<00:00, 19.66it/s]


Epoch: 13 | Epoch Time: 0m 26s
	Train Loss: 1.084 | Train Acc: 3760.92%
	 Val. Loss: 1.078 |  Val. Acc: 40.74%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:22<00:00, 22.46it/s]


Epoch: 14 | Epoch Time: 0m 23s
	Train Loss: 1.083 | Train Acc: 3785.77%
	 Val. Loss: 1.077 |  Val. Acc: 40.76%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:22<00:00, 22.52it/s]


Epoch: 15 | Epoch Time: 0m 22s
	Train Loss: 1.083 | Train Acc: 3802.61%
	 Val. Loss: 1.076 |  Val. Acc: 40.89%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:22<00:00, 22.03it/s]


Epoch: 16 | Epoch Time: 0m 23s
	Train Loss: 1.081 | Train Acc: 3815.63%
	 Val. Loss: 1.075 |  Val. Acc: 40.87%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:22<00:00, 22.38it/s]


Epoch: 17 | Epoch Time: 0m 23s
	Train Loss: 1.081 | Train Acc: 3828.86%
	 Val. Loss: 1.074 |  Val. Acc: 40.89%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:22<00:00, 22.20it/s]


Epoch: 18 | Epoch Time: 0m 23s
	Train Loss: 1.080 | Train Acc: 3840.68%
	 Val. Loss: 1.073 |  Val. Acc: 41.04%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:23<00:00, 21.65it/s]


Epoch: 19 | Epoch Time: 0m 23s
	Train Loss: 1.079 | Train Acc: 3857.72%
	 Val. Loss: 1.073 |  Val. Acc: 41.05%


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [00:22<00:00, 21.97it/s]


Epoch: 20 | Epoch Time: 0m 23s
	Train Loss: 1.079 | Train Acc: 3871.74%
	 Val. Loss: 1.072 |  Val. Acc: 41.17%


You may have noticed the loss is not really decreasing and the accuracy is poor. This is due to several issues with the model which will be improved later on.
<br>

Finally, the metric we actually care about, the test loss and accuracy, which we get from our parameters that gave us the best validation loss.

In [46]:
model.load_state_dict(torch.load('models/rnn-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 1.074 | Test Acc: 40.20%
