In [1]:
import torch
from torchtext import data
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize='spacy',
                 tokenizer_language='en_core_web_sm',
                 include_lengths=True)
LABEL = data.LabelField(dtype=torch.long)

In [2]:
train_x_dir = "data/twitter_dataset/tweets_train_tokens.csv"

tweets_df = pd.read_csv(train_x_dir)
tweets_df['label'] = tweets_df['label'].astype('int')

tweets_df

Unnamed: 0,message,label
0,arirang simply kpop kim hyung jun cross ha yeo...,1
1,read politico article donald trump running mat...,1
2,type bazura project google image image photo d...,1
3,fast lerner subpoena tech guy work hillary pri...,1
4,sony reward app like lot female singer non ret...,0
...,...,...
49670,sleep think fuck jordan answer phone tomorrow ...,0
49671,yoga shannon tomorrow morning work day start u...,1
49672,bring dunkin iced coffee tomorrow hero,1
49673,currently holiday portugal come home tomorrow ...,1


In [3]:
tweets_df.dropna(inplace=True)
tweets_df.drop_duplicates(inplace=True)
tweets_df.loc[tweets_df['message'].str.contains('beyonce pearl')]

Unnamed: 0,message,label
9994,beyonce pearl jam ed sheeran coldplay play glo...,2
10770,rt kiss beyonce pearl bedd night wass wife,2


In [4]:
tweets_df['label'].value_counts(normalize=True)

1    0.447492
2    0.396289
0    0.156219
Name: label, dtype: float64

**Split data to train, test,validation**

In [5]:
X_train, X_test, y_train, y_test = train_test_split(tweets_df['message'], tweets_df['label'], test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=0)

train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)
valid_data = pd.concat([X_valid, y_valid], axis=1)

train_data['label'].value_counts()

1    15968
2    14042
0     5607
Name: label, dtype: int64

In [6]:
train_data.to_csv("train_tweets.csv", index=False)
test_data.to_csv("test_tweets.csv", index=False)
valid_data.to_csv("valid_tweets.csv", index=False)

## Load data as TabularDataset from Pytorch

In [7]:
train_data, valid_data, test_data = data.TabularDataset.splits(
                            path = 'data/twitter_dataset/split/',
                            train = 'train_tweets.csv',
                            test = 'test_tweets.csv',
                            validation='valid_tweets.csv',
                            format='csv',
                            skip_header=True,
                            fields=[('message', TEXT), ('label', LABEL)]
                        )

print(f'Length of training examples: {len(train_data)}')
print(f'Length of testing examples: {len(test_data)}')
print(f'Length of validation examples: {len(valid_data)}')

Length of training examples: 35617
Length of testing examples: 9894
Length of validation examples: 3958


In [8]:
print(vars(train_data.examples[0]))

{'message': ['low', 'key', 'want', 'drop', 'justin', 'school', 'tomorrow', 'bump', 'taylor', 'swift', 'embarrass'], 'label': '1'}


## Create BucketIterator for the data

In [9]:
MAX_VOCAB_SIZE = 40_000

TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE,
                 vectors="glove.6B.100d",
                 unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [10]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 29703
Unique tokens in LABEL vocabulary: 3


In [11]:
LABEL.vocab.stoi

defaultdict(None, {'1': 0, '2': 1, '0': 2})

In [12]:
print(TEXT.vocab.freqs.most_common(20))

[('tomorrow', 5749), ('day', 3357), ('night', 2372), ('friday', 2213), ('sunday', 2104), ('time', 2084), ('like', 2046), ('good', 2000), ('come', 1847), ('watch', 1701), ('saturday', 1652), ('game', 1577), ('new', 1458), ('monday', 1397), ('want', 1384), ('amp', 1379), ('think', 1340), ('know', 1328), ('today', 1201), ('play', 1156)]


In [13]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'tomorrow', 'day', 'night', 'friday', 'sunday', 'time', 'like', 'good']


In [14]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
                                    (train_data, valid_data, test_data),
                                    batch_size=BATCH_SIZE,
                                    device=device,
                                    sort_key = lambda x: len(x.message),
                                    sort_within_batch=True
                                )

## Build the Model

In [15]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes,
                output_dim, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.conv_0 = nn.Conv2d(in_channels = 1,
                               out_channels = n_filters,
                               kernel_size = (filter_sizes[0], embedding_dim))
        
        self.conv_1 = nn.Conv2d(in_channels = 1,
                               out_channels = n_filters,
                               kernel_size = (filter_sizes[1], embedding_dim))
        
        self.conv_2 = nn.Conv2d(in_channels = 1,
                               out_channels = n_filters,
                               kernel_size = (filter_sizes[2], embedding_dim))
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        # text = [batch size, sentence length]
        
        embedded = self.embedding(text)
        
        # embedded = [batch_sze, sentence_length, emb_dim]
        
        embeddded = embedded.unsqueeze(1)
        
        # embedded = [batch_size, 1, sentence_length, emb_dim]
        
        conved_0 = F.relu(self.conv_0(embedded).squeeze(3))
        conved_1 = F.relu(self.conv_1(embedded).squeeze(3))
        conved_2 = F.relu(self.conv_2(embedded).squeeze(3))
        
        # conved_n = [batch_size, n_filters, sentence_length - filter_sizes[n] + 1]
        
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        
        # pooled_n = [batch_size, n_filters]
        
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim = 1))
        
        # cat = [batch_size, n_filters * len(filter_sizes)]
        
        return self.fc(cat)

Currently the `CNN` model can only use 3 different-sized filters, but we can actually improve the code of our model to make it mode generic and take any number of filters.
<br>

We do this by placing all of convolutional layers in a `nn.ModuleList`, a function used to hold a list of PyTorch `nn.Module`s. If we simply used a standard Python list, the modules within the list cannot be "seen" by any modules outside the list which will cause some errors.
<br>

We can now pass an arbitray-sized list of filter sizes and the list comprehension will create a convolutional layer for each of them. Then, in the `forward` method we iterate through the list applying each convolutional layer to get a a list of convolutional outputs, which we also feed through the max pooling in a list comprehension before concatenating together and passing through the dropout and linear layers.

In [16]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, 
                output_dim, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1,
                                             out_channels = n_filters,
                                             kernel_size = (fs, embedding_dim)
                                             )
                                    for fs in filter_sizes
                                ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        # text = [batch_size, sentence_length]
        
        embedded = self.embedding(text)
        
        #embedded = [batch_size, sentence_length, emb_dim]
        
        embedded = embedded.unsqueeze(1)
        
        # embedded = [batch_size, 1, sentence_length, embed_dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        
        # conved_n = [batch_sizes, n_filters, sentence_length - filter_sizes[n] + 1]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
#         pooled_0 = F.max_pool1d(conved[0], conved[0].shape[2]).squeeze(2)
#         pooled_1 = F.max_pool1d(conved[1], conved[1].shape[2]).squeeze(2)
#         pooled_2 = F.max_pool1d(conved[2], conved[2].shape[2]).squeeze(2)
        
        # pooled_n = [batch_size, n_fiters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))
        
        # cat = [batch_size, n_filters * len(filter_sizes)]
        
        return self.fc(cat)

We can also implement the above model using 1-dimensional convolutional layers, where the embedding dimension is the "depth" of the filter and the number of tokens in the sentence is the width.
<br> 

We'll run our texts in this notebook using the 2-dimensional convolutional model, but leave the implementation for the 1-dimensional model below for anyone interested.


In [17]:
class CNN1d(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, 
                output_dim, dropout, pad_idx):
        
        super().__init__()
                
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels = embedding_dim,
                                             out_channels = n_filters,
                                             kernel_size = fs)
                                    for fs in filter_sizes
                                ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        # text = [batch_size, sentence_length]
        
        embedded = self.embedding(text)
        
        # embedded = [batch_size, sentence_length, embed_dim]
        
        embedded = embedded.permute(0, 2, 1)
        
        # embedded = [batch_size, embed_dim, sentence_length]
        
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        
        # conved_n = [batch_size, n_filters, sentence_length - filter_size[n] + 1]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        # pooled_n = [batch_size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))
        
        # cat = [batch_size, n_filters * len(filter_sizes)]
        
        return self.fc(cat)

We create an instance of our `CNN` class.
<br> 


In [18]:

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        # lengths need to be on CPU!
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden)

In [19]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 3
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)
print(model)

RNN(
  (embedding): Embedding(29703, 100, padding_idx=1)
  (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [20]:
# INPUT_DIM = len(TEXT.vocab)
# EMBEDDING_DIM = 100
# N_FILTERS = 100
# FILTER_SIZES = [3,4,5]
# OUTPUT_DIM = 3
# DROPOUT = 0.5
# PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
# print(model)

In [21]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 5,281,983 trainable parameters


Load the pre-trained embeddings

In [22]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [ 0.0085,  0.1802,  0.7703,  ...,  0.2993,  0.4085,  0.4761],
        ...,
        [-0.5832, -0.5807,  0.3504,  ...,  0.4043, -0.0192,  0.0945],
        [ 1.7948, -0.4856,  0.3013,  ..., -0.8426, -0.8726,  0.2949],
        [ 0.1527, -1.0775, -0.2867,  ..., -0.4466, -0.5326, -1.3527]])

Then zero the initial weights of the unknown and padding tokens.

In [23]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

## Train the Model

We first have to initialize the optimizer, loss function (criterion) and place the model and criterion on the GPU (if available)

In [24]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=1e-3)

criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

We define a function for training our model...

**Note**: As dropout is used, it is always advisable to use `model.train()` to ensure the dropout is "turned on" while training.

In [29]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    loop = tqdm(iterator)
    
    for batch in loop:
        
        optimizer.zero_grad()
        
        text, text_lengths = batch.message
        
        predictions = model.forward(text, text_lengths).squeeze(1)
#         print(predictions.argmax())
        loss = criterion(predictions, batch.label)
        
        acc = (predictions.argmax() == batch.label).sum()
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
        

We define a function for testing our model...

**Note**: Again, since dropout is used, `model.eval()` must be called to ensure the dropout is "turned off" while evaluating.

In [30]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        
        for batch in iterator:
            
            text, text_lengths = batch.message
            
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = (predictions.argmax() == batch.label).sum()
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

Let's define our function to tell us how long epochs take.

In [31]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    
    return elapsed_mins, elapsed_secs

Finally, we train our model...

In [32]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "models/cnn-model.pt")
        
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

100%|████████████████████████████████████████████████████████████████████████████████| 557/557 [02:27<00:00,  3.78it/s]


Epoch: 01 | Epoch Time: 2m 30s
	Train Loss: 0.880 | Train Acc: 19.21%
	 Val. Loss: 0.798 |  Val. Acc: 0.00%


100%|████████████████████████████████████████████████████████████████████████████████| 557/557 [02:28<00:00,  3.76it/s]


Epoch: 02 | Epoch Time: 2m 31s
	Train Loss: 0.798 | Train Acc: 48.47%
	 Val. Loss: 0.769 |  Val. Acc: 41.94%


100%|████████████████████████████████████████████████████████████████████████████████| 557/557 [02:24<00:00,  3.85it/s]


Epoch: 03 | Epoch Time: 2m 27s
	Train Loss: 0.748 | Train Acc: 53.14%
	 Val. Loss: 0.754 |  Val. Acc: 17.74%


100%|████████████████████████████████████████████████████████████████████████████████| 557/557 [02:35<00:00,  3.59it/s]


Epoch: 04 | Epoch Time: 2m 39s
	Train Loss: 0.709 | Train Acc: 26.39%
	 Val. Loss: 0.744 |  Val. Acc: 0.00%


100%|████████████████████████████████████████████████████████████████████████████████| 557/557 [02:42<00:00,  3.43it/s]


Epoch: 05 | Epoch Time: 2m 45s
	Train Loss: 0.670 | Train Acc: 26.75%
	 Val. Loss: 0.760 |  Val. Acc: 0.00%


In [34]:

model.load_state_dict(torch.load('models/cnn-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.757 | Test Acc: 29.03%
