In [1]:
%matplotlib inline
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F
import random, torch, numpy as np
from random import randint
from string import punctuation
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

In [2]:
def get_shape(w, h, f, p, s):
    '''
    Args:
        w: Width of input
        h: Height of input
        f: Kernel width/height
        p: Padding
        s: Stride
    '''
    w = ((w-f+2*p)/s)+1
    h = ((h-f+2*p)/s)+1
    return ('Conv Shape:', [w, h])

def get_seq_words(seq):
    return seq.split()

def get_max_len(doc):
    max_len = 0
    for x in doc:
        if len(x) > max_len:
            max_len = len(x)
    return max_len

def chunk_seq(seq):
    chunked_seq = []
    for i in range(0, len(seq), 5):
        chunked_seq.append(seq[i:i+5])
    return chunked_seq

def get_label(seq):
    labels = []
    seq = seq.split()
    if len(seq) < 5:
        pass
    else:
        for word in seq:
            if ',' in word:
                labels.append(1)
            elif '.' in word:
                labels.append(2)
            else:
                labels.append(0)
        return labels  

In [3]:
data = open('./data/processed/ted_data', 'r', encoding='utf-8').read()
len(data)

28218860

In [4]:
data = data.lower()
data_split = data.split('\n')
all_data = ' '.join(data_split)
print(all_data[:10])

good morni


In [5]:
words = all_data.split()
words[:10]

['good',
 'morning.',
 'how',
 'are',
 "you?(laughter)it's",
 'been',
 'great,',
 "hasn't",
 'it?',
 "i've"]

In [6]:
x = chunk_seq(words)
sequences = [' '.join(seq) for seq in x]

with open('processed_data', 'w', encoding='utf-8') as f:
    for seq in sequences:
        f.write(seq+'\n')

In [7]:
labels = []
for seq in sequences:
    seq = seq.split()
    if ',' in seq[2]:
        labels.append('<comma>')
    elif '.' in seq[2]:
        labels.append('<period>')
    else:
        labels.append('<na>')
        
with open('labels', 'w', encoding='utf-8') as f:
    for label in labels:
        f.write(label+'\n')

In [8]:
# Check number of sequences and labels
print('Number of sequences: \t{}'.format(len(sequences)))
print('Number of labels: \t{}'.format(len(labels)))

Number of sequences: 	1006181
Number of labels: 	1006181


In [9]:
Counter(label for label in labels)

Counter({'<na>': 877643, '<period>': 57696, '<comma>': 70842})

In [10]:
# Build vocab, 
words_in_vocab = Counter(words)
vocab = sorted(words_in_vocab, key=words_in_vocab.get, reverse=True)

# Skip most common word
vocab_to_int = {word: index for index, word in enumerate(vocab, 1)}

In [11]:
# Tokenize sequences
seq_int = []
for seq in sequences:
    seq_int.append([vocab_to_int[word] for word in seq.split()])

In [12]:
print('Number of unique words: {}'.format(len(vocab_to_int)))
print('Check tokenized sequences: \n', seq_int[:5])

Number of unique words: 181375
Check tokenized sequences: 
 [[136, 3505, 51, 16, 75870], [80, 2023, 1890, 684, 151], [80, 5526, 324, 47, 1], [219, 571, 7, 232, 71], [75871, 18, 80, 153, 8970]]


In [13]:
# Encode labels
encoded_labels = []
for label in labels:
    if label == '<comma>':
        encoded_labels.append(1)
    elif label == '<period>':
        encoded_labels.append(2)
    else:
        encoded_labels.append(0)
encoded_labels = np.array(encoded_labels)
print(encoded_labels[20:36])

[2 2 0 0 0 0 0 0 0 2 0 0 0 1 1 0]


In [14]:
# Check for outliers
seq_len = Counter([len(seq) for seq in seq_int])
print("Zero-length reviews: {}".format(seq_len[0]))
print("Maximum review length: {}".format(max(seq_len)))

# One sequence with length 3
print('Sequence lengths: ', seq_len)

Zero-length reviews: 0
Maximum review length: 5
Sequence lengths:  Counter({5: 1006180, 3: 1})


In [15]:
# Pad sequences to 5 or sequence length, post padding
features = np.zeros((len(seq_int), 5), dtype=int)

for i, row in enumerate(seq_int):
    features[i, :len(row)] = np.array(row)[:5]

# Check that all sequences at at length 5
assert len(features)==len(seq_int)
assert len(features[0])==5

In [16]:
print(features[-1])

[   169 181375  14894      0      0]


In [17]:
train_test_split_frac = 0.8
split_index = int(0.8*len(features))

# Split data into training, validation, and test data (features and labels, x and y)
train_x, left_over_x = features[:split_index], features[split_index:]
train_y, left_over_y = encoded_labels[:split_index], encoded_labels[split_index:]

val_test_index = int(0.5*len(left_over_x))
print('Validation/Test amount: \t{}'.format(val_test_index))

val_x, test_x = left_over_x[:val_test_index], left_over_x[val_test_index:]
val_y, test_y = left_over_y[:val_test_index], left_over_y[val_test_index:]

## print out the shapes of your resultant feature data
print('Training Dataset: \t{}'.format(train_x.shape), train_y.shape)
print('Validation Dataset: \t{}'.format(val_x.shape), val_y.shape)
print('Testing Dataset: \t{}'.format(test_x.shape), test_y.shape)

Validation/Test amount: 	100618
Training Dataset: 	(804944, 5) (804944,)
Validation Dataset: 	(100618, 5) (100618,)
Testing Dataset: 	(100619, 5) (100619,)


In [18]:
# Create dataloaders
batch_size = 128

train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
val_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [19]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([128, 5])
Sample input: 
 tensor([[ 68155,      4,    119,  11340,    119],
        [ 39571,   1082,     17,  18749,      9],
        [     5,    255,     40,   1738,      2],
        [    20,     19,     26,    100,     28],
        [   456,      1,    831,      4,    262],
        [    30,  72078,    292,   2207,      2],
        [   263,      5,    122,      4,    513],
        [    28,    173,      3,     67,    366],
        [     2,    789,    140,      7,      5],
        [    10,     78,    142,     44,   8057],
        [     1,  24395,      4,      1,  86044],
        [   125,      1,    658,   1947,     24],
        [    57,     32,  16102,      8,   3892],
        [   165,   1532,     27,    308,      8],
        [113416,     23,      8,     26,    415],
        [   656,  22408,      2,     73,   6295],
        [    77,    189,   1671,  10089,     36],
        [    39,     12,     23,     92,      6],
        [    73,   1187,      1,   6299,   

In [20]:
# Check if GPU is available
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Device to train on:', device)

Device to train on: cuda:0


In [32]:
# Define the CNN architecture
class Net(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim):
        super(Net, self).__init__()
        
        self.output_size = output_size
        
        # Embedding Layer
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)        
        
        # Convolution and Maxpool Layers
        # Sees 181376 * 64 * 1 = 2902016
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=50, kernel_size=(3,embedding_dim), stride=1, padding=0)
        # See 90688 * 32 * 16 = 46432256
        self.conv2 = nn.Conv2d(in_channels=1, out_channels=50, kernel_size=(3,embedding_dim), stride=1, padding=0)
#         # 45344 * 64 * 64 = 742916096
#         self.conv3 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, stride=1, padding=0)
        # max pooling layer
        self.pool = nn.MaxPool1d(2)
        
        # Dropout Layer
        self.dropout = nn.Dropout(p=0.2)
        
        # 2048 > 256
        # Fully connected and output Layers
        self.full_1 = nn.Linear(1, 3)
#         self.output = nn.Linear(128, output_size)                

    def forward(self, x):
        print('x shape:',x.shape)
        x = self.embedding(x)
        # add sequence of convolutional and max pooling layers
        x = self.pool(self.dropout(F.relu(self.conv1(x))))
        print('Conv_1:', x.shape)
        x = self.pool(self.dropout(F.relu(self.conv2(x))))
        print('Conv_2:', x.shape)
        x = self.pool(self.dropout(F.relu(self.conv3(x))))
        print('Conv_3:', x.shape)
        x = x.view(x.shape[0], -1)
        x = self.dropout(F.relu(self.full_1(x)))
        x = self.output(x)
        return x

In [33]:
# Initialize CNN
vocab_size = len(vocab_to_int)+1
output_size = 3
embedding_dim = 64

model = Net(vocab_size, output_size, embedding_dim)
print(model)

# move tensors to GPU if CUDA is available
if torch.cuda.is_available():
    model.to(device)

Net(
  (embedding): Embedding(181376, 64)
  (conv1): Conv2d(1, 50, kernel_size=(3, 64), stride=(1, 1))
  (conv2): Conv2d(1, 50, kernel_size=(3, 64), stride=(1, 1))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.2)
  (full_1): Linear(in_features=1, out_features=3, bias=True)
)


In [35]:
criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=0.01)

In [36]:
# number of epochs to train the model
n_epochs = 8 # you may increase this number to train a final model

valid_loss_min = np.Inf # track change in validation loss

for epoch in range(1, n_epochs+1):

    # keep track of training and validation loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model.train()
    for data, target in train_loader:
        # move tensors to GPU if CUDA is available
        if torch.cuda.is_available():
            data, target = data.to(device), target.to(device)
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the batch loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update training loss
        train_loss += loss.item()*data.size(0)
        
    ######################    
    # validate the model #
    ######################
    model.eval()
    for data, target in valid_loader:
        # move tensors to GPU if CUDA is available
        if torch.cuda.is_available():
            data, target = data.to(device), target.to(device)
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the batch loss
        loss = criterion(output, target)
        # update average validation loss 
        valid_loss += loss.item()*data.size(0)
    
    # calculate average losses
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(valid_loader.dataset)
        
    # print training/validation statistics 
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch, train_loss, valid_loss))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'model_cifar.pt')
        valid_loss_min = valid_loss

x shape: torch.Size([128, 5])


RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got CUDAIntTensor instead (while checking arguments for embedding)