In [1]:
%matplotlib inline
import torch.nn as nn
from random import randint
from collections import Counter
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F
import random, torch, numpy as np
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data.sampler import SubsetRandomSampler

In [2]:
def get_shape(w, h, f, p, s):
    '''
    Compute the spatial output size of a convolution with a square kernel.

    Floor division is used so the result matches what a convolution layer
    produces when the stride does not evenly divide the padded extent
    (the trailing partial window is dropped instead of yielding a
    fractional size).

    Args:
        w: Width of input
        h: Height of input
        f: Kernel width/height
        p: Padding
        s: Stride

    Returns:
        Tuple of the literal tag 'Conv Shape:' and a list [width, height]
        with the output size.
    '''
    out_w = (w - f + 2 * p) // s + 1
    out_h = (h - f + 2 * p) // s + 1
    return ('Conv Shape:', [out_w, out_h])

def get_seq_words(seq):
    '''
    Split a sequence string into its whitespace-separated words.

    Args:
        seq: String to split

    Returns:
        List of the words in `seq` (empty when `seq` has no words).
    '''
    tokens = seq.split()
    return tokens

def get_max_len(doc):
    '''
    Return the length of the longest item in `doc`.

    Args:
        doc: Iterable of sized items (for example a list of strings)

    Returns:
        The largest len(item) found, or 0 when `doc` is empty.
    '''
    return max((len(item) for item in doc), default=0)

def chunk_seq(seq, size=5):
    '''
    Cut a sequence into consecutive, non-overlapping chunks.

    Args:
        seq: Sliceable sequence (list, tuple, string, ...)
        size: Number of items per chunk; defaults to 5

    Returns:
        List of slices of `seq`, each `size` items long except possibly the
        last one, which holds whatever remains. Empty list for empty input.

    Raises:
        ValueError: If `size` is smaller than 1.
    '''
    if size < 1:
        raise ValueError('size must be a positive integer')
    return [seq[start:start + size] for start in range(0, len(seq), size)]

def get_label(seq):
    '''
    Tag every word of a sequence by the punctuation it contains.

    Args:
        seq: String of whitespace-separated words

    Returns:
        List with one integer per word -- 1 if the word contains a comma,
        2 if it contains a period (and no comma), 0 otherwise -- or None
        when the sequence has fewer than 5 words.
    '''
    tokens = seq.split()
    if len(tokens) < 5:
        return None

    def tag(word):
        # Comma takes precedence over period when a word holds both.
        if ',' in word:
            return 1
        return 2 if '.' in word else 0

    return [tag(word) for word in tokens]

In [3]:
# Read the whole corpus into memory as one string. The context manager closes
# the file handle as soon as the read finishes instead of leaving it open for
# the lifetime of the kernel.
with open('./data/processed/ted_data', 'r') as corpus_file:
    data = corpus_file.read()
len(data)

28218860

In [4]:
# Lowercase the corpus and split it on whitespace only, so punctuation stays
# attached to the word it touches.
words = data.lower().split()
len(words)

5030903

In [5]:
# Peek at the first five tokens to sanity-check the whitespace tokenisation.
print(words[:5])

['good', 'morning.', 'how', 'are', "you?(laughter)it's"]


In [6]:
# Distinct tokens of the corpus; the printed count is the vocabulary size.
# NOTE(review): set iteration order is not guaranteed to be stable across
# kernel restarts, so positions in `vocab` should not be persisted as word
# ids -- confirm before indexing with it.
vocab = tuple(set(words))
print(len(vocab))

181375


In [7]:
# NOTE: `words` is a list of strings, so get_max_len measures the longest
# single token in characters -- not the number of words in a sequence,
# despite the variable name.
max_seq_len = get_max_len(words)
print(max_seq_len)

84


In [8]:
# Cut the token stream into consecutive chunks and turn each chunk back into
# a single space-separated string.
x = chunk_seq(words)
sequences = list(map(' '.join, x))
print(len(sequences))

# Dump the sequences, one per line, to a scratch file.
with open('aaa', 'w') as f:
    for y in sequences:
        print(y, file=f)

1006181


In [9]:
# Build one label per full sequence from the punctuation on its third word:
# comma -> [1,0,0], period -> [0,1,0], neither -> [0,0,0].
# Sequences with fewer than 5 words get no label at all, so `labels` can end
# up shorter than `sequences`.
labels = []
for seq in sequences:
    seq = seq.split()
    if len(seq) >= 5:
        labels.append(
            [1,0,0] if ',' in seq[2]
            else [0,1,0] if '.' in seq[2]
            else [0,0,0]
        )

In [10]:
# Class balance of the labels. Each label list is stringified first because
# lists are unhashable and cannot be used as Counter keys.
Counter(str(lab) for lab in labels)

Counter({'[0, 0, 0]': 877643, '[0, 1, 0]': 57695, '[1, 0, 0]': 70842})

In [11]:
# Number of labelled sequences. Sequences shorter than 5 words were skipped
# when labelling, so this can be smaller than len(sequences).
print(len(labels))

1006180


In [22]:
# Only the first 80% of sequences/labels is used for training + validation;
# the remainder is left untouched.
train_data = sequences[:int(0.8*len(sequences))]
train_labels = labels[:int(0.8*len(labels))]
# NOTE(review): as written, the commented-out slices below would take the LAST
# 80% of the data and therefore overlap the training slice.
# test_data = sequences[-int(0.8*len(sequences)):]
# test_labels = labels[-int(0.8*len(labels)):]

# The validation set is the trailing 20% of the training slice. Its size must
# be computed before anything reports on it: previously the first print read
# `valid_data` before it was assigned, which raises NameError on a fresh kernel.
valid_data = int(0.2*len(train_data))
print('Number of training data: {}'.format(len(train_data)-valid_data))
print('Number of validation data: {}'.format(valid_data))

# Contiguous, unshuffled index ranges: training samples first, validation after.
train_indices = list(range(len(train_data)-valid_data))
valid_indices = list(range(len(train_data)-valid_data, len(train_data)))

# Dictionary to hold indices of training and validation data
partition = {}
partition['train'] = train_indices
partition['valid'] = valid_indices

# Sample index -> label for every sample of the training/validation slice.
train_valid_labels = {}
for i in range(len(train_data)):
    train_valid_labels[i] = labels[i]

# Unused alternative kept for reference: random split through samplers.
# num_train = len(train_data)
# indices = list(range(num_train))
# np.random.shuffle(indices)
# split = int(np.floor(0.2 * num_train))
# train_idx, valid_idx = indices[split:], indices[:split]

# # define samplers for obtaining training and validation batches
# train_sampler = SubsetRandomSampler(train_idx)
# valid_sampler = SubsetRandomSampler(valid_idx)

# # prepare data loaders (combine dataset and sampler)
# train_loader = torch.utils.data.DataLoader(train_data, batch_size=20,
#     sampler=train_sampler, num_workers=0)
# valid_loader = torch.utils.data.DataLoader(train_data, batch_size=20, 
#     sampler=valid_sampler, num_workers=0)
# test_loader = torch.utils.data.DataLoader(test_data, batch_size=20, 
#     num_workers=0)

# print('Number of Train Data/Labels: {}/{}'.format(len(train_data), len(train_labels)))
# print('Number of Validation Data/Labels: {}/{}'.format(len(valid_data), len(valid_labels)))
# print('Number of Test Data/Labels: {}/{}'.format(len(test_data), len(test_labels)))

Number of testing data: 643956
Number of validation data: 160988


In [29]:
# Spot-check the labels stored for the first 100 sample indices.
for x in range(100):
    print(train_valid_labels[x])

[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 1, 0]
[0, 1, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 1, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[1, 0, 0]
[1, 0, 0]
[0, 0, 0]
[0, 0, 0]
[1, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 1, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 1, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[1, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 1, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[1, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[1, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0, 0]


In [30]:
# Write the training sequences to a scratch file, one per line.
with open('awdad', 'w') as f:
    for x in train_data:
        print(x, file=f)

# Write the training labels to a second scratch file, one list repr per line.
with open('czfez', 'w') as f:
    for x in train_labels:
        print(x, file=f)

In [104]:
# define the CNN architecture
class Net(nn.Module):
    '''
    Three-stage convolutional classifier.

    Every stage is Conv2d (3x3 kernel, stride 1, no padding) -> ReLU ->
    Dropout -> 2x2 max pooling. The flattened result passes through a hidden
    linear layer and an output layer that emits one raw score per class
    (no softmax is applied here).

    Args:
        vocab_size: Number of rows of the embedding table.
        embed_dim: Width of each embedding vector.
        n_classes: Number of scores produced by the output layer.
        dropout: Dropout probability shared by all stages.
    '''
    def __init__(self, vocab_size=181375, embed_dim=64, n_classes=10, dropout=0.2):
        super(Net, self).__init__()
        # NOTE(review): the embedding table is created but forward() never
        # applies it -- confirm whether inputs are meant to be embedded first.
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.dropout = nn.Dropout(p=dropout)
        # convolutional layers: each one shrinks height and width by 2
        # (3x3 kernel, no padding); the pooling that follows halves them
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=128, kernel_size=3, stride=1, padding=0)
        self.conv2 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=0)
        self.conv3 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=0)
        # max pooling layer
        self.pool = nn.MaxPool2d(2, 2)

        # The hidden layer expects 512 channels * 1 * 3 = 1536 flattened
        # features, i.e. a 1x3 map after the third stage (a 22x38 input
        # yields exactly that). A 5x64 input would be reduced to 1x31 after
        # the first stage, which is too small for the next 3x3 kernel.
        self.full_1 = nn.Linear(512*1*3, 512)
        # NOTE(review): the default of 10 classes is kept for compatibility;
        # pass n_classes explicitly if the task has a different class count.
        self.output = nn.Linear(512, n_classes)

    def forward(self, x):
        '''
        Run the three conv stages and the two linear layers.

        Args:
            x: Float tensor with one input channel, laid out as
               (batch, 1, height, width), whose spatial size reduces to
               1x3 after the three stages.

        Returns:
            Tensor of raw class scores with one row per batch item.
        '''
        # add sequence of convolutional and max pooling layers
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(self.dropout(F.relu(conv(x))))
        # flatten everything except the batch dimension
        x = x.view(x.shape[0], -1)
        x = self.dropout(F.relu(self.full_1(x)))
        x = self.output(x)
        return x

# create a complete CNN
model = Net()
print(model)

# move tensors to GPU if CUDA is available; the result of the check is kept
# in `train_on_gpu` so the `if train_on_gpu:` tests in the training cell
# refer to a name that is actually defined in this notebook
train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    model.cuda()

Net(
  (embed): Embedding(181375, 64)
  (dropout): Dropout(p=0.2)
  (conv1): Conv2d(1, 128, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
  (conv3): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (full_1): Linear(in_features=1536, out_features=512, bias=True)
  (output): Linear(in_features=512, out_features=10, bias=True)
)


In [106]:
# Multi-class classification loss applied to the raw scores the model returns.
# NOTE(review): the labels built earlier in this notebook are 3-element lists
# while the model's output layer emits 10 scores per sample -- confirm the
# target format and class count match what CrossEntropyLoss expects.
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters with a learning rate of 0.01.
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
# number of epochs to train the model
n_epochs = 8 # you may increase this number to train a final model

# track change in validation loss (float('inf') rather than np.Inf, which
# NumPy 2.0 removed)
valid_loss_min = float('inf')

# Send each batch to the device that holds the model's parameters, so the
# loop does not depend on a separately maintained GPU flag.
device = next(model.parameters()).device

# NOTE(review): train_loader / valid_loader are not created by any active code
# visible in this notebook (only inside a commented-out block) -- they must be
# defined before this cell can run.

for epoch in range(1, n_epochs+1):

    # keep track of training and validation loss, and of how many samples
    # each loader actually produced
    train_loss = 0.0
    valid_loss = 0.0
    train_seen = 0
    valid_seen = 0

    ###################
    # train the model #
    ###################
    model.train()
    # `inputs`/`targets` are used instead of `data`/`target` so the loop does
    # not overwrite the notebook-level `data` variable holding the corpus text
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(inputs)
        # calculate the batch loss
        loss = criterion(output, targets)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update training loss (batch mean * batch size = batch sum)
        train_loss += loss.item()*inputs.size(0)
        train_seen += inputs.size(0)

    ######################
    # validate the model #
    ######################
    model.eval()
    # no gradients are needed for validation; skipping them saves memory
    with torch.no_grad():
        for inputs, targets in valid_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(inputs)
            # calculate the batch loss
            loss = criterion(output, targets)
            # update validation loss
            valid_loss += loss.item()*inputs.size(0)
            valid_seen += inputs.size(0)

    # calculate average losses over the samples actually seen; dividing by
    # len(loader.dataset) is wrong when a sampler yields only a subset of
    # the dataset (max(..., 1) guards against an empty loader)
    train_loss = train_loss/max(train_seen, 1)
    valid_loss = valid_loss/max(valid_seen, 1)

    # print training/validation statistics
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch, train_loss, valid_loss))

    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        # NOTE(review): the checkpoint name mentions CIFAR although this
        # notebook works on text -- consider renaming once callers are checked.
        torch.save(model.state_dict(), 'model_cifar.pt')
        valid_loss_min = valid_loss