## Preprocessing

In [1]:
import re
def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
    string = re.sub(r"\'s", " \'s", string) 
    string = re.sub(r"\'ve", " \'ve", string) 
    string = re.sub(r"n\'t", " n\'t", string) 
    string = re.sub(r"\'re", " \'re", string) 
    string = re.sub(r"\'d", " \'d", string) 
    string = re.sub(r"\'ll", " \'ll", string) 
    string = re.sub(r",", " , ", string) 
    string = re.sub(r"!", " ! ", string) 
    string = re.sub(r"\(", " \( ", string) 
    string = re.sub(r"\)", " \) ", string) 
    string = re.sub(r"\?", " \? ", string) 
    string = re.sub(r"\s{2,}", " ", string)    
    return string.strip() if TREC else string.strip().lower()

## Tokenizer

In [2]:
def tokenize(sentence):
    return sentence.split(" ")

## Preparing Data

As in the previous notebooks, we'll prepare the data. 

Unlike the previous notebook with the FastText model, we no longer explicitly need to create the bi-grams and append them to the end of the sentence.

As convolutional layers expect the batch dimension to be first we can tell TorchText to return the data already permuted using the `batch_first = True` argument on the field.

In [3]:
def read_data(path, label):
    ret = []
    with open(path, "r", encoding = "ISO-8859-1") as f:
        for line in f.readlines():
            ret.append([clean_str(line.replace("\n","")),label])
    return ret

In [4]:
def train_test_split(data, train_ratio = 0.9):
    import random
    import math
    _len = len(data)
    random.shuffle(data)
    train_data = data[:math.ceil(_len*train_ratio)]
    test_data = data[math.ceil(_len*train_ratio):]
    return train_data, test_data

In [5]:
DATA_PATHS = ["./datas/rt-polarity.neg", "./datas/rt-polarity.pos"]
NEGATIVE_DATAS = read_data(DATA_PATHS[0], 1)
POSITVIE_DATAS = read_data(DATA_PATHS[1], 0)

In [6]:
total_datas = NEGATIVE_DATAS + POSITVIE_DATAS
train_data, test_data = train_test_split(total_datas, 0.9)

## Field 생성

In [7]:
import torch
from torchtext import data
from torchtext import datasets
from torchtext.data import Example, Dataset
import random
import numpy as np

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# custom_tokenizer의 구성
def custom_tokenizer(text):
    return [token for token in text.split(" ")]

TEXT = data.Field(tokenize = custom_tokenizer, batch_first = True)
LABEL = data.LabelField(dtype = torch.float, preprocessing = lambda x: float(x))



## Dataset 생성

#### dataset, DataLoader의 경우 make_torch_dataset에 정리

In [8]:
def make_dataset(datas, fields):
    examples = []
    examples += [Example.fromlist(items, fields) for items in datas]
    custom_dataset = Dataset(examples, fields)
    return custom_dataset

In [9]:
fields=[('text', TEXT), ('label', LABEL)]
train_dataset = make_dataset(train_data, fields=fields)
test_dataset = make_dataset(test_data, fields=fields) 



In [10]:
train_dataset, valid_dataset = train_dataset.split(random_state = random.seed(SEED))

In [11]:
TEXT.build_vocab(train_dataset)
LABEL.build_vocab(train_dataset)

In [12]:
BATCH_SIZE = 128
device = "cpu"
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_dataset, valid_dataset, test_dataset), 
    batch_size = BATCH_SIZE, 
    sort_key = lambda x: len(x.text),
    device = device)



## K-fold 10 mean 생성

In [4]:
folds = []
for i in range(10):
    SEED = np.random.randint(10000)
    train_dataset, valid_dataset = train_dataset.split(random_state = random.seed(SEED))
    BATCH_SIZE = 128
    device = "cpu"
    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        (train_dataset, valid_dataset, test_dataset), 
        batch_size = BATCH_SIZE, 
        sort_key = lambda x: len(x.text),
        device = device)
    folds.append([trian_iterator, valid_iterator, test_iterator])

## Build Model

In [13]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.conv_0 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[0], embedding_dim))
        
        self.conv_1 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[1], embedding_dim))
        
        self.conv_2 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[2], embedding_dim))
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved_0 = F.relu(self.conv_0(embedded).squeeze(3))
        conved_1 = F.relu(self.conv_1(embedded).squeeze(3))
        conved_2 = F.relu(self.conv_2(embedded).squeeze(3))
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [14]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
                
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [15]:
class CNN1d(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels = embedding_dim, 
                                              out_channels = n_filters, 
                                              kernel_size = fs)
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.permute(0, 2, 1)
        
        #embedded = [batch size, emb dim, sent len]
        
        conved = [F.relu(conv(embedded)) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))
        
        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [16]:
class CNN1d_Multichannel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        self.embedding_static = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.embedding_static.weight.requires_grad = False
        self.embedding_nonstatic = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels = embedding_dim, 
                                              out_channels = n_filters, 
                                              kernel_size = fs)
                                    for fs in filter_sizes
                                    ])
        
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [batch size, sent len]
        
        static_embedded = self.embedding_static(text)
        non_static_embedded = self.embedding_nonstatic(text)
        #embedded = [batch size, sent len, emb dim]
        
        static_embedded = static_embedded.permute(0, 2, 1)
        non_static_embedded = non_static_embedded.permute(0, 2, 1)
        #embedded = [batch size, emb dim, sent len]
        
        conved = [F.relu(conv(static_embedded)) + F.relu(conv(non_static_embedded)) for conv in self.convs]
        
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))
        
        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

## make CNN model 

In [17]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

cnn1d_model = CNN1d(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

## Multichannel model

In [23]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

multi_model = CNN1d_Multichannel(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [24]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The CNN1D model has {count_parameters(cnn1d_model):,} trainable parameters')
print(f'The CNN1D-Multichannel model has {count_parameters(multi_model):,} trainable parameters')

The CNN1D model has 4,888,801 trainable parameters
The CNN1D-Multichannel model has 4,888,801 trainable parameters


## READ W2V vectors

In [19]:
from gensim.models import KeyedVectors

# Load pretrained model (since intermediate data is not included, the model cannot be refined with additional data)
w2v_model = KeyedVectors.load_word2vec_format('./datas/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [23]:
from tqdm import notebook
W2V_SIZE = 300
word2vec_vectors = []
lower = -(np.std(w2v_model.vectors, axis=0)**2)
upper = lower*-1
for token, idx in notebook.tqdm(TEXT.vocab.stoi.items()):
    if token in w2v_model.wv.vocab.keys():
        word2vec_vectors.append(torch.FloatTensor(w2v_model[token]))
    else:
        word2vec_vectors.append(torch.FloatTensor(np.random.uniform(lower, upper)))

HBox(children=(IntProgress(value=0, max=14991), HTML(value='')))




  import sys


In [27]:
a = torch.stack(word2vec_vectors)

In [None]:
a.shape

In [28]:
TEXT.vocab.set_vectors(TEXT.vocab.stoi, word2vec_vectors, W2V_SIZE)
pretrained_embeddings = torch.FloatTensor(TEXT.vocab.vectors)
model.embedding.weight.data.copy_(pretrained_embeddings)

torch.Size([14991, 300])

### w2v 임베딩 테스트

In [53]:
test_string = "good"
test_string_index = TEXT.vocab.stoi[test_string]
print(f'Test String : {test_string} \nTest String : {test_string_index}')

Test String : good 
Test String : 54


In [72]:
original_vector = w2v_model[test_string]
torch_model_vector = model.embedding(torch.tensor([test_string_index]))[0]
print(f'before Appply\n original : {original_vector[:3]} \n torch_vector : {torch_model_vector[:3]}')
## Apply w2v
TEXT.vocab.set_vectors(TEXT.vocab.stoi, word2vec_vectors, W2V_SIZE)
pretrained_embeddings = torch.FloatTensor(TEXT.vocab.vectors)
model.embedding.weight.data.copy_(pretrained_embeddings)
torch_model_vector = model.embedding(torch.tensor([test_string_index]))[0]
print(f'before Appply\n original : {original_vector[:3]} \n torch_vector : {torch_model_vector[:3]}')

before Appply
 original : [ 0.04052734  0.0625     -0.01745605] 
 torch_vector : tensor([-1.6922,  0.2445,  0.6354], grad_fn=<SliceBackward>)
before Appply
 original : [ 0.04052734  0.0625     -0.01745605] 
 torch_vector : tensor([ 0.0405,  0.0625, -0.0175], grad_fn=<SliceBackward>)


### pretrain vector setting

In [31]:
model = "multimodel"
if model == "multimodel":
    model = multi_model
    TEXT.vocab.set_vectors(TEXT.vocab.stoi, word2vec_vectors, W2V_SIZE)
    pretrained_embeddings = torch.FloatTensor(TEXT.vocab.vectors)
    model.embedding_static.weight.data.copy_(pretrained_embeddings)
    model.embedding_nonstatic.weight.data.copy_(pretrained_embeddings)
    # Set Unknown & Pad token vector
    UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
    model.embedding_static.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
    model.embedding_nonstatic.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
elif model == "single":
    model = cnn1d_model
    TEXT.vocab.set_vectors(TEXT.vocab.stoi, word2vec_vectors, W2V_SIZE)
    pretrained_embeddings = torch.FloatTensor(TEXT.vocab.vectors)
    model.embedding.weight.data.copy_(pretrained_embeddings)
    # Set Unknown & Pad token vector
    UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
    model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
    model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

## Train the Model

In [32]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
# optimizer = optim.Adadelta(params, lr=1.0, rho=0.9, eps=1e-06, weight_decay=0)
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [33]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [34]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad() #gradient 초기화
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward() # backpropagation
        
        optimizer.step() # step check
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [35]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval() # 모델 로드
    
    with torch.no_grad(): # test시에만 작동
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

Let's define our function to tell us how long epochs take.

In [36]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

Finally, we train our model...

In [37]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')



Epoch: 01 | Epoch Time: 0m 10s
	Train Loss: 0.580 | Train Acc: 69.78%
	 Val. Loss: 0.482 |  Val. Acc: 77.09%
Epoch: 02 | Epoch Time: 0m 10s
	Train Loss: 0.389 | Train Acc: 83.10%
	 Val. Loss: 0.445 |  Val. Acc: 78.14%
Epoch: 03 | Epoch Time: 0m 11s
	Train Loss: 0.266 | Train Acc: 90.04%
	 Val. Loss: 0.454 |  Val. Acc: 78.78%
Epoch: 04 | Epoch Time: 0m 10s
	Train Loss: 0.165 | Train Acc: 94.48%
	 Val. Loss: 0.505 |  Val. Acc: 77.87%
Epoch: 05 | Epoch Time: 0m 13s
	Train Loss: 0.092 | Train Acc: 97.60%
	 Val. Loss: 0.544 |  Val. Acc: 78.35%
Epoch: 06 | Epoch Time: 0m 13s
	Train Loss: 0.048 | Train Acc: 99.17%
	 Val. Loss: 0.581 |  Val. Acc: 78.79%
Epoch: 07 | Epoch Time: 0m 12s
	Train Loss: 0.029 | Train Acc: 99.62%
	 Val. Loss: 0.662 |  Val. Acc: 77.49%
Epoch: 08 | Epoch Time: 0m 11s
	Train Loss: 0.017 | Train Acc: 99.85%
	 Val. Loss: 0.706 |  Val. Acc: 77.39%
Epoch: 09 | Epoch Time: 0m 13s
	Train Loss: 0.011 | Train Acc: 99.96%
	 Val. Loss: 0.727 |  Val. Acc: 77.80%
Epoch: 10 | Epoch T

We get test results comparable to the previous 2 models!

In [82]:
model.load_state_dict(torch.load('tut4-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.418 | Test Acc: 80.35%


## User Input

And again, as a sanity check we can check some input sentences

**Note**: As mentioned in the implementation details, the input sentence has to be at least as long as the largest filter height used. We modify our `predict_sentiment` function to also accept a minimum length argument. If the tokenized input sentence is less than `min_len` tokens, we append padding tokens (`<pad>`) to make it `min_len` tokens.

In [85]:
def predict_sentiment(model, sentence, min_len = 5):
    model.eval()
    tokenized = [tok for tok in sentence.split(" ")]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

An example negative review...

In [95]:
predict_sentiment(model, "This film is not ")

0.6700533032417297

An example positive review...

In [89]:
predict_sentiment(model, "This film is good best")

0.06672604382038116

## embedding is trained?

In [99]:
test_string = "good"
test_string_index = TEXT.vocab.stoi[test_string]
print(f'Test String : {test_string} \nTest String : {test_string_index}')

Test String : good 
Test String : 54


In [101]:
w2v_model['good'][:3]

array([ 0.04052734,  0.0625    , -0.01745605], dtype=float32)

In [104]:
model.embedding(torch.tensor([54]))[0][:3]

tensor([ 0.0034,  0.0289, -0.0004], grad_fn=<SliceBackward>)

### MISSSING

- UNK 의 경우 분산을 구하려면 dataloader로 구성해야할 것 같음..

- L2 norm with dropout layer
- layer weight normalize