In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from torchtext import data
from torchtext import datasets
import random
import torch.optim as optim
import pandas as pd

SEED = 1234
torch.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
tag = 'sentiment'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [3]:
def load_dataset(tag):
    if tag == 'irrelevant':
        train_df = pd.read_csv('data/irrelevant_train.tsv', sep='\t')
        valid_df = pd.read_csv('data/irrelevant_test.tsv', sep='\t')
    elif tag == 'sentiment':
        train_df = pd.read_csv('data/sentiment_train.tsv', sep='\t')
        valid_df = pd.read_csv('data/sentiment_test.tsv', sep='\t')
    else:
        train_df = None
        valid_df = None
    
    return train_df, valid_df

train_df, valid_df = load_dataset(tag)

In [6]:
## SPACY에서 제공하는 TOKENIZER를 사용
TEXT = torchtext.legacy.data.Field(tokenize='spacy')
LABEL = torchtext.legacy.data.LabelField()

# id␞sentence␞sentiment
fields = [(None, None),(None, None), ('sentence', TEXT),('sentiment', LABEL)]

#loading custom dataset
training_data = torchtext.legacy.data.TabularDataset(
    path = 'data/'+tag+'_train.tsv',
    format = 'tsv',
    fields = fields,
    skip_header = True
)

valid_data = torchtext.legacy.data.TabularDataset(
    path = 'data/'+tag+'_test.tsv',
    format = 'tsv',
    fields = fields,
    skip_header = True
)

# print preprocessed text
print(vars(training_data.examples[0]))
print(vars(valid_data.examples[0]))

{'sentence': ['UK', 'is', 'also', 'extending', 'to', 'Hong', 'Kong', 'the', 'arms', 'embargo', 'that', 'is', 'in', 'force', 'on', 'mainland', 'China', 'since', '1989', '.'], 'sentiment': '0'}
{'sentence': ['The', 'United', 'States', 'and', 'NATO', 'countries', 'often', 'send', 'aircraft', 'and', 'drones', 'to', 'perform', 'reconnaissance', 'activities', 'along', 'Russia', '’s', 'borders', 'in', 'the', 'Baltic', ',', 'in', 'the', 'Black', 'Sea', 'off', 'Crimea', ',', 'and', 'Krasnodar', '.'], 'sentiment': '0'}


In [7]:
train_data, test_data = training_data.split(split_ratio=0.9, random_state = random.seed(SEED))

In [8]:
# 여러개 다양한 Glove pre-trained vector를 활용할 수 있다.
# vectors = "glove.42B.300d",
TEXT.build_vocab(train_data, vectors = "glove.6B.100d", unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

print("Size of TEXT vocabulary:",len(TEXT.vocab))
print("Size of LABEL vocabulary:",len(LABEL.vocab))

.vector_cache\glove.6B.zip: 862MB [06:14, 2.30MB/s]                                
100%|█████████▉| 399999/400000 [00:13<00:00, 28860.06it/s]


Size of TEXT vocabulary: 55820
Size of LABEL vocabulary: 2


In [11]:
BATCH_SIZE = 64

train_iterator, test_iterator, valid_iterator = torchtext.legacy.data.BucketIterator.splits(
    (train_data, test_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.sentence),
    sort_within_batch=True,
    device = device)

In [12]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        text = text.permute(1, 0)        # [batch size, sent len]        
        embedded = self.embedding(text)  # [batch size, sent len, emb dim]
        embedded = embedded.unsqueeze(1) # [batch size, 1, sent len, emb dim]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs] # [batch size, n_filters, sent len - filter_sizes[n]]        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved] # [batch size, n_filters]
        cat = self.dropout(torch.cat(pooled, dim = 1)) # [batch size, n_filters * len(filter_sizes)]

        return self.fc(cat)

In [13]:
# 대부분의 hyperparameter를 최적으로 찾는 것이 중요하다
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100 # Glove Vector의 dimension과 일치하여야 한다
N_FILTERS = 100
FILTER_SIZES = [2,3,4]
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.4
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [14]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 5,672,902 trainable parameters


In [15]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 1.2521,  0.5977,  0.2078,  ..., -0.9296, -0.5892, -0.3702],
        [-0.4668,  0.7340, -1.2766,  ..., -0.5542, -0.6220, -2.8315],
        [ 0.2032, -1.6553,  0.2869,  ...,  1.4848, -0.0791, -0.5520]])

In [16]:
# Then zero the initial weights of the unknown and padding tokens.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [17]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [18]:
# 학습 중간 batch별 accuracy를 측정하기 위한 helper function
def categorical_accuracy(preds, y):

    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    
    return correct.sum().detach().cpu().numpy() / y.shape[0]

In [19]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:        
        optimizer.zero_grad()
        predictions = model(batch.sentence)
        loss = criterion(predictions, batch.sentiment)
        acc = categorical_accuracy(predictions, batch.sentiment)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [20]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.sentence)
            loss = criterion(predictions, batch.sentiment)
            acc = categorical_accuracy(predictions, batch.sentiment)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [21]:
EPOCHS = 100
best_valid_loss = float('inf')

for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, test_iterator, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model/'+tag+'_CNN_model.pt')

    print(f'Epoch [{epoch + 1}/{EPOCHS}]: Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

Epoch [1/100]: Train Loss: nan, Train Acc: 60.73% | Val. Loss: nan, Val. Acc: 60.93%
Epoch [2/100]: Train Loss: nan, Train Acc: 60.88% | Val. Loss: nan, Val. Acc: 60.93%
Epoch [3/100]: Train Loss: nan, Train Acc: 60.83% | Val. Loss: nan, Val. Acc: 60.93%
Epoch [4/100]: Train Loss: nan, Train Acc: 60.85% | Val. Loss: nan, Val. Acc: 60.93%
Epoch [5/100]: Train Loss: nan, Train Acc: 60.83% | Val. Loss: nan, Val. Acc: 60.93%
Epoch [6/100]: Train Loss: nan, Train Acc: 60.85% | Val. Loss: nan, Val. Acc: 60.93%
Epoch [7/100]: Train Loss: nan, Train Acc: 60.85% | Val. Loss: nan, Val. Acc: 60.93%
Epoch [8/100]: Train Loss: nan, Train Acc: 60.85% | Val. Loss: nan, Val. Acc: 60.93%
Epoch [9/100]: Train Loss: nan, Train Acc: 60.88% | Val. Loss: nan, Val. Acc: 60.93%
Epoch [10/100]: Train Loss: nan, Train Acc: 60.80% | Val. Loss: nan, Val. Acc: 60.93%
Epoch [11/100]: Train Loss: nan, Train Acc: 60.88% | Val. Loss: nan, Val. Acc: 60.93%
Epoch [12/100]: Train Loss: nan, Train Acc: 60.88% | Val. Loss:

### Evaluation & Inference

In [None]:
model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
model.load_state_dict(torch.load('model/'+tag+'_CNN_model.pt'))
model.to(device)

CNN(
  (embedding): Embedding(56262, 100)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(2, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
  )
  (fc): Linear(in_features=300, out_features=2, bias=True)
  (dropout): Dropout(p=0.4, inplace=False)
)

In [None]:
result = evaluate(model, valid_iterator, criterion)
print(result)

(0.6190095656159995, 0.6658740942028986)





### Get Probabilities

In [20]:
import spacy
nlp = spacy.load('en')

def predict_class(sentence, model, min_len = 4):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    preds = model(tensor)
    softmax_prob = torch.nn.functional.softmax(preds, dim=1)
    max_preds = preds.argmax(dim = 1)
    return softmax_prob, max_preds

In [21]:
# Example
prob, pred_class = predict_class("How many minutes are in six hundred and eighteen hours?", model)
print(prob, pred_class)

tensor([[0.4571, 0.5429]], grad_fn=<SoftmaxBackward>) tensor([1])


In [22]:
from tqdm import tqdm

def save_results(df, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        for idx, row in tqdm(df.iterrows()):
            news_id = row['id']
            text = row['sentence'].replace('\t',' ')
            sentiment = row['sentiment']
            class_prob, pred = predict_class(text, model)

            class_prob = [str(x) for x in class_prob.detach().cpu().numpy()[0]]
            pred = pred.detach().cpu().numpy()[0]

            result = str(news_id).replace('\t','')+'\t'+text+'\t'+'\t'.join(class_prob)+'\t'+str(pred)+'\t'+str(int(sentiment)).replace('\t','')
            
            f.write(result+'\n')

In [23]:
save_results(train_df, 'data/'+tag+'_cnn_prediction_train.csv')

41109it [00:42, 961.78it/s] 


In [24]:
save_results(valid_df, 'data/'+tag+'_cnn_prediction_test.csv')

17628it [00:18, 941.22it/s]
