# CNN for sentence classification
- References:
    - Yoon Kim's [paper link](https://arxiv.org/abs/1408.5882)
    - [Reference code](https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb) by bentrevett
    - Dataset [download](https://github.com/yoonkim/CNN_sentence)
- Implementation Points
    - OOV token initialization: 
        - random sampling by uniform distribution with variances of pre-trained word vectors
    - Static vs. Non-static (task-specific): 
        - freeze=True or False
    - Multiple channels (two channels): "each filter is applied to both channels and the results are **added** to calculate $c_i$"
    - Regularization: "we employ dropout on the penultimate layer with a constraint on $l_2$-norms of the wieght vectors"
        - the penultimate layer **with** a constraint on $l_2$-norms of the wieght vectors:
            - Add $l_2$ regularity from **torch.norm()** to loss function!
        - dropout means that the element-wise multiplcation operator using **a masking vector of Bernoulli random variables with prob. $p$**:
            - **nn.Dropout()**
            - "During training, randomly zeroes some of the elements of the input tensor with probability p using samples from a Bernoulli distribution"
    - Hyperparameters: 
        - relu function, filter windows of 3,4,5 with 100 feature maps each, dropout rate ($p$) of 0.5, $l_2$ constraint of 3 
        - mini-batch size of 50, Adadelta update rule, **dev set is 10% of the training set (Failed!)**

In [1]:
import os
import pandas as pd
import numpy as np
import random
import torch

In [2]:
def set_seed(seed= 1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

## Data preparation

### Data load
- Note that we should use **open** function instead of **pd.read_table()** when data size is big becuase pandas is slower!

In [3]:
# 데이터 준비 함수 (None -> df)
def load_data():
    neg_path = r"C:\Users\Simon\ongoing_projects\torch_study\1_CNN\1_reference_code\rt-polarity.neg.txt"
    neg_df = pd.read_table(neg_path, header=None, names=['X'], encoding='latin') #'ISO-8859-1' 의 alias
    neg_df['y'] = [0] * len(neg_df)

    pos_path = r"C:\Users\Simon\ongoing_projects\torch_study\1_CNN\1_reference_code\rt-polarity.pos.txt"
    pos_df = pd.read_table(pos_path, header=None, names=['X'], encoding='latin') #'ISO-8859-1' 의 alias
    pos_df['y'] = [1] * len(pos_df)

    data = pd.concat([neg_df, pos_df], axis=0)
    data.reset_index(drop=True, inplace=True)
    print("# of Loaded Data: {}".format(data.shape[0]))
    
    return data

In [4]:
data = load_data()

# of Loaded Data: 10662


### Preprocessing

In [5]:
import re
from nltk.corpus import stopwords  

In [6]:
# 텍스트 전처리 함수 (str -> lst)
def text_preprocessor(sent):
    stop = stopwords.words('english')

    # text preprocessing
    sent_str = sent.lower()
    sent_str = re.sub('[^a-z]', ' ', sent_str)  # Remove non-alphabetic strings
    sent_str = re.sub('  ', ' ', sent_str).strip()  # Remove double white spaces
    sent_lst = [word for word in sent_str.split() if word not in stop] # Remove stopwords
    return sent_lst

In [7]:
data['X_lst'] = data['X'].map(text_preprocessor)

### Data split

In [8]:
# 데이터 분리 함수 (df -> [df,df,df])
def data_split(df, train_frac= 0.8, val_frac= 0.2, seed=123):
    train_df = df.sample(frac= train_frac, random_state= seed)
    test_df = df.drop(train_df.index)

    val_df = train_df.sample(frac= val_frac, random_state= seed)
    train_df = train_df.drop(val_df.index)
    
    print("Train_df shape:", train_df.shape)
    print("Val_df shape:", val_df.shape)
    print("Test_df shape:", test_df.shape)

    return train_df, val_df, test_df

In [9]:
train_df, val_df, test_df = data_split(data)

Train_df shape: (6824, 3)
Val_df shape: (1706, 3)
Test_df shape: (2132, 3)


### Build Vocab

In [10]:
# Vocab 만들어주는 함수 ([df,df] -> [dict, dict])
def build_vocab(train_df, val_df):
    '''train과 valid 데이터만 사용 주의'''
    trainWords_lst = [w for w_lst in train_df['X_lst'] for w in w_lst]
    ValWords_lst = [w for w_lst in val_df['X_lst'] for w in w_lst]
    totalWords_lst = trainWords_lst + ValWords_lst

    words_lst = ['<pad>', '<unk>'] + sorted(list(set(totalWords_lst)))

    itos = {idx : word for idx, word in enumerate(words_lst)}
    stoi = {word : idx for idx, word in enumerate(words_lst)}

    print('length of word_set:', len(words_lst))
    return itos, stoi

In [11]:
itos, stoi = build_vocab(train_df, val_df)

length of word_set: 16278


### String to Index (Numericalization)

In [12]:
# 문장을 숫자로 표현해주는 함수 (lst -> lst)
def token_to_idx(token_lst, stoi):
    idx_lst = []
    for w in token_lst:
        if w in stoi:
            idx = stoi[w]
        else:
            idx = stoi['<unk>']
        idx_lst.append(idx)
    return idx_lst

In [13]:
train_df['X_idx'] = train_df['X_lst'].map(lambda x: token_to_idx(x, stoi))
val_df['X_idx'] = val_df['X_lst'].map(lambda x: token_to_idx(x, stoi))
test_df['X_idx'] = test_df['X_lst'].map(lambda x: token_to_idx(x, stoi))

### Padding

In [14]:
# zero padding을 통해서 시퀀스의 길이를 맞춰주는 함수 ([lst, int, dict] -> lst)
def zero_padding(idx_lst, max_len, stoi):
    pad_idx = stoi['<pad>']
    unk_idx = stoi['<unk>']
    
    idx_lst = idx_lst[:max_len]
    
    if len(idx_lst) == max_len:
        return idx_lst
    else:
        padding_len = max_len - len(idx_lst)
        padding_list = [pad_idx] * padding_len
        idx_lst = idx_lst + padding_list
        return idx_lst

In [15]:
train_df['X_pad'] = train_df['X_idx'].map(lambda x: zero_padding(x, 10, stoi))
val_df['X_pad'] = val_df['X_idx'].map(lambda x: zero_padding(x, 10, stoi))
test_df['X_pad'] = test_df['X_idx'].map(lambda x: zero_padding(x, 10, stoi))

### torch DataLoader

In [16]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, x, y):
        self.x = x 
        self.y = y
    
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        input_x = torch.LongTensor(self.x[idx])
        target_y = torch.FloatTensor([self.y[idx]]) # 왜 FloatTensor?,torch.FloatTensor([]) 이렇게 넣어줘야하는듯.
        return input_x, target_y

In [17]:
# 기존 idx를 제거하기 위해서 list() 사용?
trainset = CustomDataset(list(train_df['X_pad']), list(train_df['y']))
validset = CustomDataset(list(val_df['X_pad']), list(val_df['y']))
testset = CustomDataset(list(test_df['X_pad']), list(test_df['y']))

In [18]:
from torch.utils.data import DataLoader

batch_size = 50
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, drop_last=False)
validloader = DataLoader(validset, batch_size=batch_size, shuffle=True, drop_last=False)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=True, drop_last=False)

## CNN
- In pytorch, CNN wants the batch dimension first!
- in_channels: # of channels / out_channels: # of filters / kernel_size: filter size

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Build CNN model

In [20]:
import torch.nn as nn
import torch.nn.functional as F

In [84]:
class CNN(nn.Module):
    def __init__(self, 
                 pretrained_embeddings, embedding_dim, freeze, 
                 n_filters, filter_sizes, 
                 output_dim, dropout, pad_idx):
        
        super().__init__()

        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze= freeze, padding_idx= pad_idx)
        
        # Note List comprehension
        self.convs = nn.ModuleList([nn.Conv2d(in_channels= 1,  # 1 channel for TEXT
                                              out_channels= n_filters,
                                              kernel_size = (fs, embedding_dim))
                                    for fs in filter_sizes])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim) # Because of simply "CONCATENATE"!
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        # text = [batch_size, sent_len]
        embedded = self.embedding(text) 
        # embedded = [batch_size, sent_len, embed_dim]
        embedded = embedded.unsqueeze(1) # Insert 1 dimension to represent # of channels like images
        
        # embedded = [batch_size, 1, sent_len, embed_dim]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]  # WHY does Squeeze need here?
        
        # conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        # pooled_n = [batch_size, n_filters]
        cat = self.dropout(torch.cat(pooled, dim=1))
        
        # cat = [batch_size, n_filters * len(filter_sizes)]
        return self.fc(cat)

In [94]:
class Multichannel_CNN(nn.Module):
    def __init__(self, 
                 pretrained_embeddings, embedding_dim, freeze, 
                 n_filters, filter_sizes, 
                 output_dim, dropout, pad_idx):
        
        super().__init__()

        self.static_embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze= True, padding_idx= pad_idx)
        self.nonstatic_embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze= False, padding_idx= pad_idx)
        
        # Note List comprehension
        self.convs = nn.ModuleList([nn.Conv2d(in_channels= 1,  # 1 channel for TEXT
                                              out_channels= n_filters,
                                              kernel_size = (fs, embedding_dim))
                                    for fs in filter_sizes])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim) # Because of simply "CONCATENATE"!
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        # text = [batch_size, sent_len]
        static_embedded = self.static_embedding(text) 
        nonstatic_embedded = self.nonstatic_embedding(text) 
        # embedded = [batch_size, sent_len, embed_dim]
        static_embedded = static_embedded.unsqueeze(1) # Insert 1 dimension to represent # of channels like images
        nonstatic_embedded = nonstatic_embedded.unsqueeze(1)
        
        # embedded = [batch_size, 1, sent_len, embed_dim]
        # WHY does Squeeze need here?
        conved = [F.relu(conv(static_embedded)).squeeze(3) + F.relu(conv(nonstatic_embedded)).squeeze(3) for conv in self.convs]
        
        # conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        # pooled_n = [batch_size, n_filters]
        cat = self.dropout(torch.cat(pooled, dim=1))
        
        # cat = [batch_size, n_filters * len(filter_sizes)]
        return self.fc(cat)

### Load pre-trained embedding
- Load the pretrain Word2Vec model from Google [(Download HERE)](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit) 
- It might take time since it  contains 300-dimensional vectors for 3 million words and phrases

In [22]:
from gensim.models.keyedvectors import KeyedVectors

In [23]:
# Load pretrained_embeddings
path = r"C:\Users\Simon\ongoing_projects\SSRC_collaboration\source\GoogleNews-vectors-negative300.bin"
word_vectors = KeyedVectors.load_word2vec_format(path, binary=True)
var_arr = word_vectors.vectors.var(axis=0) # For OOV initialization, 300-dim vectors containing variances of each dimension

# Customize pretrained_embeddings 
pretrained_embeddings = np.zeros((len(stoi), 300)) # <pad>, <unk>은 여기서는 제외시켜야 하나??

for i, w in enumerate(list(stoi.keys())):
    try:
        pretrained_embeddings[i] = word_vectors[w]
    except KeyError:
        pretrained_embeddings[i] = np.random.uniform(low= var_arr, high= -var_arr)  # Random sampling from Uniform dist.

In [24]:
pretrained_embeddings.shape

(16278, 300)

In [95]:
# config
PRETRAINED_EMB = torch.from_numpy(pretrained_embeddings) # torch.tensor로 전환 주의
EMBEDDING_DIM = 300
FREEZE = True
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = stoi['<pad>']

In [96]:
# model = CNN(PRETRAINED_EMB, EMBEDDING_DIM, FREEZE, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
model = Multichannel_CNN(PRETRAINED_EMB, EMBEDDING_DIM, FREEZE, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
model = model.float()  # Reference: https://github.com/KimythAnly/AGAIN-VC/issues/2

In [97]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 5,244,001 trainable parameters


### Train the model

In [98]:
import torch.optim as optim

optimizer = optim.Adadelta(model.parameters())
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [99]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [100]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for x, y in iterator:
        x = x.to(device)
        y = y.to(device)
        
        optimizer.zero_grad()
        
        predictions = model(x)

        # l2-norm regularization
        # Reference: https://stackoverflow.com/questions/44641976/in-pytorch-how-to-add-l1-regularizer-to-activations
        all_fc_params = torch.cat([x.view(-1) for x in model.fc.parameters()])
        l2_regularization = torch.norm(all_fc_params, 3)
        
        cross_entropy_loss = criterion(predictions, y)
        loss = cross_entropy_loss + l2_regularization
        acc = binary_accuracy(predictions, y)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [101]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for x, y in iterator:
            x = x.to(device)
            y = y.to(device)
            
            predictions = model(x)

            # l2-norm regularization
            all_fc_params = torch.cat([x.view(-1) for x in model.fc.parameters()])
            l2_regularization = torch.norm(all_fc_params, 3)

            cross_entropy_loss = criterion(predictions, y)
            loss = cross_entropy_loss + l2_regularization
            acc = binary_accuracy(predictions, y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [102]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [103]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, trainloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, validloader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 6s
	Train Loss: 0.725 | Train Acc: 54.43%
	 Val. Loss: 0.721 |  Val. Acc: 48.17%
Epoch: 02 | Epoch Time: 0m 6s
	Train Loss: 0.706 | Train Acc: 54.69%
	 Val. Loss: 0.710 |  Val. Acc: 47.75%
Epoch: 03 | Epoch Time: 0m 6s
	Train Loss: 0.705 | Train Acc: 60.17%
	 Val. Loss: 0.704 |  Val. Acc: 71.83%
Epoch: 04 | Epoch Time: 0m 6s
	Train Loss: 0.694 | Train Acc: 68.62%
	 Val. Loss: 0.685 |  Val. Acc: 67.45%
Epoch: 05 | Epoch Time: 0m 6s
	Train Loss: 0.659 | Train Acc: 72.94%
	 Val. Loss: 0.638 |  Val. Acc: 74.80%


### Test model performance

In [104]:
model.load_state_dict(torch.load('tut4-model.pt'))
test_loss, test_acc = evaluate(model, testloader, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.652 | Test Acc: 73.72%
