In [1]:
import torch

In [4]:
import re
def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
    string = re.sub(r"\'s", " \'s", string) 
    string = re.sub(r"\'ve", " \'ve", string) 
    string = re.sub(r"n\'t", " n\'t", string) 
    string = re.sub(r"\'re", " \'re", string) 
    string = re.sub(r"\'d", " \'d", string) 
    string = re.sub(r"\'ll", " \'ll", string) 
    string = re.sub(r",", " , ", string) 
    string = re.sub(r"!", " ! ", string) 
    string = re.sub(r"\(", " \( ", string) 
    string = re.sub(r"\)", " \) ", string) 
    string = re.sub(r"\?", " \? ", string) 
    string = re.sub(r"\s{2,}", " ", string)    
    return string.strip() if TREC else string.strip().lower()

def read_data(path, label):
    ret = []
    with open(path, "r", encoding = "ISO-8859-1") as f:
        for line in f.readlines():
            ret.append([clean_str(line.replace("\n","")),label])
    return ret

In [17]:
def train_test_split(data, train_ratio = 0.9):
    import random
    import math
    import numpy
    _len = len(data)
    random.shuffle(data)
    train_data = np.array(data[:math.ceil(_len*train_ratio)])
    test_data = np.array(data[math.ceil(_len*train_ratio):])
    return train_data, test_data

In [18]:
DATA_PATHS = ["./datas/CNN_sentence/rt-polarity.neg", "./datas/CNN_sentence/rt-polarity.pos"]
NEGATIVE_DATAS = read_data(DATA_PATHS[0], 1)
POSITVIE_DATAS = read_data(DATA_PATHS[1], 0)
total_data = NEGATIVE_DATAS + POSITVIE_DATAS

train_data, test_data = train_test_split(total_data, 0.9)

## Make dataset

### 기능

#### vocabs
- vocab_dict
- itos
- stoi

#### dataset
- parameter
    - MAX_SEQ_LEN -> for padding
    - 
- in
    - dataset
- out
    - word_index, label

#### dataloader
- in
    - dataset
- out
    - batch input, batch label

In [241]:
class Vocabs:
    def __len__(self):
        return len(self.vocab_dict)
    def build_vocabs(self, sentence_list):
        from collections import defaultdict
        self.vocab_dict = defaultdict(lambda: '<UNK>')
        self.vocab_dict["<UNK>"] = 0
        self.vocab_dict["<PAD>"] = 1
        
        _index = 2
        for sentence in sentence_list:
            tokens_list = sentence.split(" ")
            for word in tokens_list:
                if word in self.vocab_dict:
                    pass
                else:
                    self.vocab_dict[word] = _index
                    _index += 1
        self.index_dict = {v:k for k, v in self.vocab_dict.items()}
    def stoi(self, sentence):
        if type(sentence) == str:
            return [self.vocab_dict[word] for word in sentence.split(" ")]
        elif type(sentence) == list:
            return [self.stoi(i) for i in sentence]

    def itos(self, indices):
        if type(indices[0]) == int :
            return " ".join([self.index_dict[index] for index in indices if self.index_dict[index] != '<PAD>'])
        elif type(indices) == list:
            return [self.itos(i) for i in indices]

In [242]:
text_vocabs = Vocabs()
text_vocabs.build_vocabs(train_data[:,0])

['it is risky , intelligent , romantic and rapturous from start to finish',
 'great story , bad idea for a movie']

In [107]:
test_x_values = text_vocabs.stoi(train_data[:10,0].tolist())
test_y_values = train_data[:10,1]
test_data = np.array([*zip(test_x_values, test_y_values)])

  test_data = np.array([*zip(test_x_values, test_y_values)])


In [86]:
text_vocabs.itos([[2, 3, 399, 642], [700, 10087, 155]])

[['it', 'is', 'very', 'good'], ['i', 'am', 'not']]

In [180]:
from torch.utils.data import Dataset, DataLoader

In [249]:
class CustomDataset(torch.utils.data.Dataset): 
    def __init__(self, np_data, max_seq_len):
        self.x_data = np_data[:,0]
        self.y_data = np_data[:,1].reshape(-1,1).astype(int)
        self.max_len = max_seq_len
        self.pad_num = 1
        super()
    def __len__(self):
        return len(self.x_data)
    def __getitem__(self, idx):
        x = self.data_cut_pad(self.x_data[idx])
        y = torch.FloatTensor(self.y_data[idx])
        return x, y
    def data_cut_pad(self, data):
        if len(data) >= self.max_len:
            data = data[:self.max_len]
        elif len(data) < self.max_len:
            data = data + [self.pad_num] * (self.max_len- len(data))
        return np.array(data)

In [250]:
test_dataset = CustomDataset(test_data, 20)

In [251]:
dataloader = DataLoader(test_dataset, batch_size=4, shuffle=True)

In [252]:
for i_batch, sample_batched in enumerate(dataloader):
    print(sample_batched)

[tensor([[  2,  58,  75,   5,  76,  77,  65,   2,  58,  78,  12,  60,  19,  79,
          80,  81,  82,  21,  83,   1],
        [ 21,  38,   3,  39,  40,  41,  42,  43,  21,  44,  45,  12,  46,  47,
          21,  48,   3,  49,  21,  50],
        [104, 105, 106, 107, 108, 109, 110,   8, 111, 112,  19, 113,   5, 114,
          19, 115, 116, 117,  82,  21],
        [  2,  58,  84,  85,   8,  85,  12,  86,  21,  87,  42,  88,   3,  89,
          90,  49,  91,   5,  92,   2]]), tensor([[1.],
        [1.],
        [1.],
        [1.]])]
[tensor([[ 2, 58, 59, 12, 60, 21, 61, 42, 62, 63, 19, 64,  5, 65, 66, 58, 67, 42,
         68, 69],
        [ 2,  3,  4,  5,  6,  5,  7,  8,  9, 10, 11, 12, 13,  1,  1,  1,  1,  1,
          1,  1],
        [14, 15,  5, 16, 17, 18, 19, 20,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1],
        [21, 20,  3, 22, 23, 24,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1]]), tensor([[1.],
        [0.],
        [1.],
        [0.]])]
[tenso

## Sample model

In [253]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.conv_0 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[0], embedding_dim))
        
        self.conv_1 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[1], embedding_dim))
        
        self.conv_2 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[2], embedding_dim))
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved_0 = F.relu(self.conv_0(embedded).squeeze(3))
        conved_1 = F.relu(self.conv_1(embedded).squeeze(3))
        conved_2 = F.relu(self.conv_2(embedded).squeeze(3))
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [254]:
INPUT_DIM = len(text_vocabs)
EMBEDDING_DIM = 300
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = 0
cnn_model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [255]:
for i_batch, sample_batched in enumerate(dataloader):
    print(sample_batched[0])

tensor([[ 21,  38,   3,  39,  40,  41,  42,  43,  21,  44,  45,  12,  46,  47,
          21,  48,   3,  49,  21,  50],
        [ 19,  94,   5,  95,   5,  96,  97,  98,  34,  19,  99, 100, 101,  58,
          21, 102,  34, 103,   1,   1],
        [ 14,  15,   5,  16,  17,  18,  19,  20,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1],
        [  2,   3,   4,   5,   6,   5,   7,   8,   9,  10,  11,  12,  13,   1,
           1,   1,   1,   1,   1,   1]])
tensor([[104, 105, 106, 107, 108, 109, 110,   8, 111, 112,  19, 113,   5, 114,
          19, 115, 116, 117,  82,  21],
        [  2,  58,  84,  85,   8,  85,  12,  86,  21,  87,  42,  88,   3,  89,
          90,  49,  91,   5,  92,   2],
        [ 25,  19,  26,   5,  27,  21,  28,   7,  29,   5,  25,  19,  30,  31,
          32,  21,  33,  34,  35,   8],
        [ 21,  20,   3,  22,  23,  24,   2,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1]])
tensor([[ 2, 58, 59, 12, 60, 21, 61, 42, 62, 6

In [258]:
for i_batch, sample_batched in enumerate(dataloader):
    train_x = sample_batched[0]
    train_y = sample_batched[1]
    print(cnn_model(sample_batched[0]))

tensor([[ 0.7783],
        [-0.0789],
        [-0.1006],
        [-0.3747]], grad_fn=<AddmmBackward>)
tensor([[-1.0443],
        [ 0.2030],
        [-0.0858],
        [ 0.3317]], grad_fn=<AddmmBackward>)
tensor([[ 0.1305],
        [-0.2868]], grad_fn=<AddmmBackward>)
