In [35]:
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
import spacy
import torch
import torch.nn as nn
import torch.optim as optim

# One-time setup for the model loaded below: python -m spacy download en_core_web_sm
spacy_en = spacy.load("en_core_web_sm")

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


def tokenize(text):
    """Split ``text`` into a list of token strings with the spaCy tokenizer.

    A plain whitespace split (``text.split()``) is a lighter-weight alternative.
    """
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    return tokens

# Text column: tokenized with `tokenize`, lowercased, and mapped through a vocabulary.
quote = Field(
    sequential=True,
    use_vocab=True,
    tokenize=tokenize,
    lower=True,
)
# Label column: a single non-sequential value that is used without a vocabulary.
score = Field(
    sequential=False,
    use_vocab=False,
    lower=False,
)
# Source column name -> (batch attribute name, Field); read back as batch.q and batch.s.
fields = {
    'quote': ('q', quote),
    'score': ('s', score),
}

In [37]:
# The splits can be read from either JSON or CSV files. Load exactly one format
# instead of reading the JSON files and immediately overwriting the result with
# the CSV files (which cost an extra read and required both file sets to exist).
DATA_FORMAT = "csv"  # "csv" or "json"; "csv" is what the original cell ended up using

train_data, test_data = TabularDataset.splits(
    path="sample_data",
    train=f"train.{DATA_FORMAT}",
    test=f"test.{DATA_FORMAT}",
    # validation=f"validation.{DATA_FORMAT}",  # optional validation split
    format=DATA_FORMAT,
    fields=fields,
)


In [38]:
# Inspect the first training example: its attribute names, then their values.
first_example = vars(train_data[0])
print(first_example.keys())
print(first_example.values())

dict_keys(['q', 's'])
dict_values([['you', 'must', 'own', 'everything', 'in', 'your', 'world', '.', 'there', 'is', 'no', 'one', 'else', 'to', 'blame', '.'], '1'])


In [51]:
# Build the vocabulary from the training split and attach pretrained GloVe vectors
# (the pretrained embedding data is about 1GB).
quote.build_vocab(
    train_data,
    max_size=100000,
    min_freq=1,
    vectors='glove.6B.100d',
)

# One iterator per split, yielding batches of two examples on the selected device.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=2,
    device=device,
)

In [30]:
# Show every training batch: the token-index tensor first, then the labels.
# Index 1 in the token tensor is padding.
for batch in train_iterator:
    print(batch.q, batch.s, sep="\n")

tensor([[33, 26],
        [18, 28],
        [23,  6],
        [13, 25],
        [14, 17],
        [34,  2],
        [32, 24],
        [30,  1],
        [15,  1],
        [19,  1],
        [21,  1],
        [11,  1],
        [31,  1],
        [ 7,  1]], device='cuda:0')
tensor([1, 0], device='cuda:0')
tensor([[ 9],
        [20],
        [ 4],
        [ 3],
        [ 5],
        [10],
        [16],
        [ 4],
        [ 3],
        [29],
        [27],
        [12],
        [ 2],
        [ 8],
        [22]], device='cuda:0')
tensor([1], device='cuda:0')


In [36]:
class RNN_LSTM(nn.Module):
    """LSTM classifier that maps each token-index sequence in a batch to one logit.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        embed_size: Dimension of each embedding vector.
        hidden_size: Number of features in the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, embed_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embed_size)
        self.rnn = nn.LSTM(embed_size, hidden_size, num_layers)
        self.fc_out = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Compute one unnormalized score per sequence.

        Args:
            x: Integer tensor of token indices laid out as (seq_len, batch),
                since the LSTM is not constructed with ``batch_first``.

        Returns:
            Tensor of shape (batch, 1) holding the raw (pre-sigmoid) scores.
        """
        embedded = self.embedding(x)

        # No explicit (h0, c0) is passed: nn.LSTM starts from all-zero states by
        # default and allocates them on the same device/dtype as its input, so the
        # module no longer depends on a notebook-global `device` variable.
        outputs, _ = self.rnn(embedded)

        # Classify from the output at the last time step.
        # NOTE(review): in a padded batch the last step of a shorter sequence is a
        # padding position — confirm whether that is intended.
        prediction = self.fc_out(outputs[-1, :, :])

        return prediction

In [43]:
# Architecture settings.
input_size = len(quote.vocab)  # one embedding row per vocabulary entry
embedding_size = 100
hidden_size = 512
num_layers = 2

# Optimisation settings.
learning_rate = 0.005
num_epochs = 10

# Build the network and move its parameters to the selected device.
model = RNN_LSTM(
    input_size=input_size,
    embed_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
)
model = model.to(device)

In [50]:
# Overwrite the embedding table with the pretrained vectors attached to the
# vocabulary; the cell's last expression displays the resulting weights.
pretrained_embeddings = quote.vocab.vectors
embedding_table = model.embedding.weight.data
embedding_table.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [ 0.4918,  1.1164,  1.1424,  ..., -0.5088,  0.6256,  0.4392],
        [-0.4989,  0.7660,  0.8975,  ..., -0.4118,  0.4054,  0.7850],
        [-0.5718,  0.0463,  0.8673,  ..., -0.3566,  0.9293,  0.8995]],
       device='cuda:0')

In [52]:
# Binary cross-entropy computed directly on the model's raw scores (logits).
criterion = nn.BCEWithLogitsLoss()
# Adam over every model parameter, using the configured learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
for epoch in range(num_epochs):
    for batch_idx, batch in enumerate(train_iterator):
        # Place the batch tensors on the selected device.
        data, targets = batch.q.to(device=device), batch.s.to(device=device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass: drop the trailing size-1 dimension so the scores line up
        # with the targets, and cast the targets to the scores' type for the loss.
        scores = model(data)
        predictions = scores.squeeze(1)
        loss = criterion(predictions, targets.type_as(scores))

        # Backward pass followed by one optimizer update.
        loss.backward()
        optimizer.step()