In [1]:
import numpy as np
from collections import Counter
import gensim

# Toy corpus: each entry is one whitespace-separated training sentence.
sentences = [
    'nice great best amazing',
    'stop lies',
    'pitiful nerd',
    'excellent work',
    'supreme quality',
    'bad',
    'highly respectable',
]
# Binary labels aligned by position with `sentences`
# (1 appears to mark the positive examples, 0 the negative ones).
y_train = [1, 0, 0, 1, 1, 0, 1]

In [2]:
# Whitespace tokenization: every sentence becomes a list of word strings.
tokenized_sentences = list(map(str.split, sentences))
print('단어 토큰화 된 결과 :', tokenized_sentences)

단어 토큰화 된 결과 : [['nice', 'great', 'best', 'amazing'], ['stop', 'lies'], ['pitiful', 'nerd'], ['excellent', 'work'], ['supreme', 'quality'], ['bad'], ['highly', 'respectable']]


In [3]:
# Flatten the tokenized sentences into one list of words, in corpus order.
word_list = [word for sent in tokenized_sentences for word in sent]

# Frequency table over the whole corpus; its length is the number of distinct words.
word_counts = Counter(word_list)
print('총 단어수: ', len(word_counts))

총 단어수:  15


In [4]:
# Words ordered from most to least frequent; the sort is stable, so words with
# equal counts keep the order in which they were first counted.
vocab = [word for word, _count in word_counts.most_common()]
print(vocab)

['nice', 'great', 'best', 'amazing', 'stop', 'lies', 'pitiful', 'nerd', 'excellent', 'work', 'supreme', 'quality', 'bad', 'highly', 'respectable']


In [5]:
# Indices 0 and 1 are reserved for the padding and unknown-word tokens;
# vocabulary words are numbered from 2 in `vocab` order.
word_to_index = {'<PAD>': 0, '<UNK>': 1}
word_to_index.update({word: position for position, word in enumerate(vocab, start=2)})

# Size of the vocabulary including the two special tokens.
vocab_size = len(word_to_index)
print('Padding Token, UNK Token을 고려한 단어 집합의 크기: ', vocab_size)

Padding Token, UNK Token을 고려한 단어 집합의 크기:  17


In [6]:
# Show the complete token -> integer index mapping.
print(word_to_index)

{'<PAD>': 0, '<UNK>': 1, 'nice': 2, 'great': 3, 'best': 4, 'amazing': 5, 'stop': 6, 'lies': 7, 'pitiful': 8, 'nerd': 9, 'excellent': 10, 'work': 11, 'supreme': 12, 'quality': 13, 'bad': 14, 'highly': 15, 'respectable': 16}


In [7]:
def texts_to_sequences(tokenized_X_data, word_to_index):
    """Convert tokenized sentences into lists of integer indices.

    Args:
        tokenized_X_data: iterable of sentences, each an iterable of word strings.
        word_to_index: mapping from word to integer index. It must contain the
            key '<UNK>' if any word in the data is missing from the mapping.

    Returns:
        A list with one list of integer indices per input sentence, in order.

    Raises:
        KeyError: if an out-of-vocabulary word is met and `word_to_index`
            has no '<UNK>' entry.
    """
    # Out-of-vocabulary words map to the '<UNK>' index. The previous fallback
    # looked up the empty string, a key that is never in the vocabulary, so the
    # handler itself raised KeyError. '<UNK>' is only looked up when needed.
    return [
        [
            word_to_index[word] if word in word_to_index else word_to_index['<UNK>']
            for word in sent
        ]
        for sent in tokenized_X_data
    ]

# Integer-encode the whole corpus with the vocabulary built above.
X_encoded = texts_to_sequences(
    tokenized_X_data=tokenized_sentences,
    word_to_index=word_to_index,
)
print(X_encoded)

[[2, 3, 4, 5], [6, 7], [8, 9], [10, 11], [12, 13], [14], [15, 16]]


In [8]:
# Length of the longest encoded sentence; used as the padded sequence length.
max_len = max(map(len, X_encoded))
print('Max len :', max_len)

Max len : 4


In [9]:
def pad_sequences(sentences, max_len):
    """Right-pad (and truncate) integer sequences to a fixed length.

    Args:
        sentences: sequence of integer sequences.
        max_len: number of columns in the result.

    Returns:
        An integer array of shape (len(sentences), max_len). Each row holds
        the first `max_len` items of the matching sequence, followed by zeros.
    """
    padded = np.zeros((len(sentences), max_len), dtype = int)
    # Each `row` is a view into `padded`, so writing to it fills the matrix.
    for row, tokens in zip(padded, sentences):
        if len(tokens) == 0:
            continue  # an empty sequence stays all zeros
        clipped = np.array(tokens)[:max_len]
        row[:len(clipped)] = clipped
    return padded

# Pad every encoded sentence to the same length so they stack into one matrix.
X_train = pad_sequences(X_encoded, max_len)
# Rebind the label list as a NumPy array.
y_train = np.array(y_train)
print('Padding Output :', X_train, sep = '\n')

Padding Output :
[[ 2  3  4  5]
 [ 6  7  0  0]
 [ 8  9  0  0]
 [10 11  0  0]
 [12 13  0  0]
 [14  0  0  0]
 [15 16  0  0]]


In [15]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset

In [16]:
class SimpleModel(nn.Module):
    """Binary classifier: embedding lookup -> flatten -> linear -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        seq_len: number of token positions in each input row. When omitted,
            the notebook-level `max_len` is used, which keeps existing
            two-argument calls working unchanged.
    """

    def __init__(self, vocab_size, embedding_dim, seq_len=None):
        super().__init__()
        if seq_len is None:
            # Backward-compatible fallback to the global set by the padding cell.
            seq_len = max_len
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.flatten = nn.Flatten()
        # The flattened embeddings hold seq_len * embedding_dim features per row.
        self.fc = nn.Linear(embedding_dim * seq_len, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one sigmoid output per row.

        Args:
            x: integer index tensor of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, 1) with values produced by the sigmoid.
        """
        embedded = self.embedding(x)         # (batch, seq_len, embedding_dim)
        flattened = self.flatten(embedded)   # (batch, seq_len * embedding_dim)
        output = self.fc(flattened)          # (batch, 1)
        return self.sigmoid(output)

In [17]:
# Use the Apple-silicon MPS backend when torch reports it as available, else the CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# Width of each word embedding vector.
embedding_dim = 100

# Build the model, then move its parameters to the selected device.
simple_model = SimpleModel(vocab_size, embedding_dim)
simple_model = simple_model.to(device)

In [18]:
# Binary cross-entropy on probabilities; the model's forward() already applies a sigmoid.
criterion = nn.BCELoss()
# Adam with the library's default hyperparameters.
optimizer = Adam(params = simple_model.parameters())

In [19]:
# Token indices are integer (long) tensors; labels are float32 for the loss.
features_tensor = torch.tensor(X_train, dtype = torch.long)
labels_tensor = torch.tensor(y_train, dtype = torch.float32)

train_dataset = TensorDataset(features_tensor, labels_tensor)
# Mini-batches of two samples, in dataset order (no shuffling requested).
train_dataloader = DataLoader(train_dataset, batch_size = 2)
print(len(train_dataloader))

4


In [20]:
# Train for 10 passes over the data loader.
for epoch in range(10):
    for inputs, targets in train_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Clear gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Call the module itself instead of .forward() so that nn.Module.__call__
        # runs any registered hooks. view(-1) turns (batch, 1) into (batch,)
        # to match the shape of `targets`.
        outputs = simple_model(inputs).view(-1)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    # NOTE: this is the loss of the last mini-batch of the epoch, not an epoch average.
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

Epoch 1, Loss: 0.9972531199455261
Epoch 2, Loss: 0.7904385328292847
Epoch 3, Loss: 0.599193811416626
Epoch 4, Loss: 0.4577890932559967
Epoch 5, Loss: 0.36288413405418396
Epoch 6, Loss: 0.30181896686553955
Epoch 7, Loss: 0.2624887526035309
Epoch 8, Loss: 0.23584935069084167
Epoch 9, Loss: 0.21577180922031403
Epoch 10, Loss: 0.19846446812152863


In [21]:
!pip install gdown
!gdown https://drive.google.com/uc?id=1Av37IVBQAAntSe1X3MOAl5gvowQzd2_j

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0
zsh:1: no matches found: https://drive.google.com/uc?id=1Av37IVBQAAntSe1X3MOAl5gvowQzd2_j


In [23]:
# Load the word2vec vectors from the local binary file
# (presumably the file fetched by the gdown cell — confirm the downloaded name matches).
word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# One 300-column row per vocabulary index, initialised to zeros.
embedding_matrix = np.zeros(shape=(vocab_size, 300))
print('Embedding Matrix Size: ', embedding_matrix.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'GoogleNews-vectors-negative300.bin.gz'