<a href="https://colab.research.google.com/github/HANE48/colab_Deep_learining/blob/main/CBOW_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import numpy as np

def softmax(x):
    """Return the softmax of *x* taken along axis 0.

    For a 1-D array this is the usual softmax over all elements. For
    higher-rank input each slice along axis 0 (e.g. each column of a 2-D
    array) is normalised independently.

    The maximum is subtracted per slice before exponentiating. Subtracting
    one global maximum would let a slice whose values are far below that
    maximum underflow to all zeros and then divide 0 by 0.
    """
    x = np.asarray(x)
    # Shift along the same axis that is normalised so every slice has a
    # largest exponent of exactly 0 (exp(0) == 1), keeping the sum >= 1.
    shifted = x - np.max(x, axis=0, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=0, keepdims=True)

class CBOW:
    """Minimal NumPy continuous bag-of-words (CBOW) word-embedding model.

    The model has two weight matrices:

    * ``W1`` with shape ``(embedding_dim, vocab_size)`` - column ``i`` is the
      embedding of the word with index ``i``.
    * ``W2`` with shape ``(vocab_size, embedding_dim)`` - projects the hidden
      vector back to one score per vocabulary word.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Initialise both weight matrices with small Gaussian noise.

        Args:
            vocab_size: Number of distinct words (length of one-hot vectors).
            embedding_dim: Size of the hidden/embedding vector.
        """
        self.W1 = np.random.randn(embedding_dim, vocab_size) * 0.01
        self.W2 = np.random.randn(vocab_size, embedding_dim) * 0.01
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

    def forward(self, x):
        """Run one forward pass.

        Args:
            x: Input vector of length ``vocab_size`` (in ``train`` this is the
                mean of the context words' one-hot vectors).

        Returns:
            ``(y_pred, h)`` where ``h = W1 @ x`` is the hidden vector and
            ``y_pred = softmax(W2 @ h)`` is the predicted distribution over
            the vocabulary.
        """
        h = np.dot(self.W1, x)
        u = np.dot(self.W2, h)
        y_pred = softmax(u)
        return y_pred, h

    def backward(self, x, y_true, h, y_pred, learning_rate):
        """Apply one gradient-descent step to ``W1`` and ``W2`` in place.

        Args:
            x: The input vector that was passed to ``forward``.
            y_true: One-hot target vector.
            h: Hidden vector returned by ``forward``.
            y_pred: Predicted distribution returned by ``forward``.
            learning_rate: Step size for the update.
        """
        # Gradient of the cross-entropy loss w.r.t. the pre-softmax scores.
        dl_du = y_pred - y_true
        dl_dw2 = np.outer(dl_du, h)
        # dl_dh must be computed with the W2 values used in the forward pass,
        # i.e. before W2 is updated below.
        dl_dh = np.dot(self.W2.T, dl_du)
        dl_dw1 = np.outer(dl_dh, x)

        self.W1 -= learning_rate * dl_dw1
        self.W2 -= learning_rate * dl_dw2

    def train(self, data, word_to_index, context_size, epochs, learning_rate):
        """Train with per-sample gradient descent.

        Every 100th epoch the summed negative log-likelihood of that epoch is
        printed. Each sample's loss uses the prediction made *before* that
        sample's weight update.

        Args:
            data: Iterable of ``(context_words, target_word)`` pairs.
            word_to_index: Mapping from word to vocabulary index.
            context_size: Accepted for interface compatibility; not used by
                this implementation (context length is taken from ``data``).
            epochs: Number of passes over ``data``.
            learning_rate: Step size passed to ``backward``.
        """
        for epoch in range(epochs):
            loss = 0
            for context, target in data:
                target_idx = word_to_index[target]
                # The input is the average of the context one-hot vectors.
                x = np.mean(
                    [self.word_to_one_hot(word_to_index[word]) for word in context],
                    axis=0,
                )
                y_true = self.word_to_one_hot(target_idx)
                y_pred, h = self.forward(x)
                self.backward(x, y_true, h, y_pred, learning_rate)
                loss -= np.log(y_pred[target_idx])
            if (epoch + 1) % 100 == 0:
                print(f"Epoch {epoch + 1}, Loss: {loss}")

    def word_to_one_hot(self, word_idx):
        """Return a length-``vocab_size`` vector with a 1 at ``word_idx``."""
        x = np.zeros(self.vocab_size)
        x[word_idx] = 1
        return x

    def get_similar_words(self, word, word_to_index, index_to_word, top_n=5):
        """Return the ``top_n`` words most cosine-similar to ``word``.

        Args:
            word: Query word.
            word_to_index: Mapping from word to vocabulary index.
            index_to_word: Mapping from vocabulary index to word.
            top_n: Maximum number of neighbours to return.

        Returns:
            A list of ``(word, similarity)`` tuples sorted by decreasing
            similarity, never containing the query word itself; or the string
            ``"Word not found in dictionary."`` when ``word`` is unknown.
        """
        if word not in word_to_index:
            return "Word not found in dictionary."

        # Fetch the embedding vector of the query word.
        word_idx = word_to_index[word]
        word_vec = self.W1[:, word_idx]

        # Cosine similarity against every word. A zero-norm embedding has no
        # direction, so its similarity is defined as 0 instead of 0/0 = NaN.
        dots = np.dot(self.W1.T, word_vec)
        denom = np.linalg.norm(self.W1, axis=0) * np.linalg.norm(word_vec)
        word_sim = np.divide(
            dots, denom, out=np.zeros_like(dots, dtype=float), where=denom > 0
        )

        # Rank by decreasing similarity and drop the query word by index.
        # Skipping "the first entry" is not reliable: another word with an
        # identical embedding ties with the query and may be ranked first.
        ranked = np.argsort(-word_sim, kind="stable")
        similar_word_indices = ranked[ranked != word_idx][:top_n]

        # Return the similar words together with their similarity scores.
        similar_words = [(index_to_word[i], word_sim[i]) for i in similar_word_indices]
        return similar_words

# Corpus and vocabulary construction
corpus = ["apple banana apple", "banana orange fruit", "orange banana apple", "dog cat animal", "cat monkey animal"]
words = set(" ".join(corpus).split())
# Index the words in sorted order. Iterating the set directly would assign
# indices in an order that changes between interpreter runs (string hash
# randomization), so embedding columns would not be reproducible.
word_to_index = {word: i for i, word in enumerate(sorted(words))}
index_to_word = {i: word for word, i in word_to_index.items()}

# Model initialisation and training
vocab_size = len(word_to_index)
embedding_dim = 10
model = CBOW(vocab_size, embedding_dim)
# Hand-written (context words, target word) training pairs.
data = [(["banana", "apple"], "apple"), (["banana", "orange"], "fruit"), (["orange", "apple"], "banana"), (["dog", "animal"], "cat"), (["cat", "animal"], "monkey")]
model.train(data, word_to_index, context_size=2, epochs=1500, learning_rate=0.01)

# Word-similarity lookup
word_input = input("Enter a word to find similar words: ")
similar_words = model.get_similar_words(word_input, word_to_index, index_to_word)
print("Similar words:", similar_words)


Epoch 100, Loss: 10.391829993565135
Epoch 200, Loss: 10.368270623236645
Epoch 300, Loss: 10.237436793741884
Epoch 400, Loss: 9.565418124197057
Epoch 500, Loss: 7.6244901280338295
Epoch 600, Loss: 5.83534642630846
Epoch 700, Loss: 4.888529836964724
Epoch 800, Loss: 4.012892970178593
Epoch 900, Loss: 2.8951604712494032
Epoch 1000, Loss: 1.7700609416932707
Epoch 1100, Loss: 1.0134933702582143
Epoch 1200, Loss: 0.617358608948871
Epoch 1300, Loss: 0.4129290930754703
Epoch 1400, Loss: 0.29839472993764493
Epoch 1500, Loss: 0.22831529606783246
Enter a word to find similar words: apple
Similar words: [('banana', 0.1415753048238943), ('monkey', 0.12103105772741465), ('orange', 0.10377409019781453), ('dog', -0.15435225487791943), ('fruit', -0.1774894948600577)]
