In [1]:
import sys
sys.path.append('..')
import numpy as np

In [2]:
def preprocessing(text):
    """Tokenize ``text`` and encode it as an array of word ids.

    The text is lower-cased, every "." is split off as its own token, and
    the result is split on whitespace. Each distinct token receives an
    integer id in order of first appearance, starting at 0.

    Args:
        text: The raw string to tokenize.

    Returns:
        A tuple ``(corpus, word_to_id, id_to_word)`` where ``corpus`` is a
        NumPy array holding the id of every token in order, ``word_to_id``
        maps token -> id and ``id_to_word`` maps id -> token.
    """
    # Lower-case everything, then put a space before each period so that
    # split() yields "." as a separate token.
    tokens = text.lower().replace(".", " .").split()

    word_to_id = {}
    for token in tokens:
        # setdefault only stores the value when the token is unseen, so the
        # id equals the number of distinct tokens encountered before it.
        word_to_id.setdefault(token, len(word_to_id))

    # Reverse lookup table, built in the same (first-appearance) order.
    id_to_word = {token_id: token for token, token_id in word_to_id.items()}

    corpus = np.array([word_to_id[token] for token in tokens])

    return corpus, word_to_id, id_to_word

In [3]:
def create_co_matrix(corpus, vocab_size, window_size=1):
    """Build a word co-occurrence count matrix from an id-encoded corpus.

    For every position in ``corpus``, each token lying within
    ``window_size`` positions to the left or right is counted once as a
    neighbour of the token at that position.

    Args:
        corpus: Indexable sequence of integer word ids (used as row and
            column indices of the result).
        vocab_size: Number of rows and columns of the returned matrix.
        window_size: How many positions on each side count as context.

    Returns:
        An int32 array of shape ``(vocab_size, vocab_size)`` whose entry
        ``[a, b]`` is the number of times id ``b`` appeared inside the
        window around an occurrence of id ``a``.
    """
    n_tokens = len(corpus)
    counts = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for center_pos in range(n_tokens):
        center_id = corpus[center_pos]
        for offset in range(1, window_size + 1):
            # Visit the neighbour on each side; skip positions that fall
            # outside the corpus at either end.
            for neighbor_pos in (center_pos - offset, center_pos + offset):
                if 0 <= neighbor_pos < n_tokens:
                    counts[center_id, corpus[neighbor_pos]] += 1

    return counts

In [10]:
def cos_similarity(x, y, eps=1e-8):
    """Return the cosine similarity between vectors ``x`` and ``y``.

    Each vector is divided by ``sqrt(sum(v**2) + eps)`` and the dot product
    of the two scaled vectors is returned. ``eps`` keeps the divisor
    non-zero, so an all-zero vector yields a similarity of 0 instead of a
    division-by-zero result.

    Args:
        x: First vector (array-like).
        y: Second vector (array-like), compatible with ``x`` for ``np.dot``.
        eps: Small constant added to the squared norm before the square
            root is taken.

    Returns:
        The dot product of the two normalized vectors.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Integer inputs (e.g. int32 co-occurrence rows) are promoted to float64
    # before squaring: ``x**2`` is evaluated in the array's own dtype, so a
    # count above 46340 would wrap around in int32 and make the norm
    # negative, turning the result into NaN. Float inputs are left as-is.
    if x.dtype.kind in "iub":
        x = x.astype(np.float64)
    if y.dtype.kind in "iub":
        y = y.astype(np.float64)

    nx = x / np.sqrt(np.sum(x**2) + eps)
    ny = y / np.sqrt(np.sum(y**2) + eps)

    return np.dot(nx, ny)

In [4]:
# One-sentence toy corpus used by the cells that follow.
text = "you say goodbye and i say hello."

In [5]:
# Tokenize the sentence: get the id-encoded corpus plus both lookup tables.
corpus, word_to_id, id_to_word = preprocessing(text)

In [6]:
# Inspect the three values returned by preprocessing():
print(corpus)      # word ids in sentence order
print(word_to_id)  # token -> id
print(id_to_word)  # id -> token

[0 1 2 3 4 1 5 6]
{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}
{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}


In [7]:
# The vocabulary size is the number of distinct tokens; it fixes the
# dimensions of the (vocab_size x vocab_size) co-occurrence matrix.
vocab_size = len(word_to_id)
# window_size is left at its default of 1 (immediate neighbours only).
co_matrix = create_co_matrix(corpus, vocab_size=vocab_size)

In [8]:
# Row i holds the neighbour counts for the word with id i.
print(co_matrix)

[[0 1 0 0 0 0 0]
 [1 0 1 0 1 1 0]
 [0 1 0 1 0 0 0]
 [0 0 1 0 1 0 0]
 [0 1 0 1 0 0 0]
 [0 1 0 0 0 0 1]
 [0 0 0 0 0 1 0]]


In [12]:
# Short alias for the co-occurrence matrix (same array object, not a copy).
C = co_matrix

In [13]:
# Pull out the co-occurrence rows for "you" and "i" to use as their vectors.
c0 = C[word_to_id["you"]]
c1 = C[word_to_id["i"]]
print(c0)
print(c1)

[0 1 0 0 0 0 0]
[0 1 0 1 0 0 0]


In [14]:
# Cosine similarity between the "you" and "i" co-occurrence vectors.
print(cos_similarity(c0, c1))

0.7071067758832467
