<a href="https://colab.research.google.com/github/JSJeong-me/AI-Innovation-2024/blob/main/NLP/4-4-glove_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GloVe (Global Vectors for Word Representation) 예제

이 예제에서는 사전에 훈련된 GloVe 임베딩을 사용하여 텍스트 데이터의 단어 벡터를 불러오고 활용하는 방법을 보여줍니다. `GloVe` 자체를 직접 학습시키려면 매우 큰 말뭉치와 상당한 연산 자원이 필요하므로, 여기서는 이미 훈련된 `GloVe` 벡터를 다운로드해 사용하는 방법을 살펴봅니다.

In [3]:
# Download the pre-trained GloVe 6B archive (about 822 MB).
# -nc (no-clobber) skips the download when glove.6B.zip is already present,
# so re-running this cell does not fetch the archive again.
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip
# -n never overwrites files that were already extracted, so re-running the
# cell does not stop at unzip's interactive "replace?" prompt.
!unzip -n glove.6B.zip

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Path to the GloVe embedding file (set this to wherever glove.6B.100d.txt was downloaded/extracted)
glove_file_path = 'glove.6B.100d.txt' # If you downloaded the file to the same directory as the script

# GloVe 임베딩을 로드하는 함수
def load_glove_embeddings(file_path):
    """Load GloVe vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a token followed by its vector
    components, e.g. ``king 0.1 -0.2 ...``.

    Args:
        file_path: Path to the GloVe text file (opened as UTF-8).

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            # A blank line (e.g. a stray newline at the end of the file) has
            # no token; skip it instead of failing on values[0].
            if not values:
                continue
            # The first field is the token, the remaining fields are the vector.
            word, *components = values
            embeddings[word] = np.asarray(components, dtype='float32')
    return embeddings

# Load the GloVe embeddings from the file at glove_file_path
glove_embeddings = load_glove_embeddings(glove_file_path)

--2024-10-01 14:26:57--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-10-01 14:26:57--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-10-01 14:26:57--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [4]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Path to the GloVe embedding file (set this to wherever glove.6B.100d.txt was downloaded/extracted)
glove_file_path = 'glove.6B.100d.txt' # If you downloaded the file to the same directory as the script

# GloVe 임베딩을 로드하는 함수
def load_glove_embeddings(file_path):
    """Load GloVe vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a token followed by its vector
    components, e.g. ``king 0.1 -0.2 ...``.

    Args:
        file_path: Path to the GloVe text file (opened as UTF-8).

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            # A blank line (e.g. a stray newline at the end of the file) has
            # no token; skip it instead of failing on values[0].
            if not values:
                continue
            # The first field is the token, the remaining fields are the vector.
            word, *components = values
            embeddings[word] = np.asarray(components, dtype='float32')
    return embeddings

# Load the GloVe embeddings from the file at glove_file_path
glove_embeddings = load_glove_embeddings(glove_file_path)

## 단어 벡터 얻기
특정 단어의 벡터를 가져올 수 있습니다.

In [5]:
# 단어 벡터 얻기
def get_word_vector(word, embeddings):
    """Look up the embedding stored for ``word``.

    Args:
        word: Token to look up.
        embeddings: Mapping from token to vector.

    Returns:
        The stored vector when ``word`` is a key of ``embeddings``; otherwise
        a "not found" message string.
    """
    # Guard clause: report unknown tokens first, then fall through to the lookup.
    if word not in embeddings:
        return f"'{word}' not found in GloVe embeddings."
    return embeddings[word]

# Example: fetch and print the vector for 'king'
king_vector = get_word_vector('king', glove_embeddings)
print(f"Vector for 'king': {king_vector}")

Vector for 'king': [-0.32307  -0.87616   0.21977   0.25268   0.22976   0.7388   -0.37954
 -0.35307  -0.84369  -1.1113   -0.30266   0.33178  -0.25113   0.30448
 -0.077491 -0.89815   0.092496 -1.1407   -0.58324   0.66869  -0.23122
 -0.95855   0.28262  -0.078848  0.75315   0.26584   0.3422   -0.33949
  0.95608   0.065641  0.45747   0.39835   0.57965   0.39267  -0.21851
  0.58795  -0.55999   0.63368  -0.043983 -0.68731  -0.37841   0.38026
  0.61641  -0.88269  -0.12346  -0.37928  -0.38318   0.23868   0.6685
 -0.43321  -0.11065   0.081723  1.1569    0.78958  -0.21223  -2.3211
 -0.67806   0.44561   0.65707   0.1045    0.46217   0.19912   0.25802
  0.057194  0.53443  -0.43133  -0.34311   0.59789  -0.58417   0.068995
  0.23944  -0.85181   0.30379  -0.34177  -0.25746  -0.031101 -0.16285
  0.45169  -0.91627   0.64521   0.73281  -0.22752   0.30226   0.044801
 -0.83741   0.55006  -0.52506  -1.7357    0.4751   -0.70487   0.056939
 -0.7132    0.089623  0.41394  -1.3363   -0.61915  -0.33089  -0.52881


## 단어 간 유사도 측정
코사인 유사도를 사용하여 두 단어 간의 유사도를 계산합니다.

In [9]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity # Make sure to import the cosine_similarity function from sklearn

# Path to the GloVe embedding file (set this to wherever glove.6B.100d.txt was downloaded/extracted)
glove_file_path = 'glove.6B.100d.txt' # If you downloaded the file to the same directory as the script

# GloVe 임베딩을 로드하는 함수
def load_glove_embeddings(file_path):
    """Load GloVe vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a token followed by its vector
    components, e.g. ``king 0.1 -0.2 ...``.

    Args:
        file_path: Path to the GloVe text file (opened as UTF-8).

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            # A blank line (e.g. a stray newline at the end of the file) has
            # no token; skip it instead of failing on values[0].
            if not values:
                continue
            # The first field is the token, the remaining fields are the vector.
            word, *components = values
            embeddings[word] = np.asarray(components, dtype='float32')
    return embeddings

# Load the GloVe embeddings from the file at glove_file_path
glove_embeddings = load_glove_embeddings(glove_file_path)

# 단어 벡터 얻기
def get_word_vector(word, embeddings):
    """Return the vector stored for ``word``, or a message if it is unknown.

    Args:
        word: Token to look up.
        embeddings: Mapping from token to vector.

    Returns:
        ``embeddings[word]`` when the token is present; otherwise a string
        saying the token was not found.
    """
    missing_message = f"'{word}' not found in GloVe embeddings."
    return embeddings[word] if word in embeddings else missing_message

# Example: fetch and print the vector for 'king'
king_vector = get_word_vector('king', glove_embeddings)
print(f"Vector for 'king': {king_vector}")

# 단어 유사도 측정
def word_similarity(word1, word2, embeddings):
    """Return the cosine similarity between the vectors of two words.

    Args:
        word1: First token.
        word2: Second token.
        embeddings: Mapping from token to a vector that supports
            ``reshape`` (e.g. a NumPy array).

    Returns:
        The single similarity value taken from scikit-learn's
        ``cosine_similarity`` result, or ``None`` if either word is missing
        from ``embeddings``.
    """
    # The function is named word_similarity so that it does not shadow the
    # cosine_similarity function imported from sklearn and used below.
    if word1 not in embeddings or word2 not in embeddings:
        return None

    # cosine_similarity works on 2-D inputs, so present each vector as one row.
    row_a, row_b = (embeddings[token].reshape(1, -1) for token in (word1, word2))
    pairwise = cosine_similarity(row_a, row_b)
    return pairwise[0][0]

# Example: measure the similarity between 'king' and 'queen'
similarity = word_similarity('king', 'queen', glove_embeddings) # Call the new function name
# NOTE(review): word_similarity returns None when either word is missing; the :.4f format below would then raise.
print(f"Similarity between 'king' and 'queen': {similarity:.4f}")

Vector for 'king': [-0.32307  -0.87616   0.21977   0.25268   0.22976   0.7388   -0.37954
 -0.35307  -0.84369  -1.1113   -0.30266   0.33178  -0.25113   0.30448
 -0.077491 -0.89815   0.092496 -1.1407   -0.58324   0.66869  -0.23122
 -0.95855   0.28262  -0.078848  0.75315   0.26584   0.3422   -0.33949
  0.95608   0.065641  0.45747   0.39835   0.57965   0.39267  -0.21851
  0.58795  -0.55999   0.63368  -0.043983 -0.68731  -0.37841   0.38026
  0.61641  -0.88269  -0.12346  -0.37928  -0.38318   0.23868   0.6685
 -0.43321  -0.11065   0.081723  1.1569    0.78958  -0.21223  -2.3211
 -0.67806   0.44561   0.65707   0.1045    0.46217   0.19912   0.25802
  0.057194  0.53443  -0.43133  -0.34311   0.59789  -0.58417   0.068995
  0.23944  -0.85181   0.30379  -0.34177  -0.25746  -0.031101 -0.16285
  0.45169  -0.91627   0.64521   0.73281  -0.22752   0.30226   0.044801
 -0.83741   0.55006  -0.52506  -1.7357    0.4751   -0.70487   0.056939
 -0.7132    0.089623  0.41394  -1.3363   -0.61915  -0.33089  -0.52881


## 유사한 단어 찾기
주어진 단어와 가장 유사한 단어를 찾습니다.

In [None]:
# 유사한 단어 찾기
def find_most_similar(word, embeddings, top_n=5):
    """Return the ``top_n`` words whose vectors are most similar to ``word``.

    Similarity is the cosine similarity between embedding vectors. All
    candidates are scored with one matrix-vector product instead of one
    library call per vocabulary word, which matters for vocabularies with
    hundreds of thousands of entries.

    Args:
        word: Query token.
        embeddings: Mapping from token to 1-D numeric vector; all vectors
            must have the same length.
        top_n: Maximum number of neighbours to return.

    Returns:
        A list of ``(token, similarity)`` tuples sorted by descending
        similarity (ties keep the iteration order of ``embeddings``), or a
        "not found" message string if ``word`` is not in ``embeddings``.
    """
    if word not in embeddings:
        return f"'{word}' not found in GloVe embeddings."

    # Every vocabulary entry except the query word itself is a candidate.
    candidates = [other for other in embeddings if other != word]
    if not candidates:
        return []

    query = np.asarray(embeddings[word]).reshape(-1)
    # NOTE: this stacks a copy of all candidate vectors into one 2-D array.
    matrix = np.stack([np.asarray(embeddings[other]).reshape(-1) for other in candidates])

    dots = matrix @ query
    # einsum computes the row-wise squared norms without a full-size temporary.
    norms = np.sqrt(np.einsum('ij,ij->i', matrix, matrix)) * np.linalg.norm(query)
    # All-zero vectors get similarity 0 instead of a division by zero.
    norms[norms == 0] = 1
    scores = dots / norms

    # A stable sort on the negated scores gives descending order while
    # keeping the original relative order of equal scores.
    order = np.argsort(-scores, kind='stable')[:top_n]
    return [(candidates[i], scores[i]) for i in order]

# Example: find and print the 5 words most similar to 'king'
similar_words = find_most_similar('king', glove_embeddings, top_n=5)
print(f"Words most similar to 'king': {similar_words}")

## 단어 벡터 연산
GloVe 벡터를 사용해 간단한 벡터 산술 연산을 수행해 봅니다.

In [None]:
# 단어 벡터 연산 예제
def vector_arithmetic(word1, word2, word3, embeddings, *, exclude_inputs=True):
    """Return the word closest to ``word1 - word2 + word3``.

    The result vector is compared against the vocabulary by cosine
    similarity, computed for all candidates with one matrix-vector product.

    Args:
        word1: Token whose vector is the starting point.
        word2: Token whose vector is subtracted.
        word3: Token whose vector is added.
        embeddings: Mapping from token to 1-D NumPy vector; all vectors must
            have the same length.
        exclude_inputs: When True (default), ``word1``, ``word2`` and
            ``word3`` are not eligible as the answer. Without this, the
            nearest neighbour of the result vector is frequently just one of
            the input words, which makes the analogy uninformative. Pass
            False to rank the whole vocabulary.

    Returns:
        The best-matching token (the first one in ``embeddings`` order on
        ties), or ``None`` if any input word is missing or no candidate
        remains.
    """
    if not (word1 in embeddings and word2 in embeddings and word3 in embeddings):
        return None

    target = np.asarray(embeddings[word1] - embeddings[word2] + embeddings[word3]).reshape(-1)

    skipped = {word1, word2, word3} if exclude_inputs else set()
    candidates = [other for other in embeddings if other not in skipped]
    if not candidates:
        return None

    # NOTE: this stacks a copy of all candidate vectors into one 2-D array.
    matrix = np.stack([np.asarray(embeddings[other]).reshape(-1) for other in candidates])

    dots = matrix @ target
    # einsum computes the row-wise squared norms without a full-size temporary.
    norms = np.sqrt(np.einsum('ij,ij->i', matrix, matrix)) * np.linalg.norm(target)
    # All-zero vectors get similarity 0 instead of a division by zero.
    norms[norms == 0] = 1
    scores = dots / norms

    # argmax returns the first maximum, i.e. the earliest candidate on ties.
    return candidates[int(np.argmax(scores))]

# Example: find and print the word closest to 'king' - 'man' + 'woman'
result = vector_arithmetic('king', 'man', 'woman', glove_embeddings)
print(f"Result of 'king' - 'man' + 'woman': {result}")