In [1]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Arrange the four vocabulary words as a single column (one row per word)
# before handing them to the encoder.
words = np.array(["자연어", "처리", "딥러닝", "머신러닝"])[:, np.newaxis]

# sparse_output=False: the result is printed below as a dense array.
encoder = OneHotEncoder(sparse_output=False)
one_hot = encoder.fit_transform(words)

# Header line followed by the encoded matrix.
print("✅ 원-핫 인코딩 결과:", one_hot, sep="\n")

✅ 원-핫 인코딩 결과:
[[0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]]


In [2]:
# -*- coding: utf-8 -*-

from gensim.models import word2vec
import logging

# Show INFO-level log messages in the cell output, prefixed with a timestamp.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s : %(message)s')

# Corpus to train on and the path the trained model is written to.
input_file = 'text8.txt'
output_model = 'word2vec.model'

# Wrap the corpus file so Word2Vec can iterate over it.
sentences = word2vec.LineSentence(input_file)

# `vector_size` is the current name of the argument formerly called `size`.
# min_count=1 keeps every word, however rare.
model = word2vec.Word2Vec(
    sentences,
    window=10,
    min_count=1,
    vector_size=100,
)

# Persist the trained model, then confirm on stdout.
model.save(output_model)
print("✅ Word2Vec 모델이 성공적으로 저장되었습니다.")


2025-05-23 14:07:33,916 : INFO : collecting all words and their counts
2025-05-23 14:07:36,027 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-05-23 14:07:40,397 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences
2025-05-23 14:07:40,397 : INFO : Creating a fresh vocabulary
2025-05-23 14:07:41,503 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 253854 unique words (100.00% of original 253854, drops 0)', 'datetime': '2025-05-23T14:07:41.503080', 'gensim': '4.3.3', 'python': '3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26100-SP0', 'event': 'prepare_vocab'}
2025-05-23 14:07:41,503 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 17005207 word corpus (100.00% of original 17005207, drops 0)', 'datetime': '2025-05-23T14:07:41.503080', 'gensim': '4.3.3', 'python': '3.12.7 | packaged by Anacond

✅ Word2Vec 모델이 성공적으로 저장되었습니다.


In [3]:
from gensim.models import word2vec

# Restore the model stored at 'word2vec.model'.
model = word2vec.Word2Vec.load('word2vec.model')

# Analogy query: add 'woman' and 'king', subtract 'man', keep the single best match.
results = model.wv.most_similar(topn=1, negative=['man'], positive=['woman', 'king'])

# Each hit is a (word, similarity) pair; print the two parts separated by a tab.
for word, similarity in results:
    print(word, '\t', similarity)


2025-05-23 14:08:46,672 : INFO : loading Word2Vec object from word2vec.model
2025-05-23 14:08:46,748 : INFO : loading wv recursively from word2vec.model.wv.* with mmap=None
2025-05-23 14:08:46,758 : INFO : loading vectors from word2vec.model.wv.vectors.npy with mmap=None
2025-05-23 14:08:46,813 : INFO : loading syn1neg from word2vec.model.syn1neg.npy with mmap=None
2025-05-23 14:08:46,864 : INFO : setting ignored attribute cum_table to None
2025-05-23 14:08:48,670 : INFO : Word2Vec lifecycle event {'fname': 'word2vec.model', 'datetime': '2025-05-23T14:08:48.670369', 'gensim': '4.3.3', 'python': '3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26100-SP0', 'event': 'loaded'}


queen 	 0.7008485198020935


In [5]:
from konlpy.tag import Okt  # use Okt instead of Twitter
# Module-level tagger instance; tokenize() and get_sentence_vector() call its pos() method.
tagger = Okt()

def tokenize(doc):
    """Tag ``doc`` with the module-level ``tagger`` and return 'text/Tag' strings.

    ``tagger.pos`` is called with ``norm=True`` and ``stem=True``; every item
    it returns is joined with '/' into a single string.
    """
    tagged = tagger.pos(doc, norm=True, stem=True)
    return list(map('/'.join, tagged))

# Tag a sample sentence; each entry has the form 'word/Tag'.
sentences = tokenize(u'이것도 되나욕 ㅋㅋㅋ')
print('1st result:', sentences)

# Keep only the word part of entries tagged Verb, Adjective or Noun.
noun_adv_verb_only_list = []
for tagged_word in sentences:
    parts = tagged_word.split("/")
    if parts[1] in ("Verb", "Adjective", "Noun"):
        noun_adv_verb_only_list.append(parts[0])
print('2st result:', noun_adv_verb_only_list)


1st result: ['이/Determiner', '것/Noun', '도/Josa', '되다/Verb', '욕/Noun', 'ㅋㅋㅋ/KoreanParticle']
2st result: ['것', '되다', '욕']


In [6]:
from tqdm import tqdm
from gensim.models import Word2Vec
from konlpy.tag import Okt
import codecs

# Read data
def read_data(filename):
    """Load a UTF-8, tab-separated text file into a list of rows.

    The first line is treated as a header and dropped. Every remaining line
    becomes one list of its tab-separated fields.

    Records are delimited by newline sequences only. ``str.splitlines()`` also
    breaks on characters such as U+2028, U+0085, U+000B, U+000C or U+001C,
    which can occur inside free text and would cut one record into several
    short rows, so lines are taken from the file iterator instead.

    Args:
        filename: Path of the file to read.

    Returns:
        list[list[str]]: One list of fields per data line; empty when the file
        holds nothing beyond the header.
    """
    # Text mode with default newline handling treats \n, \r\n and \r as line ends.
    with open(filename, mode='r', encoding='utf-8') as f:
        next(f, None)  # skip the header; harmless on an empty file
        return [line.rstrip('\n').split('\t') for line in f]

# Rows of the ratings file; read_data() has already dropped the header line.
ratings_train = read_data('ratings_train.txt')

# Tagger instance used by tokens() below.
okt = Okt()

# Tokenizer
def tokens(doc):
    """Return the tagged pieces of ``doc`` as 'text/Tag' strings.

    Calls ``okt.pos`` with ``norm=True`` and ``stem=True`` and joins each item
    it returns with '/'.
    """
    joined = []
    for pair in okt.pos(doc, norm=True, stem=True):
        joined.append('/'.join(pair))
    return joined

# Pull out the review text: the second field of every row.
docs = [fields[1] for fields in ratings_train]

# Tokenise every review, with a progress bar.
data = list(map(tokens, tqdm(docs, desc="Tokenizing")))

# Train Word2Vec on the tokenised reviews; the corpus is wrapped in tqdm.
# NOTE(review): in the captured output the bar advances at ~200k it/s next to
# the "collecting all words" log lines, so it appears to track only the first
# pass over the corpus rather than the whole training run — confirm.
corpus = tqdm(data, desc="Training Word2Vec")
w2v_model = Word2Vec(corpus, workers=4, min_count=5, window=5, vector_size=100)

# Save the trained model to disk ...
w2v_model.save('word2vec_trained.model')

# ... and load it back under the name `model`.
model = Word2Vec.load('word2vec_trained.model')

# Analogy query on the in-memory model: tokens of '남자 여배우' minus tokens
# of '배우', best match only.
plus_tokens = tokens(u'남자 여배우')
minus_tokens = tokens(u'배우')
print(w2v_model.wv.most_similar(positive=plus_tokens, negative=minus_tokens, topn=1))


Tokenizing: 100%|██████████| 150000/150000 [09:54<00:00, 252.12it/s]
Training Word2Vec:   0%|          | 0/150000 [00:00<?, ?it/s]2025-05-23 14:20:59,280 : INFO : collecting all words and their counts
2025-05-23 14:20:59,280 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-05-23 14:20:59,320 : INFO : PROGRESS: at sentence #10000, processed 146795 words, keeping 13496 word types
2025-05-23 14:20:59,372 : INFO : PROGRESS: at sentence #20000, processed 289797 words, keeping 19101 word types
Training Word2Vec:  14%|█▍        | 21058/150000 [00:00<00:00, 206829.53it/s]2025-05-23 14:20:59,411 : INFO : PROGRESS: at sentence #30000, processed 435241 words, keeping 23273 word types
2025-05-23 14:20:59,447 : INFO : PROGRESS: at sentence #40000, processed 582173 words, keeping 26843 word types
Training Word2Vec:  33%|███▎      | 49276/150000 [00:00<00:00, 249667.97it/s]2025-05-23 14:20:59,481 : INFO : PROGRESS: at sentence #50000, processed 725793 words, keeping 298

[('여자/Noun', 0.8186426758766174)]


In [29]:
# Ten nearest vocabulary entries to the combined tokens of the two names.
name_tokens = tokenize(u'정우성 조인성')
print(model.wv.most_similar(positive=name_tokens, topn=10))

[('전도연/Noun', 0.901394248008728), ('임수정/Noun', 0.9000663161277771), ('패닝/Noun', 0.8982387185096741), ('엄태웅/Noun', 0.8831029534339905), ('박용우/Noun', 0.8796902894973755), ('문근영/Noun', 0.879190981388092), ('고현정/Noun', 0.8754245638847351), ('하비에르/Noun', 0.8725006580352783), ('김민준/Noun', 0.8712255954742432), ('테론/Noun', 0.8706749081611633)]


In [8]:
def get_sentence_vector(sentence):
    """Average the word vectors of the nouns, adjectives and verbs in ``sentence``.

    The sentence is tagged with the module-level ``tagger`` (``norm=True``,
    ``stem=True``), whose ``pos`` call yields ``(text, tag)`` pairs. Pairs
    whose tag is not Noun, Adjective or Verb are skipped. Each remaining pair
    is looked up in ``model.wv`` under the key ``'text/Tag'`` (the form
    produced by ``tokenize``/``tokens``), falling back to the bare text for
    vocabularies keyed by plain words. Pairs found under neither key are
    ignored.

    Args:
        sentence: Text to embed.

    Returns:
        numpy.ndarray: Element-wise mean of the vectors found, or a zero
        vector of length ``model.vector_size`` when none was found.
    """
    content_tags = {'Noun', 'Adjective', 'Verb'}
    keyed_vectors = model.wv

    vectors = []
    # pos() returns tuples, so unpack them rather than splitting a string.
    for text, tag in tagger.pos(sentence, norm=True, stem=True):
        if tag not in content_tags:
            continue
        for key in (text + '/' + tag, text):
            if key in keyed_vectors:
                vectors.append(keyed_vectors[key])
                break

    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

In [17]:
# Five short Korean sentences used as a toy corpus.
docs_2 = [
    "나는 NLP를 공부하고 있다.",
    "자연어 처리는 인공지능의 중요한 분야이다.",
    "딥러닝과 머신러닝을 활용하여 NLP 연구를 진행한다.",
    "컴퓨터는 인간의 언어를 이해할 수 있을까?",
    "한국어 NLP는 영어보다 어려운 점이 많다.",
]

# Tokenise each sentence, with a progress bar.
data_2 = list(map(tokens, tqdm(docs_2, desc="Tokenizing_second")))

# Train on the toy corpus with sg=1 and a window of 3; min_count=1 keeps every word.
corpus_2 = tqdm(data_2, desc="Training Word2Vec_second")
w2v_model = Word2Vec(corpus_2, min_count=1, sg=1, window=3, vector_size=100)

# Save the trained model
w2v_model.save('word2vec_trained_2.model')

# Load it back as model_2
model_2 = Word2Vec.load('word2vec_trained_2.model')

Tokenizing_second: 100%|██████████| 5/5 [00:00<00:00, 427.70it/s]
Training Word2Vec_second:   0%|          | 0/5 [00:00<?, ?it/s]2025-05-23 14:26:38,452 : INFO : collecting all words and their counts
2025-05-23 14:26:38,453 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
Training Word2Vec_second: 100%|██████████| 5/5 [00:00<00:00, 1609.97it/s]
2025-05-23 14:26:38,456 : INFO : collected 40 word types from a corpus of 53 raw words and 5 sentences
2025-05-23 14:26:38,458 : INFO : Creating a fresh vocabulary
2025-05-23 14:26:38,460 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 40 unique words (100.00% of original 40, drops 0)', 'datetime': '2025-05-23T14:26:38.460114', 'gensim': '4.3.3', 'python': '3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26100-SP0', 'event': 'prepare_vocab'}
2025-05-23 14:26:38,460 : INFO : Word2Vec lifecycle event {'msg': 'effective_

In [34]:
# Ask for a query sentence and echo it back.
text = input("Enter a sentence: ")
print("입력한 문장:", text)

# Look up the five nearest neighbours of the tokenised query in the toy model.
# The KeyError branch reports a query word the model was not trained on.
# NOTE(review): a query that tokenises to nothing (e.g. empty input) is not
# covered by this handler — confirm what most_similar raises in that case.
try:
    neighbours = model_2.wv.most_similar(positive=tokenize(text), topn=5)
except KeyError:
    print("해당 단어는 학습되지 않았습니다.")
else:
    print(neighbours)

입력한 문장: 자연어
[('./Punctuation', 0.21917827427387238), ('를/Josa', 0.1748206466436386), ('있다/Adjective', 0.1646600067615509), ('늘다/Verb', 0.14230895042419434), ('러닝/Noun', 0.10904104262590408)]


In [37]:
# Analogy query: tokens of '왕 여자' minus tokens of '남자', top five matches.
plus_query = tokenize(u'왕 여자')
minus_query = tokenize(u'남자')
print(model.wv.most_similar(positive=plus_query, negative=minus_query, topn=5))

[('박보영/Noun', 0.6900759339332581), ('하지원/Noun', 0.6764721274375916), ('테론/Noun', 0.6622906923294067), ('곽지민/Noun', 0.6491292715072632), ('신세경/Noun', 0.6464618444442749)]
