# Import

In [2]:
!sudo apt update
!sudo apt install openjdk-17-jdk -y
!echo 'export JAVA_HOME=$(dirname $(dirname $(readlink -f $(which java))))' >> ~/.bashrc
!source ~/.bashrc

!pip install konlpy

Get:1 http://archive.ubuntu.com/ubuntu noble InRelease [256 kB]
Get:2 http://security.ubuntu.com/ubuntu noble-security InRelease [126 kB]      [0m
Get:3 http://archive.ubuntu.com/ubuntu noble-updates InRelease [126 kB]m[33m  
Get:4 http://archive.ubuntu.com/ubuntu noble-backports InRelease [126 kB]m
Get:5 http://archive.ubuntu.com/ubuntu noble/restricted amd64 Packages [117 kB]
Get:6 http://archive.ubuntu.com/ubuntu noble/multiverse amd64 Packages [331 kB]
Get:7 http://security.ubuntu.com/ubuntu noble-security/universe amd64 Packages [1194 kB]
Get:8 http://archive.ubuntu.com/ubuntu noble/main amd64 Packages [1808 kB]     [0m[33m
Get:9 http://archive.ubuntu.com/ubuntu noble/universe amd64 Packages [19.3 MB] [0m[33m
Get:10 http://security.ubuntu.com/ubuntu noble-security/multiverse amd64 Packages [33.8 kB]
Get:11 http://security.ubuntu.com/ubuntu noble-security/main amd64 Packages [1776 kB]
Get:12 http://security.ubuntu.com/ubuntu noble-security/restricted amd64 Packages [2919 kB]


In [3]:
import re
from konlpy.tag import Okt
from collections import Counter

In [4]:
# Sample Korean sentence used by the preprocessing and tokenizing cells below.
text = "임금님 귀는 당나귀 귀! 임금님 귀는 당나귀 귀! 실컷~ 소리치고 나니 속이 확 뚫려 살 것 같았어."
text

'임금님 귀는 당나귀 귀! 임금님 귀는 당나귀 귀! 실컷~ 소리치고 나니 속이 확 뚫려 살 것 같았어.'

# Preprocessing

In [5]:
# Negated character class: every character that is NOT a Hangul jamo
# (ㄱ-ㅎ, ㅏ-ㅣ), a Hangul syllable (가-힣) or a space is matched, so
# punctuation, digits and Latin letters get removed.
reg = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]")
# Rebinds `text` to the cleaned string; the original sentence is no longer
# available under this name after the cell has run.
text = reg.sub('', text)
print(text)

임금님 귀는 당나귀 귀 임금님 귀는 당나귀 귀 실컷 소리치고 나니 속이 확 뚫려 살 것 같았어


# Tokenizing

In [6]:
%%time
# Split the cleaned sentence into morphemes with KoNLPy's Okt tagger.
# NOTE(review): most of the measured time is presumably Okt() start-up rather
# than tokenizing this one short sentence — confirm before optimizing.
okt = Okt()
tokens = okt.morphs(text)
print(tokens)

['임금님', '귀', '는', '당나귀', '귀', '임금님', '귀', '는', '당나귀', '귀', '실컷', '소리', '치고', '나니', '속이', '확', '뚫려', '살', '것', '같았어']
CPU times: user 12.8 s, sys: 807 ms, total: 13.6 s
Wall time: 8.58 s


# Vocab

In [7]:
# Count how often each token occurs (token -> frequency).
vocab = Counter(tokens)
print(vocab)

Counter({'귀': 4, '임금님': 2, '는': 2, '당나귀': 2, '실컷': 1, '소리': 1, '치고': 1, '나니': 1, '속이': 1, '확': 1, '뚫려': 1, '살': 1, '것': 1, '같았어': 1})


In [8]:
# Look up the frequency of a single token.
vocab['임금님']

2

In [9]:
# Keep only the `vocab_size` most frequent tokens, as (token, count) pairs
# ordered from most to least frequent.
#
# The top-k list is recomputed from `tokens` instead of calling
# `.most_common` on `vocab` itself: `vocab` is rebound from a Counter to a
# list here, so a self-referential `vocab = vocab.most_common(...)` would raise
# AttributeError the second time the cell is run.
vocab_size = 5
vocab = Counter(tokens).most_common(vocab_size)
print(vocab)

[('귀', 4), ('임금님', 2), ('는', 2), ('당나귀', 2), ('실컷', 1)]


In [10]:
# Assign each (token, count) pair its 1-based rank in `vocab` as the token's index.
word2idx = {
    token: rank
    for rank, (token, _count) in enumerate(vocab, start=1)
}

In [11]:
# Show the token -> index mapping (indices start at 1).
word2idx

{'귀': 1, '임금님': 2, '는': 3, '당나귀': 4, '실컷': 5}

# One-Hot Vector

In [12]:
def one_hot_encoding(word, word2index):
    """Return the one-hot vector for ``word`` as a list of 0/1 integers.

    The vector has ``len(word2index)`` entries. ``word2index`` is expected to
    map every word to a 1-based index in ``1..len(word2index)``; the entry at
    position ``index - 1`` is set to 1 and all others stay 0.

    Args:
        word: The word to encode; must be a key of ``word2index``.
        word2index: Mapping from word to its 1-based integer index.

    Returns:
        A list of length ``len(word2index)`` containing a single 1.

    Raises:
        KeyError: If ``word`` is not a key of ``word2index``.
        IndexError: If the stored index is outside ``1..len(word2index)``.
    """
    size = len(word2index)
    index = word2index[word]
    # Without this check an index of 0 (e.g. a padding entry) or a negative
    # index would wrap around through Python's negative list indexing and
    # silently produce the one-hot vector of a different word.
    if not 1 <= index <= size:
        raise IndexError(
            f"index {index!r} for word {word!r} is outside the valid range 1..{size}"
        )
    one_hot_vector = [0] * size
    one_hot_vector[index - 1] = 1
    return one_hot_vector

In [13]:
# Example: '임금님' has index 2 in word2idx, so position 1 of the vector is set.
one_hot_encoding("임금님", word2idx)

[0, 1, 0, 0, 0]

# One-Hot Encoding with PyTorch

In [14]:
import torch
import torch.nn.functional as F
from collections import Counter

In [15]:
# Toy corpus for the PyTorch example: a list of already-tokenized sentences.
# Note: this rebinds `text`, which held the cleaned Korean sentence string in
# the earlier cells.
text = [['강아지', '고양이', '강아지'],['애교', '고양이'], ['컴퓨터', '노트북']]
text

[['강아지', '고양이', '강아지'], ['애교', '고양이'], ['컴퓨터', '노트북']]

In [16]:
# Frequency of every word across all sentences of the toy corpus.
counter = Counter(word for sentence in text for word in sentence)

# Most frequent word gets index 1, the next one index 2, and so on;
# index 0 is reserved for the padding entry, which is added last.
word_index = {
    token: rank
    for rank, (token, _freq) in enumerate(counter.most_common(), start=1)
}
word_index.update({"<PAD>": 0})

print(word_index)

{'강아지': 1, '고양이': 2, '애교': 3, '컴퓨터': 4, '노트북': 5, '<PAD>': 0}


In [17]:
# Number of entries in word_index, including the "<PAD>" entry.
# Note: this rebinds `vocab_size`, which was set to 5 in the earlier vocabulary cell.
vocab_size = len(word_index)

In [18]:
# Map each token of the sample to its integer index.
# Tokens missing from word_index fall back to 0, which is also the "<PAD>"
# index, so unknown words and padding are indistinguishable after encoding.
sub_text = ['강아지', '고양이', '강아지', '컴퓨터']
encoded = [word_index.get(word, 0) for word in sub_text]
print(encoded)

[1, 2, 1, 4]


In [19]:
# Build an integer (long) tensor of class indices and one-hot encode it.
# num_classes is len(word_index), so every row has one column per vocabulary
# entry; column 0 corresponds to the "<PAD>" index.
encoded_tensor = torch.tensor(encoded, dtype=torch.long)
one_hot = F.one_hot(encoded_tensor, num_classes=len(word_index))
print(one_hot)

tensor([[0, 1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0]])


# English Word2Vec

In [20]:
!pip install nltk==3.6.7
!pip install gensim==4.3.2
!pip install scipy==1.12.0 numpy==1.26.2

Collecting nltk==3.6.7
  Downloading nltk-3.6.7-py3-none-any.whl.metadata (2.8 kB)
Collecting regex>=2021.8.3 (from nltk==3.6.7)
  Downloading regex-2026.1.15-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Downloading nltk-3.6.7-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading regex-2026.1.15-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (803 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.6/803.6 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: regex, nltk
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [nltk][32m1/2[0m [nltk]
[1A[2KSuccessfully installed nltk-3.6.7 regex-2026.1.15
Collecting gensim==4.3.2
  Downloading gensim-4.3.2.tar.gz (23.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [1]:
import nltk
# 'abc'   : the corpus loaded through nltk.corpus.abc in the next cell.
# 'punkt' : sentence tokenizer data — presumably needed by abc.sents() to split
#           the raw text into sentences; confirm.
nltk.download('abc')
nltk.download('punkt')

[nltk_data] Downloading package abc to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/abc.zip.
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
from nltk.corpus import abc
# Sentences of the ABC corpus; each sentence is a list of word tokens
# (see the printed sample in the next cell).
corpus = abc.sents()

In [4]:
# Peek at the first three tokenized sentences.
print(corpus[:3])

[['PM', 'denies', 'knowledge', 'of', 'AWB', 'kickbacks', 'The', 'Prime', 'Minister', 'has', 'denied', 'he', 'knew', 'AWB', 'was', 'paying', 'kickbacks', 'to', 'Iraq', 'despite', 'writing', 'to', 'the', 'wheat', 'exporter', 'asking', 'to', 'be', 'kept', 'fully', 'informed', 'on', 'Iraq', 'wheat', 'sales', '.'], ['Letters', 'from', 'John', 'Howard', 'and', 'Deputy', 'Prime', 'Minister', 'Mark', 'Vaile', 'to', 'AWB', 'have', 'been', 'released', 'by', 'the', 'Cole', 'inquiry', 'into', 'the', 'oil', 'for', 'food', 'program', '.'], ['In', 'one', 'of', 'the', 'letters', 'Mr', 'Howard', 'asks', 'AWB', 'managing', 'director', 'Andrew', 'Lindberg', 'to', 'remain', 'in', 'close', 'contact', 'with', 'the', 'Government', 'on', 'Iraq', 'wheat', 'sales', '.']]


In [5]:
# Number of sentences in the corpus.
print(len(corpus))

29059


In [6]:
%%time
from gensim.models import Word2Vec

# Train word vectors on the ABC sentences.
#   vector_size = 100 : dimensionality of each word vector
#   window = 5        : maximum distance between the current and the predicted word
#   min_count = 5     : ignore words that occur fewer than 5 times in the corpus
#   workers = 4       : number of training threads
#   sg = 0            : CBOW training (sg = 1 would select skip-gram)
model = Word2Vec(sentences = corpus, vector_size = 100, window = 5, min_count = 5, workers = 4, sg = 0)

CPU times: user 22.8 s, sys: 673 ms, total: 23.4 s
Wall time: 16.4 s


In [7]:
# Nearest neighbours of "man" in the trained embedding space, returned as a
# list of (word, similarity) pairs.
model_result = model.wv.most_similar("man")
print(model_result)

[('woman', 0.9234488606452942), ('skull', 0.911193311214447), ('Bang', 0.9056728482246399), ('asteroid', 0.9052595496177673), ('third', 0.9020780324935913), ('baby', 0.8998614549636841), ('dog', 0.8984168171882629), ('bought', 0.8974975943565369), ('rally', 0.8913778066635132), ('dinosaur', 0.8889291286468506)]


In [8]:
from pathlib import Path

from gensim.models import KeyedVectors

# Expand "~" explicitly instead of depending on the I/O layer to do it, and
# create the target directory so the save also works on a fresh environment
# where ~/work/word_embedding does not exist yet.
w2v_path = Path('~/work/word_embedding/w2v').expanduser()
w2v_path.parent.mkdir(parents=True, exist_ok=True)

# Write the vectors in word2vec format, then read them back as plain
# KeyedVectors (vectors only, without the trainable model state).
model.wv.save_word2vec_format(str(w2v_path))
loaded_model = KeyedVectors.load_word2vec_format(str(w2v_path))

In [9]:
# Same query against the vectors reloaded from disk; rebinds `model_result`.
# The recorded output is identical to the in-memory model's result above.
model_result = loaded_model.most_similar("man")
print(model_result)

[('woman', 0.9234488606452942), ('skull', 0.911193311214447), ('Bang', 0.9056728482246399), ('asteroid', 0.9052595496177673), ('third', 0.9020780324935913), ('baby', 0.8998614549636841), ('dog', 0.8984168171882629), ('bought', 0.8974975943565369), ('rally', 0.8913778066635132), ('dinosaur', 0.8889291286468506)]


# Visualizing Embedding Vector

In [10]:
# Convert the saved word2vec file into TSV files for the embedding projector.
# --output is a file-name prefix: the recorded log shows the vectors being
# written to <prefix>_tensor.tsv, followed by a separate metadata file.
!python -m gensim.scripts.word2vec2tensor --input ~/work/word_embedding/w2v --output ~/work/word_embedding/w2v

2026-01-28 03:18:12,576 - word2vec2tensor - INFO - running /opt/conda/lib/python3.12/site-packages/gensim/scripts/word2vec2tensor.py --input /home/jovyan/work/word_embedding/w2v --output /home/jovyan/work/word_embedding/w2v
2026-01-28 03:18:12,576 - keyedvectors - INFO - loading projection weights from /home/jovyan/work/word_embedding/w2v
2026-01-28 03:18:13,641 - utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (10363, 100) matrix of type float32 from /home/jovyan/work/word_embedding/w2v', 'binary': False, 'encoding': 'utf8', 'datetime': '2026-01-28T03:18:13.618904', 'gensim': '4.3.2', 'python': '3.12.11 | packaged by conda-forge | (main, Jun  4 2025, 14:45:31) [GCC 13.3.0]', 'platform': 'Linux-6.6.113+-x86_64-with-glibc2.39', 'event': 'load_word2vec_format'}
2026-01-28 03:18:14,512 - word2vec2tensor - INFO - 2D tensor file saved to /home/jovyan/work/word_embedding/w2v_tensor.tsv
2026-01-28 03:18:14,513 - word2vec2tensor - INFO - Tensor metadata file saved to /home/jovyan/w

In [12]:
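# Visualize the vectors with the TensorFlow Embedding Projector: https://projector.tensorflow.org/
# Click "Load" on that page, then:
#   step 1 -> upload the vectors file  (w2v_tensor.tsv)
#   step 2 -> upload the metadata file (w2v_metadata.tsv)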

# FastText

In [14]:
%%time
from gensim.models import FastText
fasttext_model = FastText(corpus, window = 5, min_count = 5, workers = 4, sg = 1)

CPU times: user 1min 22s, sys: 1.64 s, total: 1min 23s
Wall time: 49.5 s


In [16]:
# 'memoryy' is a misspelling — NOTE(review): presumably deliberate, to show that
# FastText still returns neighbours for it (the top hit in the recorded output
# is 'memory'); confirm the intent.
fasttext_model.wv.most_similar('memoryy')

[('memory', 0.9483407735824585),
 ('mechanisms', 0.8644059300422668),
 ('mechanism', 0.8637510538101196),
 ('musical', 0.8564062714576721),
 ('visual', 0.8555313348770142),
 ('basic', 0.8512289524078369),
 ('intelligence', 0.8472499251365662),
 ('technical', 0.8386043310165405),
 ('mechanical', 0.8360468149185181),
 ('yourself', 0.835903525352478)]

# GloVe

In [17]:
import gensim.downloader as api
# Load pretrained vectors by dataset name through gensim's downloader.
# NOTE(review): presumably downloaded over the network on first use and cached
# locally afterwards — confirm before running offline.
glove_model = api.load("glove-wiki-gigaword-50")
# Nearest neighbours of "dog" as (word, similarity) pairs.
glove_model.most_similar("dog")



[('cat', 0.9218004941940308),
 ('dogs', 0.8513158559799194),
 ('horse', 0.7907583713531494),
 ('puppy', 0.7754920721054077),
 ('pet', 0.7724708318710327),
 ('rabbit', 0.7720814347267151),
 ('pig', 0.7490062117576599),
 ('snake', 0.7399188876152039),
 ('baby', 0.7395570278167725),
 ('bite', 0.7387937307357788)]