# FastText

### gensim FastText

In [1]:
from gensim.models import FastText
from lxml import etree
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import pandas as pd

In [2]:
# Parse 'ted_en.xml' and turn its text into lists of lower-cased tokens,
# one list per sentence, with English stopwords removed.
# The context manager closes the file handle as soon as parsing is done
# instead of leaving it open for the lifetime of the kernel.
with open('ted_en.xml', 'r', encoding='UTF-8') as f:
    xml = etree.parse(f)

# Text nodes of every <content> element in the document.
contents = xml.xpath('//content/text()')

corpus = '\n'.join(contents)

# Remove every parenthesised span, parentheses included.
corpus = re.sub(r'\([^)]*\)', '', corpus)

sentences = sent_tokenize(corpus)

en_stopwords = stopwords.words('english')
# Set copy for constant-time membership tests inside the loop; the list
# stays available under its original name.
en_stopword_set = set(en_stopwords)

preprocessed_sentences = []
for sentence in sentences:
    # Lower-case first, then replace anything that is not a-z / 0-9 with a
    # space so that only alphanumeric tokens survive tokenisation.
    sentence = re.sub(r'[^a-z0-9]', ' ', sentence.lower())
    tokens = [
        token
        for token in word_tokenize(sentence)
        if token not in en_stopword_set
    ]
    preprocessed_sentences.append(tokens)

# Preview the first five tokenised sentences.
preprocessed_sentences[:5]

FileNotFoundError: [Errno 2] No such file or directory: 'ted_en.xml'

In [None]:
from gensim.models import Word2Vec

# Word2Vec hyperparameters gathered in one place so they are easy to tune.
w2v_params = dict(
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=5,      # words rarer than this are left out of the vocabulary
    sg=0,             # 0 selects CBOW; 1 would select skip-gram
)

# Supplying `sentences` to the constructor trains the model immediately.
w2v_model = Word2Vec(sentences=preprocessed_sentences, **w2v_params)

In [None]:
# Shape of the learned embedding matrix: (number of vocabulary words, vector_size).
w2v_model.wv.vectors.shape

(21462, 100)

In [None]:
# Nearest neighbours of a word that is in the Word2Vec vocabulary.
w2v_model.wv.most_similar('father')
# Left commented out on purpose: querying a word the model never saw fails.
# w2v_model.wv.most_similar('luckyfather')
# KeyError: "Key 'luckyfather' not present in vocabulary" == the out-of-vocabulary (OOV) problem

[('mother', 0.9286293983459473),
 ('son', 0.924569845199585),
 ('daughter', 0.9182101488113403),
 ('husband', 0.9080780148506165),
 ('grandmother', 0.8875057697296143),
 ('sister', 0.8855710625648499),
 ('uncle', 0.868552029132843),
 ('brother', 0.8556874394416809),
 ('wife', 0.8514041900634766),
 ('grandfather', 0.8434740304946899)]

In [None]:
# Train a gensim FastText model with the same settings as the Word2Vec model
# so the two can be compared directly.
from gensim.models import FastText

fasttext_params = dict(
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=5,      # words rarer than this are left out of the vocabulary
    sg=0,             # 0 selects CBOW; 1 would select skip-gram
)

# Supplying `sentences` to the constructor trains the model immediately.
fasttext_model = FastText(sentences=preprocessed_sentences, **fasttext_params)

In [None]:
# Shape of the FastText word-vector matrix: (number of vocabulary words, vector_size).
fasttext_model.wv.vectors.shape

(21462, 100)

In [None]:
# Nearest neighbours of an in-vocabulary word according to FastText.
# Expect words that share character n-grams with the query (e.g. 'bother', 'luther')
# to rank highly alongside semantically related ones.
fasttext_model.wv.most_similar('father')

[('godfather', 0.9512758851051331),
 ('mother', 0.9480866193771362),
 ('grandfather', 0.9404357075691223),
 ('brother', 0.9326821565628052),
 ('grandmother', 0.9269680976867676),
 ('luther', 0.9109013080596924),
 ('bother', 0.8854381442070007),
 ('brotherhood', 0.8767275810241699),
 ('motherly', 0.8684408068656921),
 ('motherhood', 0.8657189607620239)]

In [None]:
# Same query that raised KeyError with Word2Vec: FastText still answers for
# this out-of-vocabulary word.
fasttext_model.wv.most_similar('luckyfather')

[('father', 0.9735187888145447),
 ('godfather', 0.9538674354553223),
 ('mother', 0.9263980388641357),
 ('grandfather', 0.9261943101882935),
 ('luther', 0.9215258359909058),
 ('brother', 0.9141948819160461),
 ('grandmother', 0.9050705432891846),
 ('bother', 0.896907389163971),
 ('brotherhood', 0.8736904263496399),
 ('motherhood', 0.8616334199905396)]

In [None]:
# FastText returns a vector even for an OOV word by looking it up through its subwords.
# Print the OOV word's vector first, then the in-vocabulary word's vector.
for word in ('luckyfather', 'father'):
    print(fasttext_model.wv[word])

[-0.23447931 -0.36477354 -1.0040406   0.62194085  0.34875965  1.0513027
 -0.70366913 -0.21046147 -0.3624668  -0.17548466 -0.06402272 -0.05418586
 -0.38017932  0.25167167 -0.02686726 -0.37934443  0.4256372   0.2655972
  0.09738821  0.39603195 -0.33067214 -0.17569494  0.3323578  -0.38105196
 -0.2876757   0.22417073 -0.31766826  0.10996711 -0.19049256 -0.51152176
 -0.3658422  -0.7764174   0.6516797   0.00164331  0.22516225  0.66507936
 -0.19990721  0.48614463  0.21054737  0.35412112 -0.1487462   0.16282551
 -0.18668695 -0.10410398 -0.07284244  0.6115415   0.25510898 -0.6156224
  0.70832646  0.49720758  0.1748191   0.22940274  0.45895284 -0.5175231
  0.4854679   0.43078333 -0.4334741   0.711079   -0.37983966  0.08228604
  0.01552594  0.06502179 -0.67521346  0.36575395 -0.38858664  0.10255115
 -0.31841937 -0.2037342   0.48403463  0.42028475  0.39108366 -0.19155228
 -0.23603435 -0.17268986  0.24313249  0.08345649  0.54350686 -0.28051534
 -0.80115265  0.15153067 -0.1035024   0.11208391 -0.516

### fasttext 패키지 활용

In [None]:
!pip install fasttext-wheel

Collecting fasttext-wheel
  Downloading fasttext_wheel-0.9.2-cp312-cp312-win_amd64.whl.metadata (16 kB)
Collecting pybind11>=2.2 (from fasttext-wheel)
  Downloading pybind11-3.0.1-py3-none-any.whl.metadata (10.0 kB)
Downloading fasttext_wheel-0.9.2-cp312-cp312-win_amd64.whl (234 kB)
Downloading pybind11-3.0.1-py3-none-any.whl (293 kB)
Installing collected packages: pybind11, fasttext-wheel

   ---------------------------------------- 2/2 [fasttext-wheel]

Successfully installed fasttext-wheel-0.9.2 pybind11-3.0.1


In [None]:
import fasttext

# Options for unsupervised fastText training, gathered so they are easy to tune.
ft_options = {
    'model': 'skipgram',
    'minCount': 1,  # minimum word count
    'dim': 100,     # embedding dimension
    'minn': 3,      # minimum subword n-gram length
    'maxn': 5,      # maximum subword n-gram length
}

# The first positional argument is the path of the training text file.
model = fasttext.train_unsupervised('naver_movie_ratings.txt', **ft_options)

In [None]:
# Look up the vector the fastText model assigns to the word '극장' ("theater").
model.get_word_vector('극장')

array([ 0.49322352, -0.37126443, -0.48847747,  0.8331983 , -0.31740275,
        0.07893787, -0.15449272,  0.01560479,  0.14578608,  0.61734337,
        0.06154922,  0.4090892 ,  0.4815861 , -0.31416428, -0.7134507 ,
       -1.0247767 , -0.00815904, -1.4249343 , -0.480567  , -0.18979743,
        0.49130914,  0.34820208,  0.28475577, -0.06420416,  0.5830802 ,
       -0.86361116, -0.18150797,  0.4902143 , -0.878152  ,  0.35613424,
       -0.20119587,  0.45832473, -0.01014074,  0.30513126, -0.58922243,
        0.23991   ,  0.05718429,  0.36492056, -0.37928203, -0.2873546 ,
        0.04461949,  0.29814065,  0.33704758, -0.27334696,  1.0886282 ,
       -0.15446392,  0.2065637 , -0.49115038, -0.11837739,  0.27551895,
        0.27340886, -0.82777816, -0.13717332,  0.4777568 , -0.7063349 ,
        0.11764389,  0.7402754 ,  0.27876064,  1.0882809 , -0.1219985 ,
        0.44595551,  0.02715196,  0.20694724,  0.47467077, -0.01156405,
       -0.04190172, -0.06251285,  0.00541051,  0.3882863 , -0.00

In [None]:
# Show the subwords of '영화관' ("movie theater") together with their indices;
# '<' and '>' in the n-grams mark the beginning and end of the word.
model.get_subwords('영화관')

(['영화관', '<영화', '<영화관', '<영화관>', '영화관', '영화관>', '화관>'],
 array([   2062, 1921845, 1442415, 1378913, 2245977, 1515139, 1352938]))