In [1]:
pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import nltk

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
import urllib.request
import zipfile
from lxml import etree
import re
from nltk.tokenize import word_tokenize, sent_tokenize

In [5]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/GaoleMeng/RNN-and-FFNN-textClassification/master/ted_en-20160408.xml", filename="ted_en-20160408.xml")
# if you downloaded the file already, do not run this line due to long time.

('ted_en-20160408.xml', <http.client.HTTPMessage at 0x7f961e69f0a0>)

In [6]:
targetXML=open('ted_en-20160408.xml', 'r', encoding='UTF8')
target_text = etree.parse(targetXML)

# select texts between tags <content> and </content> from xml file
parse_text = '\n'.join(target_text.xpath('//content/text()'))

# remove useless expressions such as (Audio), (Laughter) and so forth in the selected text through regular expression (RE)
content_text = re.sub(r'\([^)]*\)', '', parse_text)

# tokenizing by using NLTK module
sent_text = sent_tokenize(content_text) # this command takes long time

# remove punctuation and decapitalizing
normalized_text = []
for string in sent_text:
     tokens = re.sub(r"[^a-z0-9]+", " ", string.lower())
     normalized_text.append(tokens)

result = [word_tokenize(sentence) for sentence in normalized_text]

In [7]:
print('Number of Samples : {}'.format(len(result)))

Number of Samples : 273424


In [8]:
for line in result[:3]:
    print(line)

['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new']
['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']
['both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'a', 'good', 'thing']


In [11]:
from gensim.models import Word2Vec
model = Word2Vec(sentences=result, vector_size=100, window=5, min_count=5, workers=4, sg=0) 
# if this causes an error "__init__() got an unexpected keyword argument 'size'", you need to change 'size' argument name to 'vector_size'

In [12]:
model_result = model.wv.most_similar("man")
print(model_result)

[('woman', 0.8452802300453186), ('guy', 0.8233200907707214), ('lady', 0.7785226106643677), ('girl', 0.7495858669281006), ('boy', 0.747359573841095), ('soldier', 0.7439311146736145), ('gentleman', 0.7391790747642517), ('kid', 0.6955093741416931), ('poet', 0.6929811835289001), ('son', 0.6611019372940063)]


In [13]:
from gensim.models import KeyedVectors

In [14]:
model.wv.save_word2vec_format('eng_w2v') # Saving the model

In [15]:
loaded_model = KeyedVectors.load_word2vec_format("eng_w2v") # 모델 로드

In [16]:
print(loaded_model.most_similar("man"))

[('woman', 0.8452802300453186), ('guy', 0.8233200907707214), ('lady', 0.7785226106643677), ('girl', 0.7495858669281006), ('boy', 0.747359573841095), ('soldier', 0.7439311146736145), ('gentleman', 0.7391790747642517), ('kid', 0.6955093741416931), ('poet', 0.6929811835289001), ('son', 0.6611019372940063)]


In [17]:
print(loaded_model.most_similar("love"))

[('hate', 0.631479799747467), ('loved', 0.5270137190818787), ('feel', 0.521802544593811), ('fear', 0.5132566094398499), ('shame', 0.4980979263782501), ('hope', 0.49258098006248474), ('hear', 0.48861509561538696), ('wish', 0.4821504056453705), ('god', 0.48164108395576477), ('joy', 0.4751511216163635)]


In [18]:
print(loaded_model.most_similar("house"))

[('seat', 0.7946961522102356), ('office', 0.7895348072052002), ('kitchen', 0.7834643125534058), ('door', 0.7716530561447144), ('apartment', 0.7694458365440369), ('chair', 0.7647400498390198), ('bedroom', 0.75508052110672), ('bed', 0.7538855671882629), ('town', 0.7537121176719666), ('hotel', 0.742728054523468)]


In [19]:
loaded_model.get_vector('love')

array([-0.2513371 , -1.1852258 , -2.037823  , -0.92945373, -1.119887  ,
        0.48500776, -0.19078965,  2.0884733 , -1.8520054 , -1.169384  ,
       -1.9707383 ,  1.702468  , -1.4082191 ,  0.76982796,  0.7572868 ,
        0.79142755, -2.034489  , -1.0864294 , -0.6660541 ,  1.3230999 ,
        1.6277913 ,  0.48980063, -1.1016436 ,  0.02638829, -1.3782709 ,
       -0.09343387, -0.48256168, -0.05681744,  1.325256  ,  0.003829  ,
        0.77014405, -0.52000445, -0.5477276 ,  1.1303731 ,  0.02138388,
       -0.63023835,  1.9960264 ,  0.5882497 , -0.84429616, -0.6025183 ,
       -0.45842642, -0.8759076 ,  0.01991908, -2.3844578 , -1.5405272 ,
        0.05063646, -0.05166181,  0.12212467,  1.0469973 , -0.7686074 ,
        0.29141042, -0.05033092,  1.102469  , -0.03409848,  0.3013139 ,
        0.16548228, -0.8330867 ,  0.8807358 , -0.8172572 , -0.93606377,
       -1.2981287 ,  0.97107536, -0.2252049 ,  0.28489292, -0.09959523,
        0.03315372,  0.5247156 , -1.783124  , -1.1618636 ,  1.60

In [20]:
model_result = loaded_model.most_similar("electrofishing")
print(model_result)

KeyError: ignored