<a href="https://colab.research.google.com/github/IlyaGalyukshev/colab/blob/main/NLP3_Gensim_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.models.word2vec import Word2Vec


import pandas as pd
import nltk
import re
import gensim
import os
from bs4 import BeautifulSoup

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**Bag of Words (BoW)**

In [11]:
documents = [
    'she was amazed by the large chunks of ice washing up on the beach',
    'when nobody is around, the trees gossip about the people who have walked under them',
    'the lake is a long way from here',
]

# Plain whitespace tokenisation: punctuation stays glued to the token
# (the resulting vocabulary contains 'around,' with its comma).
texts = [doc.split() for doc in documents]
dictionary = corpora.Dictionary(texts)

print(dictionary)
print(dictionary.token2id)

Dictionary<32 unique tokens: ['amazed', 'beach', 'by', 'chunks', 'ice']...>
{'amazed': 0, 'beach': 1, 'by': 2, 'chunks': 3, 'ice': 4, 'large': 5, 'of': 6, 'on': 7, 'she': 8, 'the': 9, 'up': 10, 'was': 11, 'washing': 12, 'about': 13, 'around,': 14, 'gossip': 15, 'have': 16, 'is': 17, 'nobody': 18, 'people': 19, 'them': 20, 'trees': 21, 'under': 22, 'walked': 23, 'when': 24, 'who': 25, 'a': 26, 'from': 27, 'here': 28, 'lake': 29, 'long': 30, 'way': 31}


In [14]:
# Same documents, tokenised with gensim's simple_preprocess instead of split():
# the vocabulary now holds 'around' without the comma and no one-letter 'a'.
dictionary = corpora.Dictionary(
    [simple_preprocess(doc, deacc=True) for doc in documents]
)

print(dictionary)
print(dictionary.token2id)

Dictionary<31 unique tokens: ['amazed', 'beach', 'by', 'chunks', 'ice']...>
{'amazed': 0, 'beach': 1, 'by': 2, 'chunks': 3, 'ice': 4, 'large': 5, 'of': 6, 'on': 7, 'she': 8, 'the': 9, 'up': 10, 'was': 11, 'washing': 12, 'about': 13, 'around': 14, 'gossip': 15, 'have': 16, 'is': 17, 'nobody': 18, 'people': 19, 'them': 20, 'trees': 21, 'under': 22, 'walked': 23, 'when': 24, 'who': 25, 'from': 26, 'here': 27, 'lake': 28, 'long': 29, 'way': 30}


In [17]:
!wget https://raw.githubusercontent.com/ancatmara/data-science-nlp/master/data/w2v/train/unlabeledTrainData.tsv

--2024-03-02 10:48:42--  https://raw.githubusercontent.com/ancatmara/data-science-nlp/master/data/w2v/train/unlabeledTrainData.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 67281491 (64M) [text/plain]
Saving to: ‘unlabeledTrainData.tsv’


2024-03-02 10:48:45 (286 MB/s) - ‘unlabeledTrainData.tsv’ saved [67281491/67281491]



In [18]:
# Read the TSV from the working directory, which is where the `wget` cell
# saved it, rather than from a hard-coded '/content/...' path that only
# exists on Colab.
# quoting=3 is csv.QUOTE_NONE: quote characters in the reviews are kept as
# ordinary text instead of being interpreted as field delimiters.
data = pd.read_csv('unlabeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
data

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."
...,...,...
49995,"""18984_0""","""The original Man Eater by Joe D'Amato is some..."
49996,"""16433_0""","""When Home Box Office was in it's early days m..."
49997,"""16006_0""","""Griffin Dunne was born into a cultural family..."
49998,"""40155_0""","""Not a bad story, but the low budget rears its..."


In [22]:
# Load the English Punkt sentence tokenizer from the 'punkt' package
# downloaded in the first cell; it is used to split reviews into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [29]:
def review_to_wordlist(review, remove_stopwords=False):
  """Convert a raw review (or a single sentence) into a list of lowercase words.

  URLs are blanked out, HTML markup is stripped with BeautifulSoup, every
  character that is not an ASCII letter is replaced by a space, and the
  remaining text is lower-cased and split on whitespace.

  Args:
    review: Raw text, possibly containing HTML tags and URLs.
    remove_stopwords: When True, drop NLTK's English stop words.

  Returns:
    The cleaned tokens, in their original order.
  """
  # NOTE(review): `[$-_@.&+]` is a character *range* from '$' to '_', so it
  # also matches characters such as '<' and '>' -- confirm this is intended.
  review = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", " ", review)
  review_text = BeautifulSoup(review, 'lxml').get_text()
  review_text = re.sub('[^a-zA-Z]', ' ', review_text)
  words = review_text.lower().split()
  if remove_stopwords:
    # Imported locally because the notebook's import cell never brings
    # `stopwords` into scope; this branch used to fail with NameError.
    from nltk.corpus import stopwords
    try:
      stops = set(stopwords.words('english'))
    except LookupError:
      # The stop-word corpus has not been downloaded yet: fetch it once.
      nltk.download('stopwords', quiet=True)
      stops = set(stopwords.words('english'))
    # A set gives constant-time membership tests instead of scanning a list.
    words = [w for w in words if w not in stops]
  return words

In [26]:
def review_to_sentences(review, tokenizer, remove_stopwords=False):
  """Split a review into sentences and turn each sentence into a word list.

  Args:
    review: Raw review text; surrounding whitespace is stripped first.
    tokenizer: Object whose `tokenize(text)` method returns the sentences.
    remove_stopwords: Forwarded unchanged to `review_to_wordlist`.

  Returns:
    A list with one word list per non-empty sentence.
  """
  return [
    review_to_wordlist(chunk, remove_stopwords)
    for chunk in tokenizer.tokenize(review.strip())
    if len(chunk) > 0
  ]

In [30]:
from tqdm import tqdm

# Flatten the whole review column into a single list of tokenised sentences.
sentences = []
for review in tqdm(data['review']):
    sentences.extend(review_to_sentences(review, tokenizer))

  review_text = BeautifulSoup(review, 'lxml').get_text()
100%|██████████| 50000/50000 [02:56<00:00, 283.40it/s]


In [31]:
# Sanity check: total number of sentences, then the first tokenised sentence.
print(len(sentences), sentences[0], sep='\n')

529416
['watching', 'time', 'chasers', 'it', 'obvious', 'that', 'it', 'was', 'made', 'by', 'a', 'bunch', 'of', 'friends']


In [32]:
documents = [
    'Siri became confused when we reused to follow her directions.',
    'The Guinea fowl flies through the air with all the grace of a turtle.',
    'His get rich quick scheme was to grow a cactus farm.',
]

tokenized_list = list(map(simple_preprocess, documents))

# Start from an empty dictionary; allow_update=True lets doc2bow register
# previously unseen tokens while it converts each document.
dictionary = corpora.Dictionary()
bow_corpus = [
    dictionary.doc2bow(tokens, allow_update=True) for tokens in tokenized_list
]

print(bow_corpus)
print(dictionary.token2id)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)], [(10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 3), (18, 1), (19, 1), (20, 1)], [(7, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1)]]
{'became': 0, 'confused': 1, 'directions': 2, 'follow': 3, 'her': 4, 'reused': 5, 'siri': 6, 'to': 7, 'we': 8, 'when': 9, 'air': 10, 'all': 11, 'flies': 12, 'fowl': 13, 'grace': 14, 'guinea': 15, 'of': 16, 'the': 17, 'through': 18, 'turtle': 19, 'with': 20, 'cactus': 21, 'farm': 22, 'get': 23, 'grow': 24, 'his': 25, 'quick': 26, 'rich': 27, 'scheme': 28, 'was': 29}


**Word2Vec**

In [38]:
# Catalogue of everything the gensim downloader offers; its top-level keys
# are the resource categories.
api_info = api.info()

print(*api_info.keys())

corpora models


In [41]:
# Use a distinct name: rebinding `corpora` would shadow the `gensim.corpora`
# module imported at the top of the notebook and break every
# `corpora.Dictionary(...)` cell on a re-run.
corpora_info = api_info['corpora']
pd.DataFrame(corpora_info)

Unnamed: 0,semeval-2016-2017-task3-subtaskBC,semeval-2016-2017-task3-subtaskA-unannotated,patent-2017,quora-duplicate-questions,wiki-english-20171001,text8,fake-news,20-newsgroups,__testing_matrix-synopsis,__testing_multipart-matrix-synopsis
num_records,-1,189941,353197,404290,4924894,1701,12999,18846,,
record_format,dict,dict,dict,dict,dict,list of str (tokens),dict,dict,,
file_size,6344358,234373151,3087262469,21684784,6516051717,33182058,20102776,14483581,,
reader_code,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,,
license,All files released for the task are free for g...,These datasets are free for general research use.,not found,probably https://www.quora.com/about/tos,https://dumps.wikimedia.org/legal.html,not found,https://creativecommons.org/publicdomain/zero/...,not found,,
fields,"{'2016-train': ['...'], '2016-dev': ['...'], '...","{'THREAD_SEQUENCE': '', 'RelQuestion': {'RELQ_...",,{'question1': 'the full text of each question'...,"{'section_texts': 'list of body of sections', ...",,"{'crawled': 'date the story was archived', 'or...",{'topic': 'name of topic (20 variant of possib...,,
description,SemEval 2016 / 2017 Task 3 Subtask B and C dat...,SemEval 2016 / 2017 Task 3 Subtask A unannotat...,Patent Grant Full Text. Contains the full text...,"Over 400,000 lines of potential question dupli...",Extracted Wikipedia dump from October 2017. Pr...,"First 100,000,000 bytes of plain text from Wik...","News dataset, contains text and metadata from ...","The notorious collection of approximately 20,0...",[THIS IS ONLY FOR TESTING] Synopsis of the mov...,[THIS IS ONLY FOR TESTING] Synopsis of the mov...
checksum,701ea67acd82e75f95e1d8e62fb0ad29,2de0e2f2c4f91c66ae4fcf58d50ba816,,d7cfa7fbc6e2ec71ab74c495586c6365,,68799af40b6bda07dfa47a32612e5364,5e64e942df13219465927f92dcefd5fe,c92fd4f6640a86d5ba89eaad818a9891,1767ac93a089b43899d54944b07d9dc5,
file_name,semeval-2016-2017-task3-subtaskBC.gz,semeval-2016-2017-task3-subtaskA-unannotated.gz,patent-2017.gz,quora-duplicate-questions.gz,wiki-english-20171001.gz,text8.gz,fake-news.gz,20-newsgroups.gz,__testing_matrix-synopsis.gz,__testing_multipart-matrix-synopsis.gz
read_more,"[http://alt.qcri.org/semeval2017/task3/, http:...","[http://alt.qcri.org/semeval2016/task3/, http:...",[http://patents.reedtech.com/pgrbft.php],[https://data.quora.com/First-Quora-Dataset-Re...,[https://dumps.wikimedia.org/enwiki/20171001/],[http://mattmahoney.net/dc/textdata.html],[https://www.kaggle.com/mrisdal/fake-news],[http://qwone.com/~jason/20Newsgroups/],[http://www.imdb.com/title/tt0133093/plotsumma...,[http://www.imdb.com/title/tt0133093/plotsumma...


In [54]:
# One column per downloadable model, one row per metadata field.
models = pd.DataFrame(api_info['models'])
# Express the 'file_size' row in whole mebibytes instead of bytes.
models.loc['file_size'] = models.loc['file_size'] // (1 << 20)
models

Unnamed: 0,fasttext-wiki-news-subwords-300,conceptnet-numberbatch-17-06-300,word2vec-ruscorpora-300,word2vec-google-news-300,glove-wiki-gigaword-50,glove-wiki-gigaword-100,glove-wiki-gigaword-200,glove-wiki-gigaword-300,glove-twitter-25,glove-twitter-50,glove-twitter-100,glove-twitter-200,__testing_word2vec-matrix-synopsis
num_records,999999,1917247,184973,3000000,400000,400000,400000,400000,1193514,1193514,1193514,1193514,
file_size,958,1168,198,1662,65,128,252,376,104,199,387,758,
base_dataset,"Wikipedia 2017, UMBC webbase corpus and statmt...","ConceptNet, word2vec, GloVe, and OpenSubtitles...",Russian National Corpus (about 250M words),Google News (about 100 billion words),"Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)","Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)","Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)","Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)","Twitter (2B tweets, 27B tokens, 1.2M vocab, un...","Twitter (2B tweets, 27B tokens, 1.2M vocab, un...","Twitter (2B tweets, 27B tokens, 1.2M vocab, un...","Twitter (2B tweets, 27B tokens, 1.2M vocab, un...",
reader_code,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,
license,https://creativecommons.org/licenses/by-sa/3.0/,https://github.com/commonsense/conceptnet-numb...,https://creativecommons.org/licenses/by/4.0/de...,not found,http://opendatacommons.org/licenses/pddl/,http://opendatacommons.org/licenses/pddl/,http://opendatacommons.org/licenses/pddl/,http://opendatacommons.org/licenses/pddl/,http://opendatacommons.org/licenses/pddl/,http://opendatacommons.org/licenses/pddl/,http://opendatacommons.org/licenses/pddl/,http://opendatacommons.org/licenses/pddl/,
parameters,{'dimension': 300},{'dimension': 300},"{'dimension': 300, 'window_size': 10}",{'dimension': 300},{'dimension': 50},{'dimension': 100},{'dimension': 200},{'dimension': 300},{'dimension': 25},{'dimension': 50},{'dimension': 100},{'dimension': 200},{'dimensions': 50}
description,1 million word vectors trained on Wikipedia 20...,ConceptNet Numberbatch consists of state-of-th...,Word2vec Continuous Skipgram vectors trained o...,Pre-trained vectors trained on a part of the G...,Pre-trained vectors based on Wikipedia 2014 + ...,Pre-trained vectors based on Wikipedia 2014 + ...,Pre-trained vectors based on Wikipedia 2014 + ...,Pre-trained vectors based on Wikipedia 2014 + ...,"Pre-trained vectors based on 2B tweets, 27B to...","Pre-trained vectors based on 2B tweets, 27B to...","Pre-trained vectors based on 2B tweets, 27B t...","Pre-trained vectors based on 2B tweets, 27B to...",[THIS IS ONLY FOR TESTING] Word vecrors of the...
read_more,[https://fasttext.cc/docs/en/english-vectors.h...,[http://aaai.org/ocs/index.php/AAAI/AAAI17/pap...,[https://www.academia.edu/24306935/WebVectors_...,"[https://code.google.com/archive/p/word2vec/, ...","[https://nlp.stanford.edu/projects/glove/, htt...","[https://nlp.stanford.edu/projects/glove/, htt...","[https://nlp.stanford.edu/projects/glove/, htt...","[https://nlp.stanford.edu/projects/glove/, htt...","[https://nlp.stanford.edu/projects/glove/, htt...","[https://nlp.stanford.edu/projects/glove/, htt...","[https://nlp.stanford.edu/projects/glove/, htt...","[https://nlp.stanford.edu/projects/glove/, htt...",[]
checksum,de2bb3a20c46ce65c9c131e1ad9a77af,fd642d457adcd0ea94da0cd21b150847,9bdebdc8ae6d17d20839dd9b5af10bc4,a5e5354d40acb95f9ec66d5977d140ef,c289bc5d7f2f02c6dc9f2f9b67641813,40ec481866001177b8cd4cb0df92924f,59652db361b7a87ee73834a6c391dfc1,29e9329ac2241937d55b852e8284e89b,50db0211d7e7a2dcd362c6b774762793,c168f18641f8c8a00fe30984c4799b2b,b04f7bed38756d64cf55b58ce7e97b15,e52e8392d1860b95d5308a525817d8f9,534dcb8b56a360977a269b7bfc62d124
file_name,fasttext-wiki-news-subwords-300.gz,conceptnet-numberbatch-17-06-300.gz,word2vec-ruscorpora-300.gz,word2vec-google-news-300.gz,glove-wiki-gigaword-50.gz,glove-wiki-gigaword-100.gz,glove-wiki-gigaword-200.gz,glove-wiki-gigaword-300.gz,glove-twitter-25.gz,glove-twitter-50.gz,glove-twitter-100.gz,glove-twitter-200.gz,__testing_word2vec-matrix-synopsis.gz


In [49]:
# Full metadata record for one pre-trained model, looked up by name.
api.info('word2vec-ruscorpora-300')

{'num_records': 184973,
 'file_size': 208427381,
 'base_dataset': 'Russian National Corpus (about 250M words)',
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/word2vec-ruscorpora-300/__init__.py',
 'license': 'https://creativecommons.org/licenses/by/4.0/deed.en',
 'parameters': {'dimension': 300, 'window_size': 10},
 'description': 'Word2vec Continuous Skipgram vectors trained on full Russian National Corpus (about 250M words). The model contains 185K words.',
 'preprocessing': 'The corpus was lemmatized and tagged with Universal PoS',
 'read_more': ['https://www.academia.edu/24306935/WebVectors_a_Toolkit_for_Building_Web_Interfaces_for_Vector_Semantic_Models',
  'http://rusvectores.org/en/',
  'https://github.com/RaRe-Technologies/gensim-data/issues/3'],
 'checksum': '9bdebdc8ae6d17d20839dd9b5af10bc4',
 'file_name': 'word2vec-ruscorpora-300.gz',
 'parts': 1}

In [47]:
# Metadata of the text8 corpus, rendered as a one-row table.
pd.DataFrame(api.info('text8'))

Unnamed: 0,num_records,record_format,file_size,reader_code,license,description,checksum,file_name,read_more,parts
0,1701,list of str (tokens),33182058,https://github.com/RaRe-Technologies/gensim-da...,not found,"First 100,000,000 bytes of plain text from Wik...",68799af40b6bda07dfa47a32612e5364,text8.gz,http://mattmahoney.net/dc/textdata.html,1


In [50]:
dataset = api.load('text8')

# Materialise the corpus as a list of token lists and peek at the first record.
# Note: this rebinds `data`, which previously held the reviews DataFrame.
data = list(dataset)
print(data[0])

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative', 'way', 'to', 'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as', 'a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived', 'from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king', 'anarchism', 'as', 'a', 'political', 'philosophy', 'is', 'the', 'belief', 'that', 'rulers', 'are', 'unnecessary', 'and', 'should', 'be', 'abolished', 'although', 'there', 'are', 'differing', 'interpretations', 'of', 'what', 'this', 'means', 'anarchism', 'also', 'refers', 'to', 'related', 'so

In [52]:
# Tabular view of the corpus: one row per text8 record, one column per token position.
pd.DataFrame(data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,anarchism,originated,as,a,term,of,abuse,first,used,against,...,pointing,out,objects,of,interest,lack,of,social,or,emotional
1,reciprocity,qualitative,impairments,in,communication,as,manifested,by,at,least,...,armor,for,him,from,hephaestus,the,goddess,athena,provides,him
2,with,the,aegis,of,zeus,when,he,goes,to,the,...,from,the,balcony,to,the,stage,below,breaking,his,leg
3,despite,his,injury,booth,managed,to,limp,to,his,horse,...,short,film,best,sound,mixing,one,nine,three,zero,to
4,present,best,sound,editing,one,nine,six,three,to,present,...,london,independent,the,sociology,of,the,ayn,rand,cult,by
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1696,the,format,although,there,are,several,third,party,tools,which,...,aggression,pact,was,signed,with,provisions,that,included,consultation,arbitration
1697,if,either,party,disagreed,neutrality,if,either,went,to,war,...,willie,mccovey,satchel,paige,and,ozzie,smith,new,york,city
1698,and,chicago,are,first,and,second,respectively,the,pop,band,...,two,eight,three,two,three,six,four,zero,four,four
1699,four,five,four,eight,four,nine,five,zero,five,two,...,medal,history,historical,myths,history,of,poland,here,you,can


In [55]:
# Load the pre-trained 50-dimensional GloVe vectors (Wikipedia 2014 + Gigaword 5).
# Despite the variable name, these are GloVe vectors, not a Word2Vec model.
w2v_model = api.load('glove-wiki-gigaword-50')



[('dog', 0.9218006134033203),
 ('rabbit', 0.8487821221351624),
 ('monkey', 0.8041081428527832),
 ('rat', 0.7891963124275208),
 ('cats', 0.7865270972251892),
 ('snake', 0.7798910737037659),
 ('dogs', 0.7795814871788025),
 ('pet', 0.7792249917984009),
 ('mouse', 0.773166835308075),
 ('bite', 0.7728800177574158)]

In [64]:
# Nearest neighbours of an uncommon token; the neighbours returned for it
# appear largely unrelated.
w2v_model.most_similar('stas')

[('misezhnikov', 0.8102508783340454),
 ('vasanti', 0.79458087682724),
 ('zuquilanda', 0.7905856370925903),
 ('zoheir', 0.7848931550979614),
 ('rabanus', 0.7687687873840332),
 ('salkey', 0.7685246467590332),
 ('therence', 0.7507745623588562),
 ('shaoguang', 0.7465018033981323),
 ('oby', 0.740770697593689),
 ('kaliopate', 0.7386929392814636)]

In [67]:
# Reload text8 so this cell does not depend on `data` left over from earlier cells.
dataset = api.load('text8')
data = list(dataset)

# Train a Word2Vec model on the corpus using gensim's default hyper-parameters.
model = Word2Vec(data)

In [None]:
# model = Word2Vec()
# model.build_vocab(data)
# model.train(data, total_examples=model.corpus_count, epochs=5)

In [69]:
# Round-trip the trained model through disk: save it, then load it back
# from the same path.
model.save('w2v_newmodel')
model = Word2Vec.load('w2v_newmodel')

In [71]:
# Continue training the reloaded model on one extra two-word sentence for a single epoch.
model.train([['hello', 'world']], total_examples=1, epochs=1)



(2, 2)

In [72]:
# Nearest neighbours of 'dog' according to the model trained on text8.
model.wv.most_similar('dog')

[('cat', 0.8520538806915283),
 ('pig', 0.7632667422294617),
 ('hound', 0.7564901113510132),
 ('hamster', 0.7503186464309692),
 ('goat', 0.7484349012374878),
 ('bee', 0.7371864914894104),
 ('pie', 0.735870897769928),
 ('cow', 0.7335312962532043),
 ('dogs', 0.7333595156669617),
 ('shit', 0.731782078742981)]

In [79]:
# Analogy query: france - paris + moscow (which country relates to Moscow as France does to Paris?).
model.wv.most_similar(positive=['france', 'moscow'], negative=['paris'])

[('russia', 0.7841225266456604),
 ('yugoslavia', 0.7462077736854553),
 ('ussr', 0.738148033618927),
 ('bulgaria', 0.7331429719924927),
 ('libya', 0.732929527759552),
 ('finland', 0.7251707315444946),
 ('lithuania', 0.7179101705551147),
 ('afghanistan', 0.7103287577629089),
 ('belarus', 0.6983333230018616),
 ('chechnya', 0.6948888897895813)]

In [80]:
# Similarity score between two antonyms; note that the value is high.
model.wv.similarity('good', 'bad')

0.75019616

In [83]:
# Show the model catalogue again before choosing another pre-trained model.
models

Unnamed: 0,fasttext-wiki-news-subwords-300,conceptnet-numberbatch-17-06-300,word2vec-ruscorpora-300,word2vec-google-news-300,glove-wiki-gigaword-50,glove-wiki-gigaword-100,glove-wiki-gigaword-200,glove-wiki-gigaword-300,glove-twitter-25,glove-twitter-50,glove-twitter-100,glove-twitter-200,__testing_word2vec-matrix-synopsis
num_records,999999,1917247,184973,3000000,400000,400000,400000,400000,1193514,1193514,1193514,1193514,
file_size,958,1168,198,1662,65,128,252,376,104,199,387,758,
base_dataset,"Wikipedia 2017, UMBC webbase corpus and statmt...","ConceptNet, word2vec, GloVe, and OpenSubtitles...",Russian National Corpus (about 250M words),Google News (about 100 billion words),"Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)","Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)","Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)","Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)","Twitter (2B tweets, 27B tokens, 1.2M vocab, un...","Twitter (2B tweets, 27B tokens, 1.2M vocab, un...","Twitter (2B tweets, 27B tokens, 1.2M vocab, un...","Twitter (2B tweets, 27B tokens, 1.2M vocab, un...",
reader_code,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,https://github.com/RaRe-Technologies/gensim-da...,
license,https://creativecommons.org/licenses/by-sa/3.0/,https://github.com/commonsense/conceptnet-numb...,https://creativecommons.org/licenses/by/4.0/de...,not found,http://opendatacommons.org/licenses/pddl/,http://opendatacommons.org/licenses/pddl/,http://opendatacommons.org/licenses/pddl/,http://opendatacommons.org/licenses/pddl/,http://opendatacommons.org/licenses/pddl/,http://opendatacommons.org/licenses/pddl/,http://opendatacommons.org/licenses/pddl/,http://opendatacommons.org/licenses/pddl/,
parameters,{'dimension': 300},{'dimension': 300},"{'dimension': 300, 'window_size': 10}",{'dimension': 300},{'dimension': 50},{'dimension': 100},{'dimension': 200},{'dimension': 300},{'dimension': 25},{'dimension': 50},{'dimension': 100},{'dimension': 200},{'dimensions': 50}
description,1 million word vectors trained on Wikipedia 20...,ConceptNet Numberbatch consists of state-of-th...,Word2vec Continuous Skipgram vectors trained o...,Pre-trained vectors trained on a part of the G...,Pre-trained vectors based on Wikipedia 2014 + ...,Pre-trained vectors based on Wikipedia 2014 + ...,Pre-trained vectors based on Wikipedia 2014 + ...,Pre-trained vectors based on Wikipedia 2014 + ...,"Pre-trained vectors based on 2B tweets, 27B to...","Pre-trained vectors based on 2B tweets, 27B to...","Pre-trained vectors based on 2B tweets, 27B t...","Pre-trained vectors based on 2B tweets, 27B to...",[THIS IS ONLY FOR TESTING] Word vecrors of the...
read_more,[https://fasttext.cc/docs/en/english-vectors.h...,[http://aaai.org/ocs/index.php/AAAI/AAAI17/pap...,[https://www.academia.edu/24306935/WebVectors_...,"[https://code.google.com/archive/p/word2vec/, ...","[https://nlp.stanford.edu/projects/glove/, htt...","[https://nlp.stanford.edu/projects/glove/, htt...","[https://nlp.stanford.edu/projects/glove/, htt...","[https://nlp.stanford.edu/projects/glove/, htt...","[https://nlp.stanford.edu/projects/glove/, htt...","[https://nlp.stanford.edu/projects/glove/, htt...","[https://nlp.stanford.edu/projects/glove/, htt...","[https://nlp.stanford.edu/projects/glove/, htt...",[]
checksum,de2bb3a20c46ce65c9c131e1ad9a77af,fd642d457adcd0ea94da0cd21b150847,9bdebdc8ae6d17d20839dd9b5af10bc4,a5e5354d40acb95f9ec66d5977d140ef,c289bc5d7f2f02c6dc9f2f9b67641813,40ec481866001177b8cd4cb0df92924f,59652db361b7a87ee73834a6c391dfc1,29e9329ac2241937d55b852e8284e89b,50db0211d7e7a2dcd362c6b774762793,c168f18641f8c8a00fe30984c4799b2b,b04f7bed38756d64cf55b58ce7e97b15,e52e8392d1860b95d5308a525817d8f9,534dcb8b56a360977a269b7bfc62d124
file_name,fasttext-wiki-news-subwords-300.gz,conceptnet-numberbatch-17-06-300.gz,word2vec-ruscorpora-300.gz,word2vec-google-news-300.gz,glove-wiki-gigaword-50.gz,glove-wiki-gigaword-100.gz,glove-wiki-gigaword-200.gz,glove-wiki-gigaword-300.gz,glove-twitter-25.gz,glove-twitter-50.gz,glove-twitter-100.gz,glove-twitter-200.gz,__testing_word2vec-matrix-synopsis.gz


In [84]:
# Load the pre-trained 50-dimensional GloVe vectors built from Twitter data.
wv = api.load('glove-twitter-50')



In [85]:
# Similarity score between 'good' and 'done' in the Twitter vectors.
wv.similarity('good', 'done')

0.85031825

In [90]:
# Analogy query: spain - madrid + moscow, this time on the Twitter vectors.
wv.most_similar(positive=['spain', 'moscow'], negative=['madrid'])

[('norway', 0.8594844937324524),
 ('switzerland', 0.8454592227935791),
 ('belgium', 0.8415361642837524),
 ('zealand', 0.8191095590591431),
 ('russia', 0.8179768323898315),
 ('belarus', 0.8132842779159546),
 ('warsaw', 0.806350827217102),
 ('quebec', 0.8019694685935974),
 ('vienna', 0.7962620854377747),
 ('greece', 0.7938109636306763)]

In [95]:
# Ask the model which of the three names fits least well with the others.
wv.doesnt_match(['nikita', 'misha', 'masha'])

'nikita'

In [109]:
# Average the vectors of the three words without normalising them first
# (pre_normalize=False); the next cell computes the same mean by hand.
vector = wv.get_mean_vector('man human people'.split(), pre_normalize=False)
vector

array([ 0.65847564,  0.13869533, -0.35067832, -0.01452665, -0.3201137 ,
        0.26344335,  0.6602067 ,  0.07860333, -0.5347433 ,  0.40224335,
       -0.12377   , -0.10232335, -4.7831    ,  0.13656999,  0.16603667,
        0.18253267,  0.14024998, -0.3064233 ,  0.40139666, -0.6204066 ,
       -0.12579334,  0.3041433 , -0.24207334,  0.17786734,  0.110939  ,
        0.49356666, -0.05830666, -0.10735935,  0.038982  , -0.17787667,
       -0.24652116, -0.18482666,  0.19507332,  0.20744   ,  0.5721333 ,
       -0.04638899, -0.3683667 , -0.08205633, -0.29708132, -0.7999167 ,
       -0.75856   ,  0.0756    ,  0.03677   , -0.03346666, -0.19781967,
       -0.05864567,  0.7420034 ,  0.73248005, -0.08846334, -0.03340134],
      dtype=float32)

In [108]:
# Manual equivalent: look up the three word vectors and take their mean along axis 0.
wv['man human people'.split()].mean(axis=0)

array([ 0.65847564,  0.13869533, -0.35067832, -0.01452665, -0.3201137 ,
        0.26344335,  0.6602067 ,  0.07860333, -0.5347433 ,  0.40224335,
       -0.12377   , -0.10232335, -4.7831    ,  0.13656999,  0.16603667,
        0.18253267,  0.14024998, -0.3064233 ,  0.40139666, -0.6204066 ,
       -0.12579334,  0.3041433 , -0.24207334,  0.17786734,  0.110939  ,
        0.49356666, -0.05830666, -0.10735935,  0.038982  , -0.17787667,
       -0.24652116, -0.18482666,  0.19507332,  0.20744   ,  0.5721333 ,
       -0.04638899, -0.3683667 , -0.08205633, -0.29708132, -0.7999167 ,
       -0.75856   ,  0.0756    ,  0.03677   , -0.03346666, -0.19781967,
       -0.05864567,  0.7420034 ,  0.73248005, -0.08846334, -0.03340134],
      dtype=float32)

**Russian Word2Vec**

In [110]:
!wget http://vectors.nlpl.eu/repository/20/180.zip

--2024-03-02 13:14:30--  http://vectors.nlpl.eu/repository/20/180.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.181
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.181|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 484452317 (462M) [application/zip]
Saving to: ‘180.zip’


2024-03-02 13:14:59 (16.5 MB/s) - ‘180.zip’ saved [484452317/484452317]



In [111]:
!unzip 180.zip

Archive:  180.zip
  inflating: meta.json               
  inflating: model.bin               
  inflating: model.txt               
  inflating: README                  


In [112]:
# Load the binary word2vec-format vectors (model.bin) extracted from the archive above.
model_ru = gensim.models.KeyedVectors.load_word2vec_format('model.bin', binary=True)

In [113]:
# Probe words written as <lemma>_<POS tag>; every entry here carries the NOUN tag.
words = ['день_NOUN', 'ночь_NOUN', 'человек_NOUN', 'семантика_NOUN', 'биткоин_NOUN']

In [114]:
# For every probe word found in the model, print the word, the first ten
# components of its vector and its ten nearest neighbours with their scores;
# words missing from the vocabulary are reported instead.
for word in words:
  if word in model_ru:
    print(word)
    print(model_ru[word][:10])
    # Use a separate name for the neighbour: reusing `word` here overwrote
    # the outer loop variable while its iteration was still in progress.
    for neighbor, sim in model_ru.most_similar(positive=[word], topn=10):
      print(neighbor, ': ', sim)
    print('\n')
  else:
    print('there is no word', word)

день_NOUN
[ 1.805067   -0.877623   -1.0102742   2.8518744  -0.43311968 -3.7207692
 -3.4317713  -0.7634762  -4.9961104  -1.1313324 ]
неделя_NOUN :  0.7375996112823486
день_PROPN :  0.706766664981842
месяц_NOUN :  0.7037326097488403
час_NOUN :  0.6643950939178467
утро_NOUN :  0.6526744961738586
вечер_NOUN :  0.6038411259651184
сутки_NOUN :  0.5923080444335938
воскресенье_NOUN :  0.5842781066894531
полдень_NOUN :  0.5743688344955444
суббота_NOUN :  0.5345946550369263


ночь_NOUN
[-0.10776415  0.32673436  0.52870405  2.1667976   0.7689093  -2.4214501
 -1.4222336  -2.972895    0.18769576 -0.05231643]
ночь_PROPN :  0.8310787081718445
вечер_NOUN :  0.7183678150177002
рассвет_NOUN :  0.6965947151184082
ночи_NOUN :  0.692021906375885
полночь_NOUN :  0.6704976558685303
ночь_VERB :  0.6615265011787415
утро_NOUN :  0.6263936161994934
ночной_ADJ :  0.6024709343910217
полдень_NOUN :  0.5835085511207581
сумерки_NOUN :  0.5671443939208984


человек_NOUN
[ 0.02881786 -0.7942778   2.4604542   2.2049303 