In [112]:
import gensim

assert gensim.models.doc2vec.FAST_VERSION > -1

In [113]:
import gensim
import gensim.test.utils

# Set file names for train and test data
lee_train_file = gensim.test.utils.datapath('lee_background.cor')
lee_test_file = gensim.test.utils.datapath('lee.cor')

In [114]:
import smart_open

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

print(train_corpus[2])
print(test_corpus[2])

TaggedDocument<['the', 'national', 'road', 'toll', 'for', 'the', 'christmas', 'new', 'year', 'holiday', 'period', 'stands', 'at', 'eight', 'fewer', 'than', 'for', 'the', 'same', 'time', 'last', 'year', 'people', 'have', 'died', 'on', 'new', 'south', 'wales', 'roads', 'with', 'eight', 'fatalities', 'in', 'both', 'queensland', 'and', 'victoria', 'western', 'australia', 'the', 'northern', 'territory', 'and', 'south', 'australia', 'have', 'each', 'recorded', 'three', 'deaths', 'while', 'the', 'act', 'and', 'tasmania', 'remain', 'fatality', 'free'], [2]>
['the', 'united', 'states', 'government', 'has', 'said', 'it', 'wants', 'to', 'see', 'president', 'robert', 'mugabe', 'removed', 'from', 'power', 'and', 'that', 'it', 'is', 'working', 'with', 'the', 'zimbabwean', 'opposition', 'to', 'bring', 'about', 'change', 'of', 'administration', 'as', 'scores', 'of', 'white', 'farmers', 'went', 'into', 'hiding', 'to', 'escape', 'round', 'up', 'by', 'zimbabwean', 'police', 'senior', 'bush', 'administrat

In [115]:
import gensim.models

model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [116]:
model.build_vocab(train_corpus)

In [117]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [118]:
model.dv

<gensim.models.keyedvectors.KeyedVectors at 0x19c4f173f20>

In [119]:
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

[-0.22893533 -0.20554085 -0.13129798  0.22304149 -0.11186235 -0.09403428
 -0.00075656  0.03464491 -0.25888056 -0.10293125  0.196114   -0.00196223
  0.03390473 -0.11785848 -0.04237121 -0.2999309   0.04176273  0.26153782
  0.19902906 -0.13180555  0.0401841  -0.00735625  0.11372375  0.02609809
 -0.00908577 -0.03928182 -0.24520338 -0.01027686 -0.23355114 -0.0358557
  0.36941603 -0.07787085  0.19498177  0.11011455  0.1256837   0.09161688
 -0.04098217 -0.20442925 -0.06043252  0.0499398  -0.00217425 -0.02434188
  0.01746238 -0.06466892  0.0871626   0.07535709 -0.04890525 -0.02570827
  0.15973866 -0.02477881]


In [120]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

import collections

counter = collections.Counter(ranks)
print(counter)

Counter({0: 292, 1: 8})


In [121]:
import random

# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (30): «police are combing through videotapes trying to spot the gunman dressed in black who shot year old man to death at downtown massage parlour the victim was hit in the stomach and upper body and died about hours later in hospital the woman was not hurt police urged business owners to turn over any security camera videotapes they might have that recorded people on the street at the time several such videos are now being reviewed»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>:

MOST (198, 0.7682719230651855): «authorities are trying to track down the crew of vessel that landed undetected at cocos islands carrying asylum seekers the group of sri lankan men was found aboard their boat moored to the south of the islands yesterday afternoon shire president ron grant says investigations are underway as to the whereabouts of the crew after the asylum seekers told authorities they had left in another boat after dropping them off unfortunately for th

## EXERCISES

### Task 0. Train your own doc2vec model on a test dataset. Most of the example files use Parquet file format. A short guide below.

In [122]:
import pandas as pd

df =  pd.read_parquet("train-DataEntity_chunk_1.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21689 entries, 0 to 21688
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   text              21689 non-null  object
 1   label             21689 non-null  object
 2   tweet_hashtags    21689 non-null  object
 3   datetime          21689 non-null  object
 4   username_encoded  21689 non-null  object
 5   url_encoded       21689 non-null  object
dtypes: object(6)
memory usage: 1016.8+ KB


#### Preprocessing

In [123]:
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument
import pandas as pd

# Вибираємо тільки тексти — не більше 10000 записів для тесту
texts = df['text'].dropna().astype(str).sample(10000, random_state=42)

# Створюємо TaggedDocuments
tagged_data = [
    TaggedDocument(words=simple_preprocess(text), tags=[str(i)]) 
    for i, text in enumerate(texts)
]


#### Train

In [124]:
# from gensim.models import Doc2Vec
# from tqdm import tqdm

# model = Doc2Vec(
#     vector_size=100,
#     alpha=0.025,
#     min_alpha=0.00025,
#     min_count=2,
#     dm=1,
#     epochs=20,
#     workers=4
# )

# model.build_vocab(tagged_data)

# for epoch in tqdm(range(20)):
#     model.train(tagged_data, total_examples=model.corpus_count, epochs=1)
#     model.alpha -= 0.001
#     model.min_alpha = model.alpha


In [125]:
from gensim.models import Doc2Vec
from tqdm import tqdm
import random

# Initialize the model with improved parameters
model = Doc2Vec(
    vector_size=150,  # Increased vector size
    alpha=0.025,
    min_alpha=0.0001,
    min_count=2,
    dm=1,  # Distributed Memory
    negative=5,  # Negative sampling
    workers=4
)

# Build vocabulary
model.build_vocab(tagged_data)

# Train the model with dynamic alpha adjustment and shuffled data
for epoch in tqdm(range(50)):  # Increased epochs
    print(f"Epoch {epoch + 1}")
    random.shuffle(tagged_data)  # Shuffle data
    model.train(tagged_data, total_examples=model.corpus_count, epochs=1)
    model.alpha -= 0.0005  # Decrease alpha
    model.min_alpha = model.alpha  # Update min_alpha


  0%|          | 0/50 [00:00<?, ?it/s]

Epoch 1


  2%|▏         | 1/50 [00:00<00:32,  1.50it/s]

Epoch 2


  4%|▍         | 2/50 [00:01<00:30,  1.58it/s]

Epoch 3


  6%|▌         | 3/50 [00:01<00:30,  1.52it/s]

Epoch 4


  8%|▊         | 4/50 [00:02<00:30,  1.49it/s]

Epoch 5


 10%|█         | 5/50 [00:03<00:33,  1.36it/s]

Epoch 6


 12%|█▏        | 6/50 [00:04<00:32,  1.34it/s]

Epoch 7


 14%|█▍        | 7/50 [00:05<00:33,  1.28it/s]

Epoch 8


 16%|█▌        | 8/50 [00:05<00:33,  1.27it/s]

Epoch 9


 18%|█▊        | 9/50 [00:06<00:30,  1.33it/s]

Epoch 10


 20%|██        | 10/50 [00:07<00:28,  1.39it/s]

Epoch 11


 22%|██▏       | 11/50 [00:07<00:26,  1.47it/s]

Epoch 12


 24%|██▍       | 12/50 [00:08<00:25,  1.51it/s]

Epoch 13


 26%|██▌       | 13/50 [00:09<00:24,  1.53it/s]

Epoch 14


 28%|██▊       | 14/50 [00:09<00:22,  1.58it/s]

Epoch 15


 30%|███       | 15/50 [00:10<00:21,  1.62it/s]

Epoch 16


 32%|███▏      | 16/50 [00:10<00:20,  1.65it/s]

Epoch 17


 34%|███▍      | 17/50 [00:11<00:19,  1.65it/s]

Epoch 18


 36%|███▌      | 18/50 [00:12<00:19,  1.64it/s]

Epoch 19


 38%|███▊      | 19/50 [00:12<00:19,  1.63it/s]

Epoch 20


 40%|████      | 20/50 [00:13<00:18,  1.65it/s]

Epoch 21


 42%|████▏     | 21/50 [00:13<00:17,  1.67it/s]

Epoch 22


 44%|████▍     | 22/50 [00:14<00:16,  1.67it/s]

Epoch 23


 46%|████▌     | 23/50 [00:15<00:16,  1.68it/s]

Epoch 24


 48%|████▊     | 24/50 [00:15<00:15,  1.68it/s]

Epoch 25


 50%|█████     | 25/50 [00:16<00:14,  1.68it/s]

Epoch 26


 52%|█████▏    | 26/50 [00:16<00:13,  1.72it/s]

Epoch 27


 54%|█████▍    | 27/50 [00:17<00:13,  1.72it/s]

Epoch 28


 56%|█████▌    | 28/50 [00:17<00:12,  1.74it/s]

Epoch 29


 58%|█████▊    | 29/50 [00:18<00:12,  1.72it/s]

Epoch 30


 60%|██████    | 30/50 [00:19<00:11,  1.75it/s]

Epoch 31


 62%|██████▏   | 31/50 [00:19<00:10,  1.79it/s]

Epoch 32


 64%|██████▍   | 32/50 [00:20<00:10,  1.78it/s]

Epoch 33


 66%|██████▌   | 33/50 [00:20<00:09,  1.76it/s]

Epoch 34


 68%|██████▊   | 34/50 [00:21<00:09,  1.72it/s]

Epoch 35


 70%|███████   | 35/50 [00:21<00:08,  1.76it/s]

Epoch 36


 72%|███████▏  | 36/50 [00:22<00:08,  1.73it/s]

Epoch 37


 74%|███████▍  | 37/50 [00:23<00:07,  1.72it/s]

Epoch 38


 76%|███████▌  | 38/50 [00:23<00:06,  1.76it/s]

Epoch 39


 78%|███████▊  | 39/50 [00:24<00:06,  1.72it/s]

Epoch 40


 80%|████████  | 40/50 [00:24<00:05,  1.74it/s]

Epoch 41


 82%|████████▏ | 41/50 [00:25<00:05,  1.68it/s]

Epoch 42


 84%|████████▍ | 42/50 [00:26<00:04,  1.66it/s]

Epoch 43


 86%|████████▌ | 43/50 [00:26<00:04,  1.63it/s]

Epoch 44


 88%|████████▊ | 44/50 [00:27<00:03,  1.62it/s]

Epoch 45


 90%|█████████ | 45/50 [00:27<00:03,  1.60it/s]

Epoch 46


 92%|█████████▏| 46/50 [00:28<00:02,  1.56it/s]

Epoch 47


 94%|█████████▍| 47/50 [00:29<00:01,  1.58it/s]

Epoch 48


 96%|█████████▌| 48/50 [00:29<00:01,  1.59it/s]

Epoch 49


 98%|█████████▊| 49/50 [00:30<00:00,  1.58it/s]

Epoch 50


100%|██████████| 50/50 [00:31<00:00,  1.61it/s]


##### TEST

In [126]:
test_text = "bitcoin will rise again"
vector = model.infer_vector(simple_preprocess(test_text))
print(vector)


[-2.6128641e-03  2.1444603e-03 -7.9049566e-04  3.2517393e-04
  6.9712085e-04  9.2040934e-04 -9.4193219e-06 -1.5700740e-03
 -2.2711023e-03  4.7641515e-04  7.9885044e-04 -2.4548757e-03
 -8.5811061e-04 -2.3597665e-03 -2.0153746e-03  1.1516976e-03
 -2.2422867e-03 -2.2186693e-03  1.7216094e-03  2.7411906e-03
 -3.0355859e-03 -1.3933218e-03 -1.9817539e-03 -1.1383190e-03
  2.4433406e-03  1.4289221e-03 -1.5106569e-03  4.5542835e-04
  5.3954718e-04  1.0516075e-03 -1.5615106e-03 -4.8427124e-04
 -3.9304851e-04 -1.3317253e-03  1.8481044e-03 -1.3050179e-04
 -2.2933902e-03  1.7656616e-03 -1.2762546e-05 -3.1037231e-03
 -2.6975470e-03 -2.7352175e-05 -2.2657705e-03 -1.7478792e-03
 -1.7523193e-03  1.4055959e-03 -7.3113106e-04 -1.5913765e-03
 -3.0923299e-03  1.8568508e-03 -1.3822758e-03 -1.8616552e-03
 -1.5303642e-03 -2.2710608e-03 -2.5493912e-03  2.4251144e-06
  3.0446993e-03 -2.9113453e-03 -8.6582761e-04 -3.3022314e-03
  4.2387008e-04  2.1469120e-03 -3.1847300e-03 -5.8678130e-04
 -1.6413848e-03 -2.67281

In [127]:
model.save("my_doc2vec.model")


In [139]:
model = Doc2Vec.load("my_doc2vec.model")

### Task 1. Practice finding similar documents/articles/posts. Assess validity of the model.

In [140]:
# Analyze rank distribution
print("Rank Distribution:", counter)

# Pick a random test document
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=5)

print(f"Test Document ({doc_id}): «{' '.join(test_corpus[doc_id])}»\n")

# Display most similar documents
print("Most Similar Documents:")
for sim_id, similarity in sims:
    try:
        sim_id = int(sim_id)  # Convert document ID to integer
        if 0 <= sim_id < len(train_corpus):  
            print(f"Document {sim_id} (Similarity: {similarity:.4f}): «{' '.join(train_corpus[sim_id].words)}»\n")
        else:
            print(f"Document {sim_id} is out of bounds for train_corpus.\n")
    except (ValueError, IndexError) as e:
        print(f"Error processing document {sim_id}: {e}\n")

Rank Distribution: Counter({0: 292, 1: 8})
Test Document (32): «at least three democrats are considering splitting from the party while no one has yet nominated to contest the leadership three of the gang of four senators who ousted natasha stott despoja from the leadership are considering forming new progressive centre party in the fallout from last week turmoil this would leave the democrats with rump of three or four members west australian senator andrew murray said yesterday unless the democrats left wing gave ground the party would split»

Most Similar Documents:
Document 9736 is out of bounds for train_corpus.

Document 3902 is out of bounds for train_corpus.

Document 4467 is out of bounds for train_corpus.

Document 2609 is out of bounds for train_corpus.

Document 3229 is out of bounds for train_corpus.



# Variant 2

In [145]:
query = "Solana is a great blockchain for NFTs"
query_vector = model.infer_vector(simple_preprocess(query))


In [147]:
# Define a query to find similar documents
query = "Bitcoin is the future of money"
query_vector = model.infer_vector(simple_preprocess(query))

# Find the top-5 most similar documents
similar_docs = model.dv.most_similar([query_vector], topn=10)

# Display the texts of the similar documents
print("Query:", query)
print("\nTop-5 Similar Documents:")
for doc_id, similarity in similar_docs:
    print(f"\n--- Similarity: {similarity:.4f} ---")
    print(texts.iloc[int(doc_id)])


Query: Bitcoin is the future of money

Top-5 Similar Documents:

--- Similarity: 0.2872 ---
The Ten Steps to Metaverse Interoperability. 
Step Six: Business Model Alignment
@KZeroWorldswide #metaverse #Interoperability #Blockchain #Web3 #smartcontracts
#defi #fintech @nicmitham

--- Similarity: 0.2859 ---
If you could integrate the latest version of GPT-4 or any future iterations into Worldcoin, I believe it would greatly enhance the value of both, potentially advancing the AI world even further.

#saga2056 #saga #worldcoin #ai #Gamefi #openai #googleai #samaltman #NFT #eth #btc

--- Similarity: 0.2763 ---
Is the crypto bull run over? What comes next?📉

Imagine a world where even drastic crypto dips boost your blazing confidence.

Transform uncertainty into security with inSure DeFi—the world's first decentralized #crypto insurance platform.

#rwa #ai $sure #eth #insure #btc #defi

--- Similarity: 0.2608 ---
I want to analyze $DNX from a fundamentals perspective. Let’s dive in 🤿

$SOL 

In [144]:
from gensim.utils import simple_preprocess

# Токенізуємо запит з тією ж процедурою, що й для текстів
def preprocess_query(query):
    return simple_preprocess(query)

# Оцінка схожих документів
def find_similar_documents(query, model, texts, top_n=5):
    # Отримуємо вектор запиту
    query_vector = model.infer_vector(preprocess_query(query))
    
    # Знаходимо top_n схожих документів
    similar_docs = model.dv.most_similar([query_vector], topn=top_n)
    
    results = []
    for doc_id, similarity in similar_docs:
        # Отримуємо текст документа
        document_text = texts.iloc[int(doc_id)]
        results.append((similarity, document_text))
    
    return results

# Тестовий запит
query = "blockchain and cryptocurrency"

# Знаходимо схожі документи
similar_docs = find_similar_documents(query, model, texts)

# Виводимо результати
print("Query:", query)
print("\nTop-5 Similar Documents:")
for similarity, doc in similar_docs:
    print(f"\n--- Similarity: {similarity:.4f} ---")
    print(doc)


Query: blockchain and cryptocurrency

Top-5 Similar Documents:

--- Similarity: 0.2747 ---
Microstrategy Is Seeking a Full-Time Lightning Network Engineer to Build a SaaS Platform
.
See More:.
#bitcoin #bitcoins #bitcoinprice #bitcoinnews #bitcoinmining #BitcoinBillionaire #bitcoincash #bitcointrading

--- Similarity: 0.2248 ---
At the D.C. Blockchain Summit, Hester Peirce called out the SECs enforcement-first approach, signaling cryptos growing influence in Washington. #Crypto #Blockchain

--- Similarity: 0.2083 ---
There is a significant increase in whale activity in BTC — data from Santiment

#Crypto #CryptoNews

#Bitcoin #BTC #BitcoinETF

#BTCHalving #BTCHalving2024

--- Similarity: 0.2055 ---
Core Scientific, a major Bitcoin mining company in the United States, is shifting focus to artificial intelligence (AI) to address challenges from the latest Bitcoin halving.  Read more on#web30  #blockchain #news #crypto #cryptonews #defi #nft #bitcoin

--- Similarity: 0.1940 ---
Injective i