### Examples from lab

In [1]:
import gensim

assert gensim.models.doc2vec.FAST_VERSION > -1

In [2]:
import gensim
import gensim.test.utils

# Set file names for train and test data
lee_train_file = gensim.test.utils.datapath('lee_background.cor')
lee_test_file = gensim.test.utils.datapath('lee.cor')

In [3]:
import smart_open

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

print(train_corpus[2])
print(test_corpus[2])

TaggedDocument<['the', 'national', 'road', 'toll', 'for', 'the', 'christmas', 'new', 'year', 'holiday', 'period', 'stands', 'at', 'eight', 'fewer', 'than', 'for', 'the', 'same', 'time', 'last', 'year', 'people', 'have', 'died', 'on', 'new', 'south', 'wales', 'roads', 'with', 'eight', 'fatalities', 'in', 'both', 'queensland', 'and', 'victoria', 'western', 'australia', 'the', 'northern', 'territory', 'and', 'south', 'australia', 'have', 'each', 'recorded', 'three', 'deaths', 'while', 'the', 'act', 'and', 'tasmania', 'remain', 'fatality', 'free'], [2]>
['the', 'united', 'states', 'government', 'has', 'said', 'it', 'wants', 'to', 'see', 'president', 'robert', 'mugabe', 'removed', 'from', 'power', 'and', 'that', 'it', 'is', 'working', 'with', 'the', 'zimbabwean', 'opposition', 'to', 'bring', 'about', 'change', 'of', 'administration', 'as', 'scores', 'of', 'white', 'farmers', 'went', 'into', 'hiding', 'to', 'escape', 'round', 'up', 'by', 'zimbabwean', 'police', 'senior', 'bush', 'administrat

In [4]:
import gensim.models

model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [5]:
model.build_vocab(train_corpus)

In [6]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [7]:
model.dv

<gensim.models.keyedvectors.KeyedVectors at 0x16dfa28b430>

In [10]:
vector = model.infer_vector(['Only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

[-0.14180753 -0.16909318 -0.2134914   0.13927513  0.04091355  0.02493528
  0.11452319  0.02688671 -0.26071814 -0.04064682  0.13835856  0.01400604
 -0.12504312 -0.0316937  -0.24284218 -0.14089742  0.0889559   0.10072058
  0.07568635 -0.05603003 -0.03883033 -0.04430138  0.30717254  0.06925573
  0.02079284 -0.10423418 -0.15538014 -0.16209278 -0.08465276 -0.05523092
  0.34479874 -0.10655043  0.2604831   0.08165478  0.13442086  0.11057539
 -0.12086464 -0.12361484 -0.04350721  0.03735721  0.11372878  0.02081445
  0.03745525 -0.12531903  0.15474778  0.11547173 -0.0996522  -0.18026035
  0.11184353  0.03305049]


In [11]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

import collections

counter = collections.Counter(ranks)
print(counter)

Counter({0: 292, 1: 8})


In [12]:
import random

# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (23): «china said sunday it issued new regulations controlling the export of missile technology taking steps to ease concerns about transferring sensitive equipment to middle east countries particularly iran however the new rules apparently do not ban outright the transfer of specific items something washington long has urged beijing to do»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>:

MOST (234, 0.6025307178497314): «the foreign minister alexander downer has expressed concern about man who was arrested in india and has reportedly confessed to planning suicide attacks in australia the man was arrested month ago in india on suspicion of links to osama bin laden al qaeda network india home minister lk advani has been quoted by the reuters news agency as telling meeting of business and industry leaders in new delhi that the man has confessed to planning suicide attacks in australia and britain as well as on the indian parliament the report says m

### Task 0:
 Train your own doc2vec model on a test dataset. 

In [13]:
!pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp310-cp310-win_amd64.whl.metadata (4.3 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.10.0-cp310-cp310-win_amd64.whl.metadata (5.1 kB)
Downloading fastparquet-2024.11.0-cp310-cp310-win_amd64.whl (670 kB)
   ---------------------------------------- 0.0/670.7 kB ? eta -:--:--
   --------------- ------------------------ 262.1/670.7 kB ? eta -:--:--
   ---------------------------------------- 670.7/670.7 kB 1.9 MB/s eta 0:00:00
Downloading cramjam-2.10.0-cp310-cp310-win_amd64.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.7 MB ? eta -:--:--
   ------------ --------------------------- 0.5/1.7 MB 1.4 MB/s eta 0:00:01
   ------------------ --------------------- 0.8/1.7 MB 1.6 MB/s eta 0:00:01
   ------------------------------ --------- 1.3/1.7 MB 1.6 MB/s eta 0:00:01
   ------------------------------ --------- 1.3/1.7 MB 1.6 M


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
import pandas as pd

df = pd.read_parquet('../ML-Arxiv-papers.parquet').sample(n=1000, random_state=42)
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,abstract
107644,107644,107644.0,SVD Perspectives for Augmenting DeepONet Flexi...,Deep operator networks (DeepONets) are power...
33964,33964,33964.0,Towards robust audio spoofing detection: a det...,"Automatic speaker verification, like every o..."
3137,3137,3137.0,Guided Random Forest in the RRF Package,Random Forest (RF) is a powerful supervised ...
33168,33168,33168.0,Best Arm Identification in Generalized Linear ...,"Motivated by drug design, we consider the be..."
20962,20962,20962.0,Conditional Affordance Learning for Driving in...,Most existing approaches to autonomous drivi...
...,...,...,...,...
96969,96969,96969.0,Empirical evaluation of shallow and deep learn...,This work presents a detailed comparison of ...
73593,73593,73593.0,Safe and Efficient Model-free Adaptive Control...,Adaptive control approaches yield high-perfo...
60738,60738,60738.0,Evaluating Community Detection Algorithms for ...,Many algorithms have been proposed in the la...
101781,101781,101781.0,On Almost Sure Convergence Rates of Stochastic...,The vast majority of convergence rates analy...


In [23]:
df["text"] = (df["title"].fillna('') + " " + df["abstract"].fillna('')).str.strip()


texts = df["text"].astype(str).tolist()
def preprocess_parquet(texts, tokens_only=False):
    for i, text in enumerate(texts):
        tokens = gensim.utils.simple_preprocess(text)
        if tokens_only:
            yield tokens
        else:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])


train_corpus = list(preprocess_parquet(texts))
test_corpus = list(preprocess_parquet(texts, tokens_only=True))  

print(train_corpus[2])
print(test_corpus[2])

TaggedDocument<['guided', 'random', 'forest', 'in', 'the', 'rrf', 'package', 'random', 'forest', 'rf', 'is', 'powerful', 'supervised', 'learner', 'and', 'has', 'been', 'popularly', 'used', 'in', 'many', 'applications', 'such', 'as', 'bioinformatics', 'in', 'this', 'work', 'we', 'propose', 'the', 'guided', 'random', 'forest', 'grf', 'for', 'feature', 'selection', 'similar', 'to', 'feature', 'selection', 'method', 'called', 'guided', 'regularized', 'random', 'forest', 'grrf', 'grf', 'is', 'built', 'using', 'the', 'importance', 'scores', 'from', 'an', 'ordinary', 'rf', 'however', 'the', 'trees', 'in', 'grrf', 'are', 'built', 'sequentially', 'are', 'highly', 'correlated', 'and', 'do', 'not', 'allow', 'for', 'parallel', 'computing', 'while', 'the', 'trees', 'in', 'grf', 'are', 'built', 'independently', 'and', 'can', 'be', 'implemented', 'in', 'parallel', 'experiments', 'on', 'high', 'dimensional', 'gene', 'data', 'sets', 'show', 'that', 'with', 'fixed', 'parameter', 'value', 'without', 'tun

In [24]:
import gensim.models

model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [25]:
model.build_vocab(train_corpus)

In [26]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [27]:
model.dv

<gensim.models.keyedvectors.KeyedVectors at 0x16e88337a60>

### Task 1:
Assess validity of the model

In [29]:
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

[-0.16167223 -0.02172993  0.36377457 -0.2545349   0.3722068  -0.05069159
  0.11265036  0.24334888 -0.06294134 -0.0856095  -0.03224462 -0.37202698
  0.02632106 -0.25605708 -0.3394354   0.17249615  0.6947334   0.2674203
  0.36855903 -0.09802157 -0.1925795  -0.06018624 -0.20520961 -0.00561531
 -0.07535204 -0.20329171 -0.10838228  0.495497   -0.2315021  -0.11959247
 -0.01214309 -0.01285821  0.15958141 -0.10533137 -0.0127889  -0.61267513
  0.42985138 -0.08009687  0.03431808  0.22098233  0.44000176  0.2897394
 -0.40465808 -0.61617213  0.56965774 -0.28481513  0.15177488  0.10003507
  0.00301846  0.42797667]


In [30]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

import collections

counter = collections.Counter(ranks)
print(counter)

Counter({0: 1000})


In [31]:
import random

# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (963): «max margin deep generative models deep generative models dgms are effective on learning multilayered representations of complex data and performing inference of input data by exploring the generative ability however little work has been done on examining or empowering the discriminative ability of dgms on making accurate predictions this paper presents max margin deep generative models mmdgms which explore the strongly discriminative principle of max margin learning to improve the discriminative power of dgms while retaining the generative capability we develop an efficient doubly stochastic subgradient algorithm for the piecewise linear objective empirical results on mnist and svhn datasets demonstrate that max margin learning can significantly improve the prediction performance of dgms and meanwhile retain the generative ability and mmdgms are competitive to the state of the art fully discriminative networks by employing deep convolutional neural networks cnns a