In [1]:
import gensim

# Fail fast unless gensim's doc2vec module reports a non-negative FAST_VERSION.
# NOTE(review): presumably a value of -1 means the optimized compiled routines
# are unavailable and training would fall back to a slow path — confirm in the gensim docs.
assert gensim.models.doc2vec.FAST_VERSION > -1

In [27]:
from pathlib import Path

import pandas as pd

# Local parquet copy of the dataset, plus the remote CSV it is derived from.
PARQUET_PATH = Path("ML-Arxiv-Papers.parquet")
CSV_URL = "hf://datasets/CShorten/ML-ArXiv-Papers/ML-Arxiv-Papers.csv"

# The parquet file is otherwise only produced by cells much further down the
# notebook, so on a fresh kernel / fresh checkout this cell would fail.
# Build the cache here when it is missing; when it exists this is a no-op.
if not PARQUET_PATH.exists():
    pd.read_csv(CSV_URL).to_parquet(PARQUET_PATH, engine="pyarrow", index=False)

df = pd.read_parquet(PARQUET_PATH)
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,abstract
0,0,0.0,Learning from compressed observations,The problem of statistical learning is to co...
1,1,1.0,Sensor Networks with Random Links: Topology De...,"In a sensor network, in practice, the commun..."
2,2,2.0,The on-line shortest path problem under partia...,The on-line shortest path problem is conside...
3,3,3.0,A neural network approach to ordinal regression,Ordinal regression is an important type of l...
4,4,4.0,Parametric Learning and Monte Carlo Optimization,This paper uncovers and explores the close r...
...,...,...,...,...
117587,4995,,Detecting COVID-19 Conspiracy Theories with Tr...,The sharing of fake news and conspiracy theori...
117588,4996,,Fair Feature Subset Selection using Multiobjec...,The feature subset selection problem aims at s...
117589,4997,,A Simple Duality Proof for Wasserstein Distrib...,We present a short and elementary proof of the...
117590,4998,,Combined Learning of Neural Network Weights fo...,"We introduce CoLN, Combined Learning of Neural..."


In [None]:
# Join title and abstract into one text field; missing values become empty
# strings first so the concatenation never produces NaN, and the outer strip
# removes the separator when one side is empty.
title_part = df["title"].fillna('')
abstract_part = df["abstract"].fillna('')
df["text"] = (title_part + " " + abstract_part).str.strip()

# Plain Python list of strings, in frame order, for the tokenizer below.
texts = df["text"].astype(str).tolist()
def preprocess_parquet(texts, tokens_only=False):
    """Lazily tokenize each text with ``gensim.utils.simple_preprocess``.

    Args:
        texts: Iterable of strings; consumed in order.
        tokens_only: When true, yield the bare token list for each text.
            Otherwise yield a ``TaggedDocument`` whose single tag is the
            text's zero-based position in ``texts``.

    Yields:
        One item per input text, in input order.
    """
    for position, raw_text in enumerate(texts):
        words = gensim.utils.simple_preprocess(raw_text)
        yield words if tokens_only else gensim.models.doc2vec.TaggedDocument(words, [position])


# Materialize both views of the same texts: tagged documents for training and
# bare token lists for inference.
train_corpus = [*preprocess_parquet(texts)]
test_corpus = [*preprocess_parquet(texts, tokens_only=True)]

# Spot-check the third entry of each list.
for sample in (train_corpus[2], test_corpus[2]):
    print(sample)


In [31]:

# Work on a fixed 1000-row sample (random_state pins which rows are drawn).
df = (
    pd.read_parquet("ML-Arxiv-Papers.parquet")
    .sample(n=1000, random_state=42)
)

# Title + abstract as one stripped string per row; missing values are
# replaced by empty strings before concatenation.
filled = df[["title", "abstract"]].fillna('')
df["text"] = (filled["title"] + " " + filled["abstract"]).str.strip()

texts = df["text"].astype(str).tolist()


def preprocess_parquet(texts, tokens_only=False):
    """Generate tokenized documents from an iterable of strings.

    Each text is tokenized with ``gensim.utils.simple_preprocess``. With
    ``tokens_only=True`` the token list itself is produced; otherwise the
    tokens are wrapped in a ``TaggedDocument`` tagged with ``[i]``, where
    ``i`` is the text's zero-based position in ``texts``.
    """
    def _convert(indexed_text):
        # Turn one (position, text) pair into the requested output form.
        tag, text = indexed_text
        tokens = gensim.utils.simple_preprocess(text)
        if tokens_only:
            return tokens
        return gensim.models.doc2vec.TaggedDocument(tokens, [tag])

    # map() is lazy, so items are still produced one at a time on demand.
    yield from map(_convert, enumerate(texts))

# Tagged documents for training, and the same texts as bare token lists.
train_corpus = [tagged for tagged in preprocess_parquet(texts)]
test_corpus = [tokens for tokens in preprocess_parquet(texts, tokens_only=True)]

# Show the third entry of each corpus as a quick sanity check.
for caption, corpus in (("Sample train doc:", train_corpus), ("Sample test doc:", test_corpus)):
    print(caption, corpus[2])


Sample train doc: TaggedDocument(['guided', 'random', 'forest', 'in', 'the', 'rrf', 'package', 'random', 'forest', 'rf', 'is', 'powerful', 'supervised', 'learner', 'and', 'has', 'been', 'popularly', 'used', 'in', 'many', 'applications', 'such', 'as', 'bioinformatics', 'in', 'this', 'work', 'we', 'propose', 'the', 'guided', 'random', 'forest', 'grf', 'for', 'feature', 'selection', 'similar', 'to', 'feature', 'selection', 'method', 'called', 'guided', 'regularized', 'random', 'forest', 'grrf', 'grf', 'is', 'built', 'using', 'the', 'importance', 'scores', 'from', 'an', 'ordinary', 'rf', 'however', 'the', 'trees', 'in', 'grrf', 'are', 'built', 'sequentially', 'are', 'highly', 'correlated', 'and', 'do', 'not', 'allow', 'for', 'parallel', 'computing', 'while', 'the', 'trees', 'in', 'grf', 'are', 'built', 'independently', 'and', 'can', 'be', 'implemented', 'in', 'parallel', 'experiments', 'on', 'high', 'dimensional', 'gene', 'data', 'sets', 'show', 'that', 'with', 'fixed', 'parameter', 'value

In [32]:
import gensim.models

# Untrained Doc2Vec model; vocabulary building and training happen in the
# following cells.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=50,
    min_count=2,
    epochs=40,
)


In [33]:
# Build the model's vocabulary from the tagged training documents; this must
# run before the training call that follows.
model.build_vocab(train_corpus)

In [34]:
# Train on the tagged corpus, reusing the document count and epoch count the
# model itself already holds rather than repeating literals here.
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [35]:
# Display the model's document-vector container (the recorded output shows a
# gensim KeyedVectors object).
model.dv

<gensim.models.keyedvectors.KeyedVectors at 0x1372fdb3d90>

In [36]:
# Infer a vector for a short hand-written token list and print it.
sample_tokens = ['only', 'you', 'can', 'prevent', 'forest', 'fires']
vector = model.infer_vector(sample_tokens)
print(vector)

[ 0.08790659 -0.15608478  0.04763642 -0.02384534  0.44756308 -0.15677297
  0.18224443  0.38171116 -0.11254364  0.20638946  0.00474802 -0.35088018
  0.08199678 -0.12789318 -0.36796916 -0.14833704  0.5732824   0.12079011
  0.4978068  -0.05393522 -0.16476229 -0.05717217 -0.08952983  0.03479337
 -0.12052672 -0.51582813 -0.03802516  0.24639948 -0.17839184 -0.189645
  0.07991289 -0.05492688  0.30556113 -0.1889699   0.17209831 -0.51748013
  0.3253089   0.06524195 -0.04967424  0.28368092  0.47358108  0.13111
 -0.37023544 -0.52133363  0.5354818  -0.20109093  0.36223802  0.24787362
  0.02772081  0.22813538]


In [37]:
import collections

# Self-similarity check: re-infer a vector for every training document and
# record at which position its own tag appears in the similarity ranking
# (0 means the document is its own nearest neighbour).
ranks = []
second_ranks = []
for doc_id, document in enumerate(train_corpus):
    inferred_vector = model.infer_vector(document.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

    ranked_ids = [tag for tag, _ in sims]
    rank = ranked_ids.index(doc_id)
    ranks.append(rank)

    # Keep the runner-up (tag, similarity) pair for each document.
    second_ranks.append(sims[1])

# Histogram of self-ranks across the whole training corpus.
counter = collections.Counter(ranks)
print(counter)

Counter({0: 1000})


In [38]:
import random

# Draw one document at random from the token-only corpus, infer its vector,
# and rank every stored document vector by similarity to it.
doc_id = random.randint(0, len(test_corpus) - 1)
query_tokens = test_corpus[doc_id]
inferred_vector = model.infer_vector(query_tokens)
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Print the query, then the best, middle and worst matches from the training corpus.
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(query_tokens)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
checkpoints = (('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1))
for label, index in checkpoints:
    hit = sims[index]  # (document tag, similarity) pair
    matched_words = train_corpus[hit[0]].words
    print(u'%s %s: «%s»\n' % (label, hit, ' '.join(matched_words)))

Test Document (927): «zero shot learning and knowledge transfer in music classification and tagging music classification and tagging is conducted through categorical supervised learning with fixed set of labels in principle this cannot make predictions on unseen labels zero shot learning is an approach to solve the problem by using side information about the semantic labels we recently investigated this concept of zero shot learning in music classification and tagging task by projecting both audio and label space on single semantic space in this work we extend the work to verify the generalization ability of zero shot learning model by conducting knowledge transfer to different music corpora»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (927, 0.9441136717796326): «zero shot learning and knowledge transfer in music classification and tagging music classification and tagging is conducted through categorical supervised learning with fixed set of labels i

In [11]:
pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp39-cp39-win_amd64.whl (671 kB)
     -------------------------------------- 671.2/671.2 kB 3.3 MB/s eta 0:00:00
Collecting cramjam>=2.3
  Downloading cramjam-2.9.1-cp39-cp39-win_amd64.whl (2.1 MB)
     ---------------------------------------- 2.1/2.1 MB 5.6 MB/s eta 0:00:00
Collecting pandas>=1.5.0
  Downloading pandas-2.2.3-cp39-cp39-win_amd64.whl (11.6 MB)
     ---------------------------------------- 11.6/11.6 MB 2.3 MB/s eta 0:00:00
Collecting tzdata>=2022.7
  Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
     -------------------------------------- 347.8/347.8 kB 2.4 MB/s eta 0:00:00
Installing collected packages: tzdata, cramjam, pandas, fastparquet
  Attempting uninstall: pandas
    Found existing installation: pandas 1.4.4
    Uninstalling pandas-1.4.4:
      Successfully uninstalled pandas-1.4.4
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\user\\anaconda3\\Lib\\site-packages\\~andas\\_libs\\algos.cp39-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



In [15]:
import pandas as pd

# Read the raw CSV directly from its hf:// dataset path into a DataFrame.
csv_source = "hf://datasets/CShorten/ML-ArXiv-Papers/ML-Arxiv-Papers.csv"
file = pd.read_csv(csv_source)


In [19]:
# Persist the downloaded frame as a local parquet file (pyarrow engine,
# without writing the DataFrame index).
file.to_parquet(
    "ML-Arxiv-Papers.parquet",
    engine="pyarrow",
    index=False,
)