In [1]:
from sklearn import datasets

categories = ["soc.religion.christian", "sci.space", "talk.politics.mideast", "rec.sport.baseball"]
cat_dict = {}  # category name -> raw training posts for that category
cat_dict_test = {}  # category name -> raw test posts for that category
for cat in categories:
    # For each newsgroup, pull the train split and then the test split,
    # with headers, footers and quoted replies stripped from every post.
    for subset_name, target in (('train', cat_dict), ('test', cat_dict_test)):
        bunch = datasets.fetch_20newsgroups(
            subset=subset_name,
            remove=('headers', 'footers', 'quotes'),
            categories=[cat],
        )
        target[cat] = bunch.data

In [7]:
# Sanity check: number of raw training posts loaded for the 'soc.religion.christian' category.
len(cat_dict['soc.religion.christian'])

599

In [3]:
import gensim

def tokenize(text, stopwords, max_len = 20):
    """Split ``text`` into preprocessed tokens and drop stop words.

    Tokenisation is delegated to ``gensim.utils.simple_preprocess``; ``max_len``
    is forwarded to it as the maximum token length it keeps.

    Args:
        text: Raw document string.
        stopwords: Iterable of tokens to exclude. ``None`` or an empty
            iterable disables stop-word filtering. One-shot iterables
            (e.g. generators) are accepted.
        max_len: Maximum token length passed to ``simple_preprocess``.

    Returns:
        List of tokens, in document order, with stop words removed.
    """
    if stopwords is None:
        stopwords = ()
    # Materialise the stop words once: membership tests become constant-time
    # and a generator is not silently exhausted after the first token.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = frozenset(stopwords)
    return [token for token in gensim.utils.simple_preprocess(text, max_len=max_len) if token not in stopwords]

cat_dict_tagged_train = {}  # category -> list of TaggedDocument built from the training posts (training corpus source)
cat_dict_test_clean = {}  # category -> list of token lists built from the test posts (un-tagged)

# Tag every training document with an integer ID that is unique across all
# categories: `offset` holds the number of documents tagged so far, so the
# IDs keep counting up from one category to the next.
offset = 0
for category, raw_docs in cat_dict.items():
    cat_dict_tagged_train[category] = [
        gensim.models.doc2vec.TaggedDocument(tokenize(text, [], max_len=200), [doc_id])
        for doc_id, text in enumerate(raw_docs, start=offset)
    ]
    offset += len(raw_docs)

# Test documents are only tokenized, never tagged, so no ID bookkeeping is needed here.
for category, raw_docs in cat_dict_test.items():
    cat_dict_test_clean[category] = [tokenize(text, [], max_len=200) for text in raw_docs]

# Flatten the per-category lists into the single corpus used to train the model.
train_corpus = [
    tagged_doc
    for tagged_docs in cat_dict_tagged_train.values()
    for tagged_doc in tagged_docs
]

In [10]:
# Inspect the first TaggedDocument of the flattened training corpus (its tokens plus its integer tag).
train_corpus[0]

TaggedDocument(words=['wrote', 'in', 'response', 'to', 'dlecoint', 'garnet', 'acns', 'fsu', 'edu', 'darius_lecointe', 'was', 'paul', 'god', 'too', 'is', 'an', 'interpretation', 'of', 'the', 'words', 'of', 'paul', 'of', 'higher', 'priority', 'than', 'the', 'direct', 'word', 'of', 'jesus', 'in', 'matt', 'paul', 'begins', 'romans', 'with', 'if', 'someone', 'is', 'weak', 'in', 'the', 'faith', 'do', 'you', 'count', 'yourself', 'as', 'one', 'who', 'is', 'weak', 'in', 'the', 'faith', 'yes', 'but', 'what', 'does', 'the', 'bible', 'have', 'to', 'say', 'what', 'did', 'jesus', 'say', 'paul', 'closes', 'romans', 'with', 'on', 'the', 'other', 'hand', 'the', 'person', 'with', 'doubts', 'about', 'something', 'who', 'eats', 'it', 'anyway', 'is', 'guilty', 'because', 'he', 'isn', 'acting', 'on', 'his', 'faith', 'and', 'any', 'failure', 'to', 'act', 'on', 'faith', 'is', 'sin', 'gaus', 'isbn', 'have', 'you', 'read', 'the', 'ten', 'commandments', 'which', 'are', 'portion', 'of', 'the', 'law', 'have', 'you

In [7]:
# Build and train a Doc2Vec model on the tagged training corpus.
# A fixed `seed` makes the random initialisation repeatable between runs.
# Per the gensim documentation, fully deterministic training additionally
# requires workers=1 and a fixed PYTHONHASHSEED; those are left at their
# defaults here so training speed is unchanged.
model = gensim.models.doc2vec.Doc2Vec(vector_size=30, min_count=2, epochs=40, window=2, seed=42)
# Scan the corpus once to build the vocabulary before training.
model.build_vocab(train_corpus)
# Train using the document count recorded by build_vocab and the epochs configured above.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [9]:
metadata = {}  # category -> number of test documents that received an inferred vector
inferred_vectors_test = {}  # category -> inferred document vectors, one per test document
for category, token_lists in cat_dict_test_clean.items():
    # Infer one vector per tokenized test document, keeping document order.
    category_vectors = []
    for tokens in token_lists:
        category_vectors.append(model.infer_vector(tokens))
    inferred_vectors_test[category] = category_vectors
    metadata[category] = len(category_vectors)

In [11]:
import csv

def write_to_csv(input, output_file, delimiter='\t'):
    """Write rows to ``output_file`` as delimiter-separated text.

    Args:
        input: Iterable of rows, each row an iterable of cell values.
        output_file: Path of the file to create or overwrite.
        delimiter: Single-character field separator (tab by default).
    """
    # newline="" is required by the csv module: the writer emits its own line
    # terminator, and without it platforms that translate "\n" (Windows)
    # produce "\r\r\n" and therefore blank lines between rows.
    with open(output_file, "w", newline="") as f:
        writer = csv.writer(f, delimiter=delimiter)
        writer.writerows(input)
        
veclist_metadata = []  # one single-cell row (the category label) per exported vector
veclist = []  # one row of vector components per test document
for cat in cat_dict.keys():
    # Emit as many label rows as there are inferred vectors for this category,
    # so both output files stay aligned row-for-row.
    veclist_metadata.extend([cat] for _row in range(metadata[cat]))
    veclist.extend(list(vec) for vec in inferred_vectors_test[cat])
write_to_csv(veclist, "doc2vec_20Newsgroups_vectors.csv")
write_to_csv(veclist_metadata, "doc2vec_20Newsgroups_vectors_metadata.csv")

In [13]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Suppose you have a list of texts
texts = [
    "Текст 1",
    "Текст 2",
    "Текст 3",
    # add your texts here
]

# Prepare the texts as TaggedDocument objects; each document is tagged with
# its list index converted to a string.
documents = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(texts)]

# Initialise and train the Doc2Vec model.
# NOTE: this rebinds the name `model`, replacing the model trained earlier in the notebook.
model = Doc2Vec(documents, vector_size=100, window=5, min_count=1, workers=4)

# Now you can get the vector representation of each document,
# for example the document tagged '0'.
# `dv` is the gensim 4 name for the document vectors; the older `docvecs`
# alias is deprecated and emits a warning.
vector_for_doc_0 = model.dv['0']
print("Вектор для документа 0:", vector_for_doc_0)


Вектор для документа 0: [-5.2308156e-03 -5.9791268e-03 -9.8807542e-03  8.5528456e-03
  3.5661161e-03  2.6303172e-04 -9.8806275e-03 -5.1666484e-03
 -9.7179627e-03  2.0107795e-03  2.8303110e-03  4.6435557e-03
 -4.2972756e-03 -3.1457066e-03 -3.0787874e-03 -8.7219151e-03
  2.1724831e-03  9.2256228e-03 -9.5018670e-03 -3.4580862e-03
 -3.7699090e-03  2.6073826e-03 -5.6915567e-03  2.6206803e-03
  5.8025215e-03 -8.1068603e-03 -8.3297910e-03 -9.9546695e-03
  4.9330448e-03 -9.1223074e-03  5.8419635e-03  6.8002627e-03
 -6.5064002e-03 -4.5198812e-03 -1.2548614e-03  1.6463208e-03
 -1.4813376e-03 -8.5425414e-03 -3.6026132e-03  1.7316258e-03
 -2.0569193e-03 -7.2300420e-03  4.1846000e-03 -8.5743405e-03
  2.7115368e-03 -4.6137203e-03  6.4542773e-04 -2.0573472e-03
  5.4132282e-03 -8.0025708e-03 -2.1198511e-03 -9.5815660e-05
 -6.6387774e-03 -6.5261638e-03 -1.9329584e-03  8.8034747e-03
 -1.2631691e-03  3.5359799e-03 -5.7503129e-03  8.8148145e-03
  2.9154683e-03  9.2796851e-03  4.3498552e-03 -4.1995691e-03


  vector_for_doc_0 = model.docvecs['0']
