In [1]:
from src.parsers import NewsgroupsParser, ReutersParser
from src.engines.doc2vec import Doc2VecModel

NEWSGROUPS = NewsgroupsParser()
REUTERS = ReutersParser()

In [2]:
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def train(model: Doc2VecModel):
    # get the unique labels
    labels = []
    for entry in model.dataset.entries:
        for label in entry.labels:
            if label not in labels:
                labels.append(label)
    print(*enumerate(labels))

    # get train and test sets
    X = np.array([vector for vector in map(
        lambda e: model.model.dv[e.id],
        model.dataset.entries)])
    y = np.array([
        y_labs for y_labs in map(
        lambda e: np.array(np.array(
            [int(label in e.labels) for label in labels])),
        model.dataset.entries)]
    )
    # print(X)
    print(y, y.shape)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # train classifier
    clf = OneVsRestClassifier(SVC(kernel="poly", decision_function_shape="ovo"))
    clf.fit(X_train, y_train)

    # print report of training
    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred, zero_division=0)

    return y_pred, y_test, report


In [3]:
news_model = Doc2VecModel(NEWSGROUPS, use_predictor=True)
# train(news_model)

In [4]:
reuters_model = Doc2VecModel(REUTERS, use_predictor=True)
# y_pred, y_test, report = train(reuters_model)
# print(report)

In [5]:
from gensim.utils import simple_preprocess



In [34]:
INDEX = 123
example = REUTERS.entries[INDEX].main_content
labels = REUTERS.entries[INDEX].labels
vector = reuters_model.model.infer_vector(simple_preprocess(example))

print(reuters_model.predict_labels(''''''))
print(labels)
example


[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
['crude', 'acq', 'earn', 'pet-chem']
['acq', 'crude', 'earn', 'pet-chem']


'ENERGY/U.S. PETROCHEMICAL INDUSTRY\n  Cheap oil feedstocks, the weakened U.S.\n  dollar and a plant utilization rate approaching 90 pct will\n  propel the streamlined U.S. petrochemical industry to record\n  profits this year, with growth expected through at least 1990,\n  major company executives predicted.\n      This bullish outlook for chemical manufacturing and an\n  industrywide move to shed unrelated businesses has prompted GAF\n  Corp &lt;GAF>, privately-held Cain Chemical Inc, and other firms\n  to aggressively seek acquisitions of petrochemical plants.\n      Oil companies such as Ashland Oil Inc &lt;ASH>, the\n  Kentucky-based oil refiner and marketer, are also shopping for\n  money-making petrochemical businesses to buy.\n      "I see us poised at the threshold of a golden period," said\n  Paul Oreffice, chairman of giant Dow Chemical Co &lt;DOW>, adding,\n  "There\'s no major plant capacity being added around the world\n  now. The whole game is bringing out new products a