In [None]:
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Load the corpus: documents come from the `text` column, labels from `topic`.
data = pd.read_excel('reuters.xlsx')
X, y = data.text, data.topic

# Fetch the NLTK resources used below (WordNet for the lemmatizer and the
# English stop-word list for the vectorizer).
nltk.download('wordnet')
nltk.download('stopwords')

# Lemmatize every whitespace-separated token of every document.
# str() keeps non-string cells (e.g. NaN) from breaking .split().
wnl = WordNetLemmatizer()
lem_texts = [
    ' '.join(wnl.lemmatize(word) for word in str(t).split())
    for t in X
]

# Bag-of-words counts.
#   max_features - number of words used for classification
#   min_df       - minimum number of texts a word must occur in
#   max_df       - maximum share of texts a word may occur in
#   stop_words   - noise words to discard
cv = CountVectorizer(max_features=2000, min_df=10, max_df=0.8,
                     stop_words=nltk.corpus.stopwords.words('english'))
# Keep the counts sparse: TfidfTransformer accepts a sparse matrix directly,
# so materialising a dense intermediate copy here would only waste memory.
X = cv.fit_transform(lem_texts)

# Re-weight the counts with TF-IDF; densify once, at the very end, so that
# estimators requiring dense input can consume X.
# NOTE(review): both transformers are fitted on the full corpus. If X is split
# into train/test afterwards, document-frequency statistics will include the
# test documents - consider fitting on the training split only.
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

print(f'Topics: {len(set(y))}, X_shape: {X.shape}')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Topics: 82, X_shape: (10717, 2000)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation. Fixing random_state makes the
# shuffle deterministic, so the same split is produced after every kernel
# restart and results obtained on it stay comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
import sklearn.naive_bayes
from sklearn.metrics import (classification_report, confusion_matrix,
                             accuracy_score)

# Naive Bayes variants, all with library-default hyperparameters.
# CategoricalNB is deliberately left out: it models integer-encoded
# categorical features, whereas X holds continuous TF-IDF weights, so its
# score is not meaningful here (about 0.37 on a previous run, far below the
# other variants).
models = [
    sklearn.naive_bayes.BernoulliNB(),
    sklearn.naive_bayes.ComplementNB(),
    sklearn.naive_bayes.GaussianNB(),
    sklearn.naive_bayes.MultinomialNB(),
]

# Train each model on the training split and report hold-out accuracy.
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f'\nModel: \t{model}')
    # Optional per-class diagnostics:
    #print(confusion_matrix(y_test,y_pred))
    #print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))


Model: 	CategoricalNB(alpha=1.0, class_prior=None, fit_prior=True)
0.37406716417910446

Model: 	BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
0.7416044776119403

Model: 	ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)
0.8190298507462687

Model: 	GaussianNB(priors=None, var_smoothing=1e-09)
0.6427238805970149

Model: 	MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
0.7779850746268657


In [None]:
import sklearn.linear_model
from sklearn.metrics import (classification_report, confusion_matrix,
                             accuracy_score)

# Linear classifiers, each instantiated with its library defaults.
models = [
    sklearn.linear_model.LogisticRegression(),
    sklearn.linear_model.PassiveAggressiveClassifier(),
    sklearn.linear_model.Perceptron(),
    sklearn.linear_model.RidgeClassifier(),
    sklearn.linear_model.SGDClassifier(),
]

# Benchmark loop: train on the training split, score on the held-out split.
# For per-class detail, print confusion_matrix(y_test, y_pred) or
# classification_report(y_test, y_pred) inside the loop.
for model in models:
    # fit() hands back the estimator itself, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print(f'\nModel: \t{model}')
    print(accuracy_score(y_test, y_pred))


Model: 	LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
0.8586753731343284

Model: 	PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=1000, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)
0.8638059701492538

Model: 	Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, sh

In [None]:
import sklearn.svm
from sklearn.metrics import (classification_report, confusion_matrix,
                             accuracy_score)

# Support-vector classifiers with library-default hyperparameters.
# OneClassSVM is deliberately excluded: it is an unsupervised outlier
# detector whose predict() returns +1/-1 instead of topic labels, so its
# accuracy against y_test is always meaningless (0.0 on a previous run).
models = [
    sklearn.svm.LinearSVC(),
    sklearn.svm.SVC(),
]

# Train each model on the training split and report hold-out accuracy.
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f'\nModel: \t{model}')
    # Optional per-class diagnostics:
    #print(confusion_matrix(y_test,y_pred))
    #print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))


Model: 	LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
0.8857276119402985

Model: 	SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.8642723880597015

Model: 	OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='scale', kernel='rbf',
            max_iter=-1, nu=0.5, shrinking=True, tol=0.001, verbose=False)
0.0


In [None]:
import sklearn.dummy
import sklearn.ensemble
import sklearn.multiclass
import sklearn.multioutput
import sklearn.neighbors
import sklearn.neural_network
import sklearn.semi_supervised
import sklearn.tree

from sklearn.metrics import (classification_report, confusion_matrix,
                             accuracy_score)

# Remaining scikit-learn classifiers, each with its library defaults.
# Entries that are commented out are currently disabled and are not run.
models = [
    # sklearn.multiclass.OneVsRestClassifier(),
    # sklearn.multiclass.OneVsOneClassifier(),
    # sklearn.multiclass.OutputCodeClassifier(),
    # sklearn.multioutput.ClassifierChain(),
    # sklearn.multioutput.MultiOutputClassifier(),
    sklearn.neighbors.KNeighborsClassifier(),
    # sklearn.neighbors.RadiusNeighborsClassifier(),
    sklearn.neighbors.NearestCentroid(),
    sklearn.neural_network.MLPClassifier(),
    sklearn.tree.DecisionTreeClassifier(),
    sklearn.tree.ExtraTreeClassifier(),
    sklearn.dummy.DummyClassifier(),
    sklearn.ensemble.AdaBoostClassifier(),
    sklearn.ensemble.BaggingClassifier(),
    sklearn.ensemble.ExtraTreesClassifier(),
    sklearn.ensemble.GradientBoostingClassifier(),
    sklearn.ensemble.RandomForestClassifier(),
    # sklearn.ensemble.StackingClassifier(),
    # sklearn.ensemble.VotingClassifier(),
    # sklearn.ensemble.HistGradientBoostingClassifier(),
    # sklearn.semi_supervised.SelfTrainingClassifier(),
    sklearn.semi_supervised.LabelPropagation(),
]

# Benchmark loop: train on the training split, score on the held-out split.
# For per-class detail, print confusion_matrix(y_test, y_pred) or
# classification_report(y_test, y_pred) inside the loop.
for model in models:
    # fit() hands back the estimator itself, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print(f'\nModel: \t{model}')
    print(accuracy_score(y_test, y_pred))


Model: 	KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
0.8339552238805971

Model: 	NearestCentroid(metric='euclidean', shrink_threshold=None)
0.7467350746268657

Model: 	MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)
0.8703358208955224

Model: 	DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, ma




Model: 	DummyClassifier(constant=None, random_state=None, strategy='warn')
0.18703358208955223

Model: 	AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)
0.4766791044776119

Model: 	BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=10,
                  n_jobs=None, oob_score=False, random_state=None, verbose=0,
                  warm_start=False)
0.8283582089552238

Model: 	ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs