# Textklassifikation mit Vektorisierung in scikit-learn und MLPClassifier 
Die Labels (Themen und Disziplinen) sind auf die höchste Hierarchieebene reduziert (reduced_labels).

Autorin: Maria Hartmann

In [1]:
# Imports
import os
import time
import csv
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.preprocessing import MultiLabelBinarizer # k-hot encodes the label lists
from sklearn.pipeline import Pipeline # chains transformers and a final estimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer # transforms a count matrix into a normalized tf-idf representation
from sklearn.neural_network import MLPClassifier # multi-layer perceptron classifier
from sklearn.model_selection import RandomizedSearchCV # randomized hyper-parameter search
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

np.random.seed(7) # fix NumPy's global random seed for reproducibility

Einlesen des Trainings- und Testdatensatzes

In [2]:
# Locations of the pre-split corpus files (semicolon-separated CSV).
trainset = '../Datasets/reduced_labels_trainset.csv'
testset = '../Datasets/reduced_labels_testset.csv'

# Training split: text, file name and label list of every blog post.
trainset_csv = pd.read_csv(trainset, delimiter=';')
X_train, z_train = (trainset_csv[column].values for column in ('text', 'filename'))
# A 'classes' cell stores all labels of one post joined by ', ';
# split it into a list of label strings.
y_train = [label_string.split(', ') for label_string in trainset_csv['classes'].values]

# Test split: same columns, same treatment.
testset_csv = pd.read_csv(testset, delimiter=';')
X_test, z_test = (testset_csv[column].values for column in ('text', 'filename'))
y_test = [label_string.split(', ') for label_string in testset_csv['classes'].values]

In [3]:
# Show the first training sample: file name, label list and text, one per line.
print(z_train[0], y_train[0], X_train[0], sep='\n')

archivalia_575.txt
['pluridisciplinarité_d', 'épistémologie et méthodes_t', 'histoire_t', 'histoire et archéologie_d']
diese titelformulierung der hab in ihrer handschriftendatenbank ist besonders sinnreich http diglib hab de db mss list ms id aug f mit digitalisat ich möchte nicht wissen wieviele forscher sich auf die fehlanzeige des verlinkten opacs der grundsätzlich nichts zu den handschriften ausspuckt während die ältere dokumentation funktioniert verlassen und so unnötig rechercheaufwand betreiben müssen kürzt man die signatur findet man den hinweis auf die münchner schedel ausstellung welt des wissens bevor ich das tat erkannte ich hartmann schedels ziemlich unverwechselbare schriftzüge und bemerkte bei einem blick auf die münchner digitalisate den typischen signaturzettel auf dem titel das bemerkenswerte familienbuch der nürnberger familie grabner erscheint in der liste von schedels büchern bei stauber https archive org stream dieschedelscheb hartgoog page n mode up die handschr

k-hot-Kodierung der Labels

In [4]:
# k-hot encode the label lists: the binarizer learns the label vocabulary from
# the training split only, then both splits are mapped to 0/1 indicator rows.
label_encoder = MultiLabelBinarizer()
label_encoder.fit(y_train)
encoded_y_train, encoded_y_test = (
    label_encoder.transform(label_lists) for label_lists in (y_train, y_test)
)
print(encoded_y_train[0])


[0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]


In [5]:
# Print the number of distinct labels, then every label with its column index
# in the encoded label matrix.
label_names = label_encoder.classes_
print(len(label_names))
for index in range(len(label_names)):
    print(index, label_names[index])

35
0 administration publique et développement_d
1 anthropologie_t
2 arts et humanités_d
3 asie_t
4 bibliothéconomie_d
5 droit_t
6 ethnologie_t
7 europe_t
8 géographie_t
9 histoire et archéologie_d
10 histoire_t
11 information_t
12 langage_t
13 langue et linguistique_d
14 littérature_d
15 moyen âge_t
16 pensée_t
17 pluridisciplinarité_d
18 psychisme_t
19 psychologie_d
20 religions_t
21 représentations_t
22 sciences de l'information et de la communication_d
23 sciences de la santé et de la santé publique_d
24 sciences politiques_d
25 sociologie et anthropologie_d
26 sociologie_t
27 travail social et politique sociale_d
28 éducation_d
29 éducation_t
30 épistémologie et méthodes_t
31 époque contemporaine_t
32 époque moderne_t
33 études des sciences_t
34 études du politique_t


Vektorisierung und Klassifikation der Daten mit scikit-learn

In [6]:
# Upper bound on the vocabulary size kept by the CountVectorizer.
max_features = 10000
# Read the custom stop-word list (one entry per line). The context manager
# guarantees the file handle is closed instead of leaking it.
with open('../Preprocessing/filtered_words.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = stopword_file.read().splitlines()
# Unigram bag-of-words counts, followed by tf-idf weighting.
vectorizer = CountVectorizer(ngram_range=(1,1), max_features=max_features, stop_words=stopwords)
tfidf_transformer = TfidfTransformer(use_idf=True)

In [7]:
# first try with best params for vect and tfidf from kNN classification
# NOTE: the pipeline definition below is wrapped in a triple-quoted string, so it
# is not executed; running the cell only echoes the string as the cell output.
"""text_clf = Pipeline([#('vect', CountVectorizer(ngram_range=(1,4), max_df=0.9, min_df=0.01)),#min_df=0.0 auf min_df=0.01 geändert
                     ('vect', CountVectorizer(ngram_range=(1,4), max_features=max_features)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', MLPClassifier(hidden_layer_sizes=(1024,512), max_iter=500, validation_fraction=0.1, early_stopping=True, verbose=True, random_state=1)),
                    ])"""

"text_clf = Pipeline([#('vect', CountVectorizer(ngram_range=(1,4), max_df=0.9, min_df=0.01)),#min_df=0.0 auf min_df=0.01 geändert\n                     ('vect', CountVectorizer(ngram_range=(1,4), max_features=max_features)),\n                     ('tfidf', TfidfTransformer(use_idf=True)),\n                     ('clf', MLPClassifier(hidden_layer_sizes=(1024,512), max_iter=500, validation_fraction=0.1, early_stopping=True, verbose=True, random_state=1)),\n                    ])"

In [18]:
# MLP with two hidden layers; early stopping monitors a 10 % validation split
# taken from the training data.
mlp = MLPClassifier(
    hidden_layer_sizes=(4096, 1024),
    tol=0.0001,
    early_stopping=True,
    validation_fraction=0.1,
    verbose=True,
    random_state=1,
)
# Full chain: token counts -> tf-idf weights -> classifier.
text_clf = Pipeline(steps=[
    ('vect', vectorizer),
    ('tfidf', tfidf_transformer),
    ('clf', mlp),
])

In [19]:
# Fit the whole pipeline on the training split and record the wall-clock time.
start = time.time()
text_clf.fit(X_train, encoded_y_train)  # fit() returns the pipeline itself
elapsed_seconds = time.time() - start
processing_time = elapsed_seconds / 60  # minutes

Iteration 1, loss = 7.71963544
Validation score: 0.417884
Iteration 2, loss = 3.51733982
Validation score: 0.542373
Iteration 3, loss = 1.53049433
Validation score: 0.548802
Iteration 4, loss = 0.61838623
Validation score: 0.556400
Iteration 5, loss = 0.31824075
Validation score: 0.552893
Iteration 6, loss = 0.23207093
Validation score: 0.537697
Iteration 7, loss = 0.21266148
Validation score: 0.534775
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


In [20]:
# Collect the parameters of the pipeline and of all nested steps for logging.
clf_params = text_clf.get_params(deep=True)
print(clf_params)

{'memory': None, 'steps': [('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['und', 'die', 'der'], strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(4096, 1024), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=True,
       warm_start=False))], 'vect': CountVectorizer(analyzer

In [21]:
# Predict the label indicator rows for the held-out test texts.
predicted = text_clf.predict(X_test)
# Per-class probabilities are not used below; kept for reference:
#predicted_proba = text_clf.predict_proba(X_test)

In [22]:
# Precision measures result relevancy. With average='samples' it is computed
# per test document and then averaged over all documents.
precision = precision_score(y_true=encoded_y_test, y_pred=predicted, average='samples')
print(precision)

0.8596379037823638


  'precision', 'predicted', average, warn_for)


In [23]:
# Recall measures how many of the truly relevant labels are returned,
# again computed per test document and averaged ('samples').
recall = recall_score(y_true=encoded_y_test, y_pred=predicted, average='samples')
print(recall)

0.8138036428534324


In [24]:
# The F1 score is the harmonic mean of precision and recall; with
# average='samples' it is computed per test document and then averaged.
f1 = f1_score(y_true=encoded_y_test, y_pred=predicted, average='samples')
print(f1)

0.8247182393457454


  'precision', 'predicted', average, warn_for)


In [25]:
# Directory that receives all result files of the MLP experiments.
output = '../MLP'
# exist_ok=True makes the call idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(output, exist_ok=True)

In [16]:
# write first parameters and scores to file
# NOTE: the writer below is wrapped in a triple-quoted string, so nothing is
# written; running the cell only echoes the string as the cell output.
"""
#with open(output+'/MLP_reduced_labels_first_params.txt',"w+", encoding="utf8") as params:
with open(output+'/MLP_reduced_labels_first_params_max_features.txt',"w+", encoding="utf8") as params:
    params.write("First parameters for classification with MLP (reduced labels):")
    params.write("\nprocessing_time: %s" % processing_time)
    for key, value in clf_params.items():
        params.write("\n%s: %s" % (key, value))
    params.write("\nactivation function output layer: %s" % text_clf.named_steps.clf.out_activation_)    
    params.write("\nprecision: %s" % precision)
    params.write("\nrecall: %s" % recall)
    params.write("\nf1-score: %s" % f1)"""

'\n#with open(output+\'/MLP_reduced_labels_first_params.txt\',"w+", encoding="utf8") as params:\nwith open(output+\'/MLP_reduced_labels_first_params_max_features.txt\',"w+", encoding="utf8") as params:\n    params.write("First parameters for classification with MLP (reduced labels):")\n    params.write("\nprocessing_time: %s" % processing_time)\n    for key, value in clf_params.items():\n        params.write("\n%s: %s" % (key, value))\n    params.write("\nactivation function output layer: %s" % text_clf.named_steps.clf.out_activation_)    \n    params.write("\nprecision: %s" % precision)\n    params.write("\nrecall: %s" % recall)\n    params.write("\nf1-score: %s" % f1)'

In [26]:
# Append the configuration of the fitted pipeline and the test scores to the
# running parameter log (mode "a": earlier runs stay in the file).
with open(output+'/MLP_reduced_labels_params.txt',"a", encoding="utf8") as params:
    fitted_steps = text_clf.named_steps
    fitted_mlp = fitted_steps.clf
    report_chunks = [
        "\n*********************************************************************************************",
        "\nParameters for classification with MLP (reduced labels):",
        "\n*********************************************************************************************",
        # repr of each pipeline step
        "\n%s" % fitted_steps.vect,
        "\n%s" % fitted_steps.tfidf,
        "\n%s" % fitted_mlp,
        # attributes of the fitted network
        "\nclasses: %s" % fitted_mlp.n_outputs_,
        "\nlayers: %s" % fitted_mlp.n_layers_,
        "\nactivation function output layer: %s" % fitted_mlp.out_activation_,
        "\nepochs: %s" % fitted_mlp.n_iter_,
        "\nprocessing time: %s" % processing_time,
        # evaluation on the test split
        "\nSCORES:",
        "\nprecision: %s" % precision,
        "\nrecall: %s" % recall,
        "\nf1-score: %s" % f1,
        "\n",
    ]
    params.writelines(report_chunks)

In [27]:
# Map the predicted indicator rows back to tuples of label names.
inverse_prediction = label_encoder.inverse_transform(predicted)
# Show prediction and ground truth for the first test document.
print('PREDICTED:', inverse_prediction[0], 'TRUE:', y_test[0], sep='\n')

# Write true and predicted labels of every test document to a text file.
with open(output+'/MLP_reduced_labels_predictions.txt',"w+", encoding="utf8") as preds:
    preds.write("Predictions from classification with Multi-Layer-Perzeptron and vectorization in scikit-learn (reduced labels):\n\n")
    for ident, true_labels, predicted_labels in zip(z_test, y_test, inverse_prediction):
        # Both label sets are sorted so the two lines are easy to compare;
        # every label is followed by ', ' (including the last one).
        true_line = ''.join('%s, ' % name for name in sorted(true_labels))
        pred_line = ''.join('%s, ' % name for name in sorted(predicted_labels))
        preds.write(ident + '\n')
        preds.write('TRUE: ' + true_line + '\n')
        preds.write('PRED: ' + pred_line + '\n')
        preds.write('\n*********************\n')
    

PREDICTED:
('histoire et archéologie_d', 'histoire_t', 'pluridisciplinarité_d', 'épistémologie et méthodes_t')
TRUE:
['pluridisciplinarité_d', 'épistémologie et méthodes_t', 'histoire_t', 'histoire et archéologie_d']


Speicherung der vektorisierten Textdaten

In [28]:
# Remove every '.txt' occurrence from the file names of both splits.
z_train, z_test = (
    [name.replace('.txt', '') for name in names]
    for names in (z_train, z_test)
)
# Build URL-style identifiers by replacing each '_' with '.hypotheses.org/'
# (e.g. 'archivalia_575' -> 'archivalia.hypotheses.org/575').
ident_train, ident_test = (
    [stem.replace('_', '.hypotheses.org/') for stem in stems]
    for stems in (z_train, z_test)
)

# Sanity check: number of training identifiers and the first one.
print(len(ident_train), ident_train[0], sep='\n')

17109
archivalia.hypotheses.org/575


In [29]:
# Vectorize both splits with the vectorizer / tf-idf objects defined above.
# Only transform() is called here, so both objects must already be fitted
# (presumably via the text_clf pipeline fit, which uses the same objects).
train_vect, test_vect = (vectorizer.transform(texts) for texts in (X_train, X_test))
train_tfidf, test_tfidf = (
    tfidf_transformer.transform(counts) for counts in (train_vect, test_vect)
)
print(train_tfidf.shape)

(17109, 10000)


In [30]:
# Show the container type of the tf-idf matrices, then the repr of the
# training matrix as the cell output.
tfidf_type = type(test_tfidf)
print(tfidf_type)
train_tfidf

<class 'scipy.sparse.csr.csr_matrix'>


<17109x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 2386946 stored elements in Compressed Sparse Row format>

In [31]:
def _save_split(matrix, idents, label_lists, matrix_file, csv_path):
    """Persist one vectorized split.

    The sparse tf-idf ``matrix`` is stored as an ``.npz`` file at
    ``'../' + matrix_file``. A semicolon-separated CSV at ``csv_path`` gets one
    row per document: its identifier, its labels joined by ', ', and
    ``matrix_file`` (the matrix file that holds the document's vector).
    """
    scipy.sparse.save_npz('../' + matrix_file, matrix)
    with open(csv_path, 'w', newline='', encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file, delimiter=";")
        writer.writerow(["url", "classes", "filename"])
        writer.writerows(
            [ident, ", ".join(labels), matrix_file]
            for ident, labels in zip(idents, label_lists)
        )


# Training split: vectorized text plus identifiers and labels.
output_file_train = 'Datasets/reduced_labels_train_scikit-learn_sparse_matrix.npz'
_save_split(train_tfidf, ident_train, y_train, output_file_train,
            '../Datasets/reduced_labels_train_idents_labels.csv')

# Test split: same layout.
output_file_test = 'Datasets/reduced_labels_test_scikit-learn_sparse_matrix.npz'
_save_split(test_tfidf, ident_test, y_test, output_file_test,
            '../Datasets/reduced_labels_test_idents_labels.csv')

Parameteroptimierung mit Zufallssuche (RandomizedSearchCV)

In [6]:
# Base pipeline for the hyper-parameter search; the searched parameters
# override the defaults of the individual steps.
search_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MLPClassifier(validation_fraction=0.1, early_stopping=True, verbose=True, random_state=1)),
]
clf = Pipeline(steps=search_steps)

In [7]:
# parameter tuning with RandomSearch
# Read the stop-word list through a context manager so the file handle is
# closed instead of leaking.
with open('../Preprocessing/filtered_words.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = stopword_file.read().splitlines()
# NOTE(review): `stopwords` is loaded here, but the search pipeline's
# CountVectorizer() is created without stop_words - confirm this is intended.

# Search space sampled by RandomizedSearchCV; keys follow the
# '<pipeline step>__<parameter>' convention.
rs_parameters = {'vect__ngram_range': [(1,1),(1,2),(1,3),(1,4)], 
                 #'vect__max_df' : (0.7, 0.8, 0.85, 0.9, 0.95), #1.0 
                 #'vect__min_df' : (0.01, 0.025, 0.05, 0.075, 0.1, 0.2), #0.0
                 'vect__max_features': (100000,50000,25000,10000,7500,5000,2500,1000,500,300,100), 
                 'tfidf__use_idf': (True, False),
                 'clf__hidden_layer_sizes': ((2048,1024),(2048,512),(1024,512),(512,128),(4096,1024),(4096,512),(2048,1024,512),(1024,512,128))
                }

In [8]:
# Randomized search: 10 sampled parameter combinations, 2-fold cross-validation.
rs_clf = RandomizedSearchCV(
    estimator=clf,
    param_distributions=rs_parameters,
    cv=2,
    n_iter=10,
    n_jobs=1,
    verbose=10,
    random_state=1,
)
start = time.time()
rs_clf.fit(X_train, encoded_y_train)  # fit() returns the search object itself
elapsed_seconds = time.time() - start
rs_processing_time = elapsed_seconds / 60  # minutes

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=False, clf__hidden_layer_sizes=(4096, 1024) 
Iteration 1, loss = 15.12070958
Validation score: 0.252336
Iteration 2, loss = 9.24104634
Validation score: 0.233645
Iteration 3, loss = 7.09170742
Validation score: 0.369159
Iteration 4, loss = 5.57069004
Validation score: 0.373832
Iteration 5, loss = 4.31373701
Validation score: 0.401869
Iteration 6, loss = 3.07877318
Validation score: 0.406542
Iteration 7, loss = 1.97087511
Validation score: 0.406542
Iteration 8, loss = 1.13798802
Validation score: 0.406542
Iteration 9, loss = 0.62190408
Validation score: 0.392523
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=False, clf__hidden_layer_sizes=(4096, 1024), score=0.423562412342216, total=18.0min
[CV] vect__ngram_range=(1, 3), vect__max_feature

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 18.1min remaining:    0.0s


Iteration 1, loss = 15.01348098
Validation score: 0.331776
Iteration 2, loss = 9.08658896
Validation score: 0.242991
Iteration 3, loss = 6.95678728
Validation score: 0.392523
Iteration 4, loss = 5.43340642
Validation score: 0.397196
Iteration 5, loss = 4.20915201
Validation score: 0.439252
Iteration 6, loss = 3.06746001
Validation score: 0.425234
Iteration 7, loss = 2.04330463
Validation score: 0.439252
Iteration 8, loss = 1.22044929
Validation score: 0.425234
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=False, clf__hidden_layer_sizes=(4096, 1024), score=0.4123422159887798, total=16.1min
[CV] vect__ngram_range=(1, 3), vect__max_features=2500, tfidf__use_idf=False, clf__hidden_layer_sizes=(4096, 1024) 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 34.3min remaining:    0.0s


Iteration 1, loss = 14.39995582
Validation score: 0.228972
Iteration 2, loss = 9.09420726
Validation score: 0.182243
Iteration 3, loss = 7.82584241
Validation score: 0.303738
Iteration 4, loss = 6.92446330
Validation score: 0.359813
Iteration 5, loss = 6.30585603
Validation score: 0.378505
Iteration 6, loss = 5.73371433
Validation score: 0.359813
Iteration 7, loss = 5.15573059
Validation score: 0.392523
Iteration 8, loss = 4.52645966
Validation score: 0.392523
Iteration 9, loss = 3.89004369
Validation score: 0.383178
Iteration 10, loss = 3.26583304
Validation score: 0.397196
Iteration 11, loss = 2.70945162
Validation score: 0.378505
Iteration 12, loss = 2.18720186
Validation score: 0.387850
Iteration 13, loss = 1.73991232
Validation score: 0.397196
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 3), vect__max_features=2500, tfidf__use_idf=False, clf__hidden_layer_sizes=(4096, 1024), score=0.3856942496493689, tota

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 36.8min remaining:    0.0s


Iteration 1, loss = 14.16998955
Validation score: 0.042056
Iteration 2, loss = 8.82424272
Validation score: 0.214953
Iteration 3, loss = 7.47789710
Validation score: 0.228972
Iteration 4, loss = 6.53883306
Validation score: 0.317757
Iteration 5, loss = 5.87674678
Validation score: 0.350467
Iteration 6, loss = 5.29483122
Validation score: 0.345794
Iteration 7, loss = 4.76255494
Validation score: 0.341121
Iteration 8, loss = 4.22638375
Validation score: 0.350467
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 3), vect__max_features=2500, tfidf__use_idf=False, clf__hidden_layer_sizes=(4096, 1024), score=0.38242169237961665, total= 1.5min
[CV] vect__ngram_range=(1, 4), vect__max_features=5000, tfidf__use_idf=False, clf__hidden_layer_sizes=(512, 128) 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 38.3min remaining:    0.0s


Iteration 1, loss = 22.47471053
Validation score: 0.004673
Iteration 2, loss = 15.84262625
Validation score: 0.000000
Iteration 3, loss = 10.03718067
Validation score: 0.285047
Iteration 4, loss = 9.13126051
Validation score: 0.000000
Iteration 5, loss = 8.59102419
Validation score: 0.182243
Iteration 6, loss = 8.19559289
Validation score: 0.214953
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 4), vect__max_features=5000, tfidf__use_idf=False, clf__hidden_layer_sizes=(512, 128), score=0.3010752688172043, total=  21.7s
[CV] vect__ngram_range=(1, 4), vect__max_features=5000, tfidf__use_idf=False, clf__hidden_layer_sizes=(512, 128) 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 38.7min remaining:    0.0s


Iteration 1, loss = 22.51899479
Validation score: 0.000000
Iteration 2, loss = 15.82219651
Validation score: 0.004673
Iteration 3, loss = 9.79089336
Validation score: 0.214953
Iteration 4, loss = 8.77919503
Validation score: 0.004673
Iteration 5, loss = 8.25845709
Validation score: 0.023364
Iteration 6, loss = 7.88158931
Validation score: 0.219626
Iteration 7, loss = 7.53577248
Validation score: 0.191589
Iteration 8, loss = 7.17905569
Validation score: 0.336449
Iteration 9, loss = 6.81807363
Validation score: 0.327103
Iteration 10, loss = 6.46259435
Validation score: 0.341121
Iteration 11, loss = 6.11634956
Validation score: 0.355140
Iteration 12, loss = 5.79120908
Validation score: 0.378505
Iteration 13, loss = 5.48997100
Validation score: 0.401869
Iteration 14, loss = 5.20591130
Validation score: 0.401869
Iteration 15, loss = 4.92926799
Validation score: 0.429907
Iteration 16, loss = 4.66506529
Validation score: 0.429907
Iteration 17, loss = 4.40371282
Validation score: 0.434579
Iter

[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 39.6min remaining:    0.0s


Iteration 1, loss = 19.19010335
Validation score: 0.000000
Iteration 2, loss = 10.24073553
Validation score: 0.032710
Iteration 3, loss = 8.82108807
Validation score: 0.074766
Iteration 4, loss = 7.74096819
Validation score: 0.257009
Iteration 5, loss = 6.69332161
Validation score: 0.327103
Iteration 6, loss = 5.85154905
Validation score: 0.345794
Iteration 7, loss = 5.12273967
Validation score: 0.345794
Iteration 8, loss = 4.40320513
Validation score: 0.359813
Iteration 9, loss = 3.68015477
Validation score: 0.397196
Iteration 10, loss = 2.98556641
Validation score: 0.397196
Iteration 11, loss = 2.33673950
Validation score: 0.411215
Iteration 12, loss = 1.79069529
Validation score: 0.415888
Iteration 13, loss = 1.34002318
Validation score: 0.411215
Iteration 14, loss = 0.99009425
Validation score: 0.411215
Iteration 15, loss = 0.72697616
Validation score: 0.415888
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 41.4min remaining:    0.0s


Iteration 1, loss = 19.10933142
Validation score: 0.014019
Iteration 2, loss = 9.76014574
Validation score: 0.224299
Iteration 3, loss = 8.54804985
Validation score: 0.070093
Iteration 4, loss = 7.44903729
Validation score: 0.233645
Iteration 5, loss = 6.39800191
Validation score: 0.341121
Iteration 6, loss = 5.55204112
Validation score: 0.373832
Iteration 7, loss = 4.83752818
Validation score: 0.387850
Iteration 8, loss = 4.14838162
Validation score: 0.397196
Iteration 9, loss = 3.46762536
Validation score: 0.401869
Iteration 10, loss = 2.80204917
Validation score: 0.397196
Iteration 11, loss = 2.18942217
Validation score: 0.401869
Iteration 12, loss = 1.67278726
Validation score: 0.392523
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 2), vect__max_features=10000, tfidf__use_idf=True, clf__hidden_layer_sizes=(1024, 512), score=0.4053295932678822, total= 1.4min
[CV] vect__ngram_range=(1, 2), vect__max_features=

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 42.8min remaining:    0.0s


Iteration 1, loss = 19.92482588
Validation score: 0.000000
Iteration 2, loss = 10.28714186
Validation score: 0.056075
Iteration 3, loss = 8.88558999
Validation score: 0.107477
Iteration 4, loss = 7.52675969
Validation score: 0.289720
Iteration 5, loss = 6.33764400
Validation score: 0.378505
Iteration 6, loss = 5.34705915
Validation score: 0.383178
Iteration 7, loss = 4.43916451
Validation score: 0.401869
Iteration 8, loss = 3.58418425
Validation score: 0.406542
Iteration 9, loss = 2.74534615
Validation score: 0.443925
Iteration 10, loss = 1.99240793
Validation score: 0.429907
Iteration 11, loss = 1.38593612
Validation score: 0.448598
Iteration 12, loss = 0.93951869
Validation score: 0.425234
Iteration 13, loss = 0.62652935
Validation score: 0.429907
Iteration 14, loss = 0.42594345
Validation score: 0.429907
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 2), vect__max_features=25000, tfidf__use_idf=True, clf__hid

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 46.5min remaining:    0.0s


Iteration 1, loss = 19.78341904
Validation score: 0.014019
Iteration 2, loss = 9.85799114
Validation score: 0.060748
Iteration 3, loss = 8.56935007
Validation score: 0.084112
Iteration 4, loss = 7.22868399
Validation score: 0.308411
Iteration 5, loss = 6.01023061
Validation score: 0.373832
Iteration 6, loss = 4.99303493
Validation score: 0.392523
Iteration 7, loss = 4.11787718
Validation score: 0.420561
Iteration 8, loss = 3.29667575
Validation score: 0.439252
Iteration 9, loss = 2.50458013
Validation score: 0.462617
Iteration 10, loss = 1.80793569
Validation score: 0.457944
Iteration 11, loss = 1.24191355
Validation score: 0.453271
Iteration 12, loss = 0.84497808
Validation score: 0.420561
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 2), vect__max_features=25000, tfidf__use_idf=True, clf__hidden_layer_sizes=(1024, 512), score=0.4165497896213184, total= 3.2min
[CV] vect__ngram_range=(1, 4), vect__max_features=

Iteration 8, loss = 3.29933501
Validation score: 0.397196
Iteration 9, loss = 2.55264577
Validation score: 0.415888
Iteration 10, loss = 1.91188189
Validation score: 0.411215
Iteration 11, loss = 1.40030028
Validation score: 0.406542
Iteration 12, loss = 0.99164132
Validation score: 0.392523
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 2), vect__max_features=2500, tfidf__use_idf=True, clf__hidden_layer_sizes=(4096, 1024), score=0.40766713417484807, total= 2.2min
[CV] vect__ngram_range=(1, 2), vect__max_features=2500, tfidf__use_idf=True, clf__hidden_layer_sizes=(4096, 1024) 
Iteration 1, loss = 14.09429188
Validation score: 0.032710
Iteration 2, loss = 8.67543111
Validation score: 0.214953
Iteration 3, loss = 7.10329494
Validation score: 0.299065
Iteration 4, loss = 6.06989272
Validation score: 0.336449
Iteration 5, loss = 5.30147006
Validation score: 0.355140
Iteration 6, loss = 4.56967449
Validation score: 0

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 83.4min finished


Iteration 1, loss = 11.98153405
Validation score: 0.172897
Iteration 2, loss = 6.83311338
Validation score: 0.413551
Iteration 3, loss = 5.34259274
Validation score: 0.415888
Iteration 4, loss = 3.94714883
Validation score: 0.436916
Iteration 5, loss = 2.56927131
Validation score: 0.455607
Iteration 6, loss = 1.49744491
Validation score: 0.474299
Iteration 7, loss = 0.83678381
Validation score: 0.474299
Iteration 8, loss = 0.46732589
Validation score: 0.469626
Iteration 9, loss = 0.27468449
Validation score: 0.462617
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


In [9]:
# Mean cross-validated score of the best parameter combination found by the search.
best_score = rs_clf.best_score_
print(best_score)

0.42262739597942967


In [10]:
# Parameter combination that achieved the best cross-validated score.
best_params = rs_clf.best_params_
print(best_params)

{'vect__ngram_range': (1, 4), 'vect__max_features': 7500, 'tfidf__use_idf': True, 'clf__hidden_layer_sizes': (4096, 1024)}


In [11]:
# Collect the configuration of the search object and of the nested pipeline
# for logging.
rs_clf_params = rs_clf.get_params(deep=True)
print(rs_clf_params)

{'cv': 2, 'error_score': 'raise', 'estimator__memory': None, 'estimator__steps': [('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=True,
       warm_start=False))], '

In [12]:
# Predict the label indicator rows for the test texts with the fitted search object.
rs_predicted = rs_clf.predict(X_test)
#print(predicted)

In [14]:
# Precision measures result relevancy. With average='samples' it is computed
# per test document and then averaged over all documents.
rs_precision = precision_score(y_true=encoded_y_test, y_pred=rs_predicted, average='samples')
print(rs_precision)

0.8223399547439526


  'precision', 'predicted', average, warn_for)


In [15]:
# Recall measures how many of the truly relevant labels are returned,
# again computed per test document and averaged ('samples').
rs_recall = recall_score(y_true=encoded_y_test, y_pred=rs_predicted, average='samples')
print(rs_recall)

0.7248613465297666


In [16]:
# The F1 score is the harmonic mean of precision and recall; with
# average='samples' it is computed per test document and then averaged.
rs_f1 = f1_score(y_true=encoded_y_test, y_pred=rs_predicted, average='samples')
print(rs_f1)

0.7522693151660362


  'precision', 'predicted', average, warn_for)


In [17]:
# Per-label precision / recall / F1 / support; rows are numbered by the
# column index of each label in the encoded label matrix.
rs_report = classification_report(encoded_y_test, rs_predicted)
print(rs_report)

             precision    recall  f1-score   support

          0       0.66      0.21      0.32       515
          1       1.00      0.02      0.05       173
          2       0.75      0.53      0.62      3892
          3       0.99      0.41      0.58       265
          4       0.80      0.19      0.31      1003
          5       1.00      0.04      0.09       224
          6       1.00      0.01      0.01       173
          7       0.73      0.45      0.56      2592
          8       0.00      0.00      0.00       161
          9       0.89      0.94      0.91     13173
         10       0.88      0.92      0.90     11322
         11       0.73      0.35      0.48      2249
         12       0.97      0.10      0.18       398
         13       0.98      0.19      0.32       332
         14       0.00      0.00      0.00       162
         15       0.82      0.33      0.47      1286
         16       0.65      0.18      0.28       770
         17       0.82      0.82      0.82   

  'precision', 'predicted', average, warn_for)


Ergebnisse in Dateien speichern

In [18]:
# Directory that receives all result files of the MLP experiments.
output = '../MLP'
# exist_ok=True makes the call idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(output, exist_ok=True)

# Timestamp (local time, minute resolution) used to keep result files of
# different runs apart.
timestamp = time.strftime('%Y-%m-%d_%H.%M')

In [19]:
# Map the predicted indicator rows back to tuples of label names.
inverse_prediction = label_encoder.inverse_transform(rs_predicted)
# Show prediction and ground truth for the first test document.
print('PREDICTED:', inverse_prediction[0], 'TRUE:', y_test[0], sep='\n')

# Write true and predicted labels of every test document to a timestamped file.
predictions_path = output + ('/MLP_reducedlabels_rs_predictions_%s.txt' % timestamp)
with open(predictions_path,"w+", encoding="utf8") as preds:
    preds.write("Predictions from classification with Multi-Layer-Perzeptron and vectorization in scikit-learn (reduced labels):\n\n")
    for ident, true_labels, predicted_labels in zip(z_test, y_test, inverse_prediction):
        # Both label sets are sorted so the two lines are easy to compare;
        # every label is followed by ', ' (including the last one).
        true_line = ''.join('%s, ' % name for name in sorted(true_labels))
        pred_line = ''.join('%s, ' % name for name in sorted(predicted_labels))
        preds.write(ident + '\n')
        preds.write('TRUE: ' + true_line + '\n')
        preds.write('PRED: ' + pred_line + '\n')
        preds.write('\n*********************\n')
    

PREDICTED:
('histoire et archéologie_d', 'histoire_t', 'pluridisciplinarité_d', 'épistémologie et méthodes_t')
TRUE:
['pluridisciplinarité_d', 'épistémologie et méthodes_t', 'histoire_t', 'histoire et archéologie_d']


In [20]:
# Write the search configuration, the best parameters and all scores to a
# timestamped text file.
params_path = output + ('/MLP_reducedlabels_rs_params_%s.txt' % timestamp)
with open(params_path,"w+", encoding="utf8") as params:
    params.write("Parameters for classification with Multi-Layer-Perceptron and vectorization in scikit-learn from randomized search (reduced labels):")
    params.write("\nprocessing_time: %s" % rs_processing_time)
    # Full configuration of the search object, one "key: value" line each.
    params.write("\nparams:")
    params.writelines("\n%s: %s" % item for item in rs_clf_params.items())
    # Best parameter combination found by the search.
    params.write("\nbest params:")
    params.writelines("\n%s: %s" % item for item in best_params.items())
    # Cross-validation score followed by the test-set scores.
    params.writelines([
        "\nbest_score: %s" % best_score,
        "\nprecision: %s" % rs_precision,
        "\nrecall: %s" % rs_recall,
        "\nf1-score: %s" % rs_f1,
    ])

In [21]:
# Tabulate the cross-validation results of all sampled parameter combinations
# and store them as a timestamped CSV file.
results = rs_clf.cv_results_
df = pd.DataFrame(data=results)
print(df)
results_path = output + ('/MLP_reduced_labels_rs_results_%s.csv' % timestamp)
df.to_csv(results_path, encoding='utf-8')

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0    1021.157406     57.766818         3.453526        0.015617   
1     116.286588     27.620311         2.453402        0.015627   
2      32.347425     12.563904         2.023651        0.039051   
3      95.268586     10.196465         1.343918        0.015611   
4     201.663418     13.282741         1.445475        0.054693   
5     223.447138     10.907479         3.297263        0.062524   
6       5.922545      0.750085         0.617257        0.054694   
7      27.104635      9.930817         1.742378        0.085952   
8     589.957340     29.487709         1.742385        0.007812   
9     146.305600     15.822109         1.984600        0.015627   

  param_vect__ngram_range param_vect__max_features param_tfidf__use_idf  \
0                  (1, 3)                    50000                False   
1                  (1, 3)                     2500                False   
2                  (1, 4)            

