# Textklassifikation mit Vektorisierung in scikit-learn und MLPClassifier 
Labels sind auf Disziplinen reduziert (disciplines_only)

Autorin: Maria Hartmann

In [1]:
# Imports
import os
import time
import csv
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.preprocessing import MultiLabelBinarizer # module to one-hot-encode the labels
from sklearn.pipeline import Pipeline # assemples transormers 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer # module to transform a count matrix to a normalized tf-idf representation
from sklearn.neural_network import MLPClassifier # MultiLayerPerceptron classifier 
from sklearn.model_selection import RandomizedSearchCV # module for paramter optimization
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

# Seed NumPy's global random number generator. The MLPClassifier and RandomizedSearchCV
# instances created further down additionally pass their own random_state=1.
np.random.seed(7) # fix random seed for reproducibility

Einlesen des Trainings- und Testdatensatzes

In [2]:
# Locations of the pre-split corpora (semicolon-separated CSV files).
trainset = '../Datasets/disciplines_only_trainset.csv'
testset = '../Datasets/disciplines_only_testset.csv'

# Training split: post text (X), raw label string (y) and source filename (z) per row.
trainset_csv = pd.read_csv(trainset, delimiter=';')
X_train, z_train = (trainset_csv[column].values for column in ('text', 'filename'))
# The labels of one blog post are stored as a single ', '-separated string;
# split them into one list of labels per post.
y_train = [label_string.split(', ') for label_string in trainset_csv['classes'].values]

# Test split: same columns, same label splitting.
testset_csv = pd.read_csv(testset, delimiter=';')
X_test, z_test = (testset_csv[column].values for column in ('text', 'filename'))
y_test = [label_string.split(', ') for label_string in testset_csv['classes'].values]

In [3]:
# Sanity check: filename, label list and text of the first training post, one per line.
print(z_train[0], y_train[0], X_train[0], sep='\n')

archivewk1_2023.txt
['histoire et archéologie_d']
stadtarchiv goch bestand völcker janssen niederrheinisches volksblatt gocher zeitung vom lokalteil foto lazarett wilhelm anton hospital nachdem ein weiterer verwundetentransport eingetroffen ist werden verwundete im wilhelm anton hospital gepflegt ein neuer verwundetentransport ist gestern abend wieder hier angekommen damit ist die zahl der verwundeten die im wilhelm anton hospital aufnahme gefunden haben auf gestiegen mit der einrichtung des van gulikschen hauses in der brückenstraße für die aufnahme von verwundeten ist begonnen worden und dürfte im laufe der nächsten woche als lazarett fertig eingerichtet werden


k-hot-Kodierung der Labels

In [4]:
# k-hot encode the label lists with MultiLabelBinarizer: every post becomes a binary
# vector with one position per class and a 1 for each label the post carries.
# The encoder is fitted on the training labels only; the test labels are merely transformed.
label_encoder = MultiLabelBinarizer()
encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_test = label_encoder.transform(y_test)
print(encoded_y_train[0])


[0 0 0 1 0 0 0 0 0 0 0 0 0 0]


In [5]:
# Number of classes, followed by every class name with its column index in the encoding.
class_names = label_encoder.classes_
print(len(class_names))
for position in range(len(class_names)):
    print(position, class_names[position])

14
0 administration publique et développement_d
1 arts et humanités_d
2 bibliothéconomie_d
3 histoire et archéologie_d
4 langue et linguistique_d
5 littérature_d
6 pluridisciplinarité_d
7 psychologie_d
8 sciences de l'information et de la communication_d
9 sciences de la santé et de la santé publique_d
10 sciences politiques_d
11 sociologie et anthropologie_d
12 travail social et politique sociale_d
13 éducation_d


Vektorisierung und Klassifikation der Daten mit scikit-learn

In [6]:
# Upper bound on the vocabulary size used by the CountVectorizer.
max_features = 10000

# Custom stop-word list, one word per line. The context manager guarantees that the
# file handle is closed again (a bare open(...).read() leaves it open).
with open('../Preprocessing/filtered_words.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = stopword_file.read().splitlines()

# Unigram counts restricted to max_features terms, followed by tf-idf weighting.
vectorizer = CountVectorizer(ngram_range=(1,1),  max_features=max_features, stop_words=stopwords)
tfidf_transformer = TfidfTransformer(use_idf=True)

In [67]:
# Disabled experiment: the pipeline definition below is wrapped in a string literal,
# so nothing is executed; the cell only echoes the string as its output.
"""# first try with best params for vect and tfidf from kNN classification
text_clf = Pipeline([#('vect', CountVectorizer(ngram_range=(1,4), max_df=0.9, min_df=0.01)),#min_df=0.0 auf min_df=0.01 geändert
                     ('vect', CountVectorizer(ngram_range=(1,4), max_features=max_features)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', MLPClassifier(hidden_layer_sizes=(1024,512), max_iter=500, validation_fraction=0.1, early_stopping=True, verbose=True, random_state=1)),
                    ])"""

"# first try with best params for vect and tfidf from kNN classification\ntext_clf = Pipeline([#('vect', CountVectorizer(ngram_range=(1,4), max_df=0.9, min_df=0.01)),#min_df=0.0 auf min_df=0.01 geändert\n                     ('vect', CountVectorizer(ngram_range=(1,4), max_features=max_features)),\n                     ('tfidf', TfidfTransformer(use_idf=True)),\n                     ('clf', MLPClassifier(hidden_layer_sizes=(1024,512), max_iter=500, validation_fraction=0.1, early_stopping=True, verbose=True, random_state=1)),\n                    ])"

In [17]:
# Classification pipeline: term counts -> tf-idf -> multi-layer perceptron.
# The MLP has two hidden layers (2048 and 512 units) and uses early stopping on a
# held-out 10 % validation fraction of the training data.
text_clf = Pipeline(steps=[
    ('vect', vectorizer),
    ('tfidf', tfidf_transformer),
    ('clf', MLPClassifier(
        hidden_layer_sizes=(2048, 512),
        tol=0.0001,
        early_stopping=True,
        validation_fraction=0.1,
        verbose=True,
        random_state=1,
    )),
])

In [18]:
# train
start = time.time()
text_clf = text_clf.fit(X_train, encoded_y_train)
processing_time = (time.time() - start) / 60

Iteration 1, loss = 3.16798568
Validation score: 0.617681
Iteration 2, loss = 1.34563578
Validation score: 0.697892
Iteration 3, loss = 0.59118765
Validation score: 0.698478
Iteration 4, loss = 0.26165635
Validation score: 0.690867
Iteration 5, loss = 0.14563018
Validation score: 0.680328
Iteration 6, loss = 0.11413968
Validation score: 0.686183
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


In [19]:
# Collect the parameters of the pipeline and of all of its steps and print them.
clf_params = text_clf.get_params()
print(clf_params)

{'memory': None, 'steps': [('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['und', 'die', 'der'], strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(2048, 512), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=True,
       warm_start=False))], 'vect': CountVectorizer(analyzer=

In [20]:
# predict
# Predict the k-hot label vectors for the test posts with the fitted pipeline.
predicted = text_clf.predict(X_test)
# Class probabilities are currently not computed:
#predicted_proba = text_clf.predict_proba(X_test)

In [21]:
# precision is a measure of result relevancy
precision = precision_score(encoded_y_test, predicted, average='samples')
print(precision)

0.8613517521267462


In [22]:
# recall is a measure of how many truly relevant results are returned
recall = recall_score(encoded_y_test, predicted, average='samples')  
print(recall)

0.8411535159603527


In [23]:
# F1 score is a weighted average of the precision and recall
f1 = f1_score(encoded_y_test, predicted, average='samples') 
print(f1)

0.8348383152527381


In [24]:
# Directory that receives the result files of this notebook.
output = '../MLP'
# exist_ok=True makes this a no-op when the directory already exists and avoids the
# race between a separate existence check and the creation.
os.makedirs(output, exist_ok=True)

# Minute-resolution timestamp, e.g. for distinguishing result files of different runs.
timestamp = time.strftime('%Y-%m-%d_%H.%M')

In [25]:
# Disabled code: the file-writing block below is wrapped in a string literal,
# so nothing is written; the cell only echoes the string as its output.
"""# write first parameters and scores to file

#with open(output+'/MLP_disciplines_only_first_params.txt',"w+", encoding="utf8") as params:
with open(output+'/MLP_disciplines_only_first_params_max_features.txt',"w+", encoding="utf8") as params:
    params.write("First parameters for classification with MLP (disciplines only):")
    params.write("\nprocessing_time: %s" % processing_time)
    for key, value in clf_params.items():
        params.write("\n%s: %s" % (key, value))
    params.write("\nactivation function output layer: %s" %  text_clf.named_steps.clf.out_activation_)    
    params.write("\nprecision: %s" % precision)
    params.write("\nrecall: %s" % recall)
    params.write("\nf1-score: %s" % f1)"""

'# write first parameters and scores to file\n\n#with open(output+\'/MLP_disciplines_only_first_params.txt\',"w+", encoding="utf8") as params:\nwith open(output+\'/MLP_disciplines_only_first_params_max_features.txt\',"w+", encoding="utf8") as params:\n    params.write("First parameters for classification with MLP (disciplines only):")\n    params.write("\nprocessing_time: %s" % processing_time)\n    for key, value in clf_params.items():\n        params.write("\n%s: %s" % (key, value))\n    params.write("\nactivation function output layer: %s" %  text_clf.named_steps.clf.out_activation_)    \n    params.write("\nprecision: %s" % precision)\n    params.write("\nrecall: %s" % recall)\n    params.write("\nf1-score: %s" % f1)'

In [26]:
# write parameters and scores to file

# Append one report block (pipeline steps, fitted MLP attributes, timing, scores)
# to the shared parameter log; earlier runs stay in the file because of mode "a".
with open(output+'/MLP_disciplines_only_params.txt',"a", encoding="utf8") as params:
    mlp_step = text_clf.named_steps['clf']
    banner = "\n*********************************************************************************************"
    report_lines = [
        banner,
        "\nParameters for classification with MLP (disciplines only):",
        banner,
        "\n%s" % text_clf.named_steps['vect'],
        "\n%s" % text_clf.named_steps['tfidf'],
        "\n%s" % mlp_step,
        #for key, value in clf_params.items():
            #params.write("\n%s: %s" % (key, value))
        "\nclasses: %s" % mlp_step.n_outputs_,
        "\nlayers: %s" % mlp_step.n_layers_,
        "\nactivation function output layer: %s" % mlp_step.out_activation_,
        "\nepochs: %s" % mlp_step.n_iter_,
        "\nprocessing time: %s" % processing_time,
        "\nSCORES:",
        "\nprecision: %s" % precision,
        "\nrecall: %s" % recall,
        "\nf1-score: %s" % f1,
        "\n",
    ]
    params.writelines(report_lines)

In [27]:
# write real labels and predictions to file

# Map the predicted k-hot vectors back to label names and show the first
# prediction next to its true labels.
inverse_prediction = label_encoder.inverse_transform(predicted)
print('PREDICTED:', inverse_prediction[0], 'TRUE:', y_test[0], sep='\n')

# One record per test post: identifier, alphabetically sorted true labels,
# alphabetically sorted predicted labels, and a separator line.
with open(output+'/MLP_disciplines_only_predictions.txt',"w+", encoding="utf8") as preds:
    preds.write("Predictions from classification with Multi-Layer-Perzeptron and vectorization in scikit-learn (disciplines only):\n\n")
    for ident, label, pred in zip(z_test, y_test, inverse_prediction):
        # Every label is followed by ', ' (including the last one).
        true_part = ''.join('%s, ' % element for element in sorted(label))
        pred_part = ''.join('%s, ' % element for element in sorted(pred))
        preds.write(ident + '\nTRUE: ' + true_part + '\nPRED: ' + pred_part + '\n' + '\n*********************\n')
    

PREDICTED:
('histoire et archéologie_d',)
TRUE:
['pluridisciplinarité_d', 'histoire et archéologie_d']


Speicherung der vektorisierten Textdaten

In [28]:
# Remove the '.txt' extension from the filenames ...
z_train = [filename.replace('.txt', '') for filename in z_train]
z_test = [filename.replace('.txt', '') for filename in z_test]
# ... and turn each name into a URL-like identifier by replacing '_' with '.hypotheses.org/'.
ident_train = [stem.replace('_', '.hypotheses.org/') for stem in z_train]
ident_test = [stem.replace('_', '.hypotheses.org/') for stem in z_test]

# Number of training identifiers and the first one as a sample.
print(len(ident_train), ident_train[0], sep='\n')

17080
archivewk1.hypotheses.org/2023


In [29]:
# vectorize textdata
train_vect = vectorizer.transform(X_train)
train_tfidf = tfidf_transformer.transform(train_vect)
print(train_tfidf.shape)

test_vect = vectorizer.transform(X_test)
test_tfidf = tfidf_transformer.transform(test_vect)

(17080, 10000)


In [30]:
# Show the container type of the test matrix and the summary repr of the training matrix
# (the bare last expression is rendered as the cell output).
print(type(test_tfidf))
train_tfidf

<class 'scipy.sparse.csr.csr_matrix'>


<17080x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 2373689 stored elements in Compressed Sparse Row format>

In [31]:
# save filename, classes, textvectors in csv file
# Persist the vectorized texts as .npz sparse matrices and, per split, a CSV index
# with one row per post: URL-like identifier, labels, and the matrix file it belongs to.

# --- training split ---
# stores the vectorized text
output_file_train = 'Datasets/disciplines_only_train_scikit-learn_sparse_matrix.npz'
scipy.sparse.save_npz('../'+output_file_train, train_tfidf)

# stores identifiers and classes
with open('../Datasets/disciplines_only_train_idents_labels.csv', 'w', newline='', encoding="utf-8") as traincsv:
    train = csv.writer(traincsv, delimiter = ";")
    train.writerow(["url", "classes", "filename"])
    train.writerows(
        [ident, ", ".join(labels), output_file_train]
        for ident, labels in zip(ident_train, y_train)
    )

# --- test split ---
# stores the vectorized text
output_file_test = 'Datasets/disciplines_only_test_scikit-learn_sparse_matrix.npz'
scipy.sparse.save_npz('../'+output_file_test, test_tfidf)

# stores identifiers and classes
with open('../Datasets/disciplines_only_test_idents_labels.csv', 'w', newline='', encoding="utf-8") as testcsv:
    test = csv.writer(testcsv, delimiter = ";")
    test.writerow(["url", "classes", "filename"])
    test.writerows(
        [ident, ", ".join(labels), output_file_test]
        for ident, labels in zip(ident_test, y_test)
    )

Parameteroptimierung mit Zufallssuche (RandomizedSearchCV)

In [65]:
# Base pipeline for the parameter search: vectorizer and tf-idf transformer with default
# settings; the MLP uses early stopping on a 10 % validation fraction.
clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MLPClassifier(
        validation_fraction=0.1,
        early_stopping=True,
        verbose=True,
        random_state=1,
    )),
])

In [66]:
# parameter tuning with RandomSearch
stopwords = open('../Preprocessing/filtered_words.txt', 'r', encoding='utf-8').read().splitlines()
rs_parameters = {'vect__ngram_range': [(1,1),(1,2),(1,3),(1,4)], 
                 #'vect__max_df' : (0.7, 0.8, 0.85, 0.9, 0.95), #1.0 
                 #'vect__min_df' : (0.01, 0.025, 0.05, 0.075, 0.1, 0.2), #0.0
                 'vect__max_features': (100000,7500,50000,25000,10000,5000,2500,1000,500,300,100), 
                 'tfidf__use_idf': (True, False),
                 'clf__hidden_layer_sizes': ((2048,1024),(2048,512),(1024,512),(512,128),(4096,1024),(4096,512),(2048,1024,512),(1024,512,128))
                }

In [68]:
# train
rs_clf = RandomizedSearchCV(clf, rs_parameters, cv=2, n_iter=10, n_jobs=1, verbose=10, random_state=1)
start = time.time()
rs_clf = rs_clf.fit(X_train, encoded_y_train)
rs_processing_time = (time.time() - start) / 60

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] vect__ngram_range=(1, 3), vect__max_features=7500, tfidf__use_idf=False, clf__hidden_layer_sizes=(4096, 1024) 
Iteration 1, loss = 5.33313601
Validation score: 0.345794
Iteration 2, loss = 3.31710757
Validation score: 0.528037
Iteration 3, loss = 2.52894313
Validation score: 0.542056
Iteration 4, loss = 1.96432080
Validation score: 0.593458
Iteration 5, loss = 1.49209314
Validation score: 0.565421
Iteration 6, loss = 1.07256055
Validation score: 0.556075
Iteration 7, loss = 0.72331682
Validation score: 0.570093
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 3), vect__max_features=7500, tfidf__use_idf=False, clf__hidden_layer_sizes=(4096, 1024), score=0.5351123595505618, total= 2.5min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.6min remaining:    0.0s


[CV] vect__ngram_range=(1, 3), vect__max_features=7500, tfidf__use_idf=False, clf__hidden_layer_sizes=(4096, 1024) 
Iteration 1, loss = 5.50608338
Validation score: 0.214953
Iteration 2, loss = 3.60370046
Validation score: 0.434579
Iteration 3, loss = 2.76881989
Validation score: 0.462617
Iteration 4, loss = 2.26087348
Validation score: 0.500000
Iteration 5, loss = 1.76660575
Validation score: 0.500000
Iteration 6, loss = 1.31323577
Validation score: 0.546729
Iteration 7, loss = 0.90371415
Validation score: 0.537383
Iteration 8, loss = 0.56291376
Validation score: 0.560748
Iteration 9, loss = 0.32241808
Validation score: 0.570093
Iteration 10, loss = 0.18187342
Validation score: 0.546729
Iteration 11, loss = 0.10825919
Validation score: 0.551402
Iteration 12, loss = 0.06586573
Validation score: 0.560748
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 3), vect__max_features=7500, tfidf__use_idf=False, clf__hidden_

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  7.3min remaining:    0.0s


[CV] vect__ngram_range=(1, 3), vect__max_features=2500, tfidf__use_idf=False, clf__hidden_layer_sizes=(4096, 1024) 
Iteration 1, loss = 5.29870823
Validation score: 0.364486
Iteration 2, loss = 3.31106911
Validation score: 0.467290
Iteration 3, loss = 2.73045922
Validation score: 0.467290
Iteration 4, loss = 2.34032261
Validation score: 0.504673
Iteration 5, loss = 2.00753216
Validation score: 0.528037
Iteration 6, loss = 1.68956904
Validation score: 0.570093
Iteration 7, loss = 1.37429917
Validation score: 0.579439
Iteration 8, loss = 1.09930940
Validation score: 0.537383
Iteration 9, loss = 0.83960148
Validation score: 0.556075
Iteration 10, loss = 0.57336039
Validation score: 0.584112
Iteration 11, loss = 0.39075777
Validation score: 0.574766
Iteration 12, loss = 0.26591486
Validation score: 0.584112
Iteration 13, loss = 0.18673769
Validation score: 0.579439
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 3), 

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  9.8min remaining:    0.0s


Iteration 1, loss = 5.35328429
Validation score: 0.299065
Iteration 2, loss = 3.54180998
Validation score: 0.373832
Iteration 3, loss = 2.90447722
Validation score: 0.420561
Iteration 4, loss = 2.49454053
Validation score: 0.439252
Iteration 5, loss = 2.13254880
Validation score: 0.453271
Iteration 6, loss = 1.78016769
Validation score: 0.476636
Iteration 7, loss = 1.43439968
Validation score: 0.462617
Iteration 8, loss = 1.08763751
Validation score: 0.500000
Iteration 9, loss = 0.77350251
Validation score: 0.495327
Iteration 10, loss = 0.53524933
Validation score: 0.485981
Iteration 11, loss = 0.36654226
Validation score: 0.485981
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 3), vect__max_features=2500, tfidf__use_idf=False, clf__hidden_layer_sizes=(4096, 1024), score=0.5377049180327869, total= 2.1min
[CV] vect__ngram_range=(1, 4), vect__max_features=5000, tfidf__use_idf=False, clf__hidden_layer_sizes=(512, 1

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 11.9min remaining:    0.0s


Iteration 1, loss = 8.83581379
Validation score: 0.383178
Iteration 2, loss = 5.41196698
Validation score: 0.271028
Iteration 3, loss = 3.42851509
Validation score: 0.406542
Iteration 4, loss = 3.26281754
Validation score: 0.514019
Iteration 5, loss = 3.02271073
Validation score: 0.514019
Iteration 6, loss = 2.81014140
Validation score: 0.532710
Iteration 7, loss = 2.59534352
Validation score: 0.518692
Iteration 8, loss = 2.38309032
Validation score: 0.504673
Iteration 9, loss = 2.18400242
Validation score: 0.537383
Iteration 10, loss = 2.00276074
Validation score: 0.532710
Iteration 11, loss = 1.83457190
Validation score: 0.542056
Iteration 12, loss = 1.67196940
Validation score: 0.542056
Iteration 13, loss = 1.52242310
Validation score: 0.532710
Iteration 14, loss = 1.37428019
Validation score: 0.537383
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 4), vect__max_features=5000, tfidf__use_idf=False, clf__hidde

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 12.6min remaining:    0.0s


Iteration 1, loss = 8.81824255
Validation score: 0.313084
Iteration 2, loss = 5.44619390
Validation score: 0.219626
Iteration 3, loss = 3.68119501
Validation score: 0.350467
Iteration 4, loss = 3.49287820
Validation score: 0.415888
Iteration 5, loss = 3.23791914
Validation score: 0.481308
Iteration 6, loss = 3.02521719
Validation score: 0.462617
Iteration 7, loss = 2.80466927
Validation score: 0.471963
Iteration 8, loss = 2.58664498
Validation score: 0.485981
Iteration 9, loss = 2.37982389
Validation score: 0.509346
Iteration 10, loss = 2.18614167
Validation score: 0.509346
Iteration 11, loss = 2.00643994
Validation score: 0.514019
Iteration 12, loss = 1.82585279
Validation score: 0.523364
Iteration 13, loss = 1.65728088
Validation score: 0.509346
Iteration 14, loss = 1.49481943
Validation score: 0.514019
Iteration 15, loss = 1.33788840
Validation score: 0.514019
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 4)

[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 13.2min remaining:    0.0s


Iteration 1, loss = 7.79836566
Validation score: 0.238318
Iteration 2, loss = 3.68475211
Validation score: 0.439252
Iteration 3, loss = 3.19411326
Validation score: 0.574766
Iteration 4, loss = 2.50451381
Validation score: 0.574766
Iteration 5, loss = 1.86278211
Validation score: 0.612150
Iteration 6, loss = 1.35187616
Validation score: 0.612150
Iteration 7, loss = 0.93871513
Validation score: 0.635514
Iteration 8, loss = 0.62110331
Validation score: 0.635514
Iteration 9, loss = 0.37315331
Validation score: 0.640187
Iteration 10, loss = 0.21470657
Validation score: 0.644860
Iteration 11, loss = 0.11906382
Validation score: 0.644860
Iteration 12, loss = 0.07151888
Validation score: 0.640187
Iteration 13, loss = 0.04545086
Validation score: 0.630841
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 2), vect__max_features=25000, tfidf__use_idf=True, clf__hidden_layer_sizes=(1024, 512), score=0.5688202247191011, total=

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 16.7min remaining:    0.0s


Iteration 1, loss = 7.84921997
Validation score: 0.261682
Iteration 2, loss = 3.90872060
Validation score: 0.457944
Iteration 3, loss = 3.34325442
Validation score: 0.415888
Iteration 4, loss = 2.65124140
Validation score: 0.490654
Iteration 5, loss = 1.96908431
Validation score: 0.490654
Iteration 6, loss = 1.40559253
Validation score: 0.528037
Iteration 7, loss = 0.93430085
Validation score: 0.528037
Iteration 8, loss = 0.57339464
Validation score: 0.523364
Iteration 9, loss = 0.33957077
Validation score: 0.537383
Iteration 10, loss = 0.19413274
Validation score: 0.537383
Iteration 11, loss = 0.11308101
Validation score: 0.546729
Iteration 12, loss = 0.06876573
Validation score: 0.546729
Iteration 13, loss = 0.04335390
Validation score: 0.546729
Iteration 14, loss = 0.02932721
Validation score: 0.546729
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 2), vect__max_features=25000, tfidf__use_idf=True, clf__hidde

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 20.4min remaining:    0.0s


Iteration 1, loss = 7.18809883
Validation score: 0.252336
Iteration 2, loss = 3.59460795
Validation score: 0.481308
Iteration 3, loss = 2.91145859
Validation score: 0.560748
Iteration 4, loss = 2.12055872
Validation score: 0.612150
Iteration 5, loss = 1.46224997
Validation score: 0.630841
Iteration 6, loss = 0.98694384
Validation score: 0.621495
Iteration 7, loss = 0.64894409
Validation score: 0.640187
Iteration 8, loss = 0.38340985
Validation score: 0.630841
Iteration 9, loss = 0.20850616
Validation score: 0.644860
Iteration 10, loss = 0.10819835
Validation score: 0.640187
Iteration 11, loss = 0.06416286
Validation score: 0.635514
Iteration 12, loss = 0.03815432
Validation score: 0.649533
Iteration 13, loss = 0.02924462
Validation score: 0.640187
Iteration 14, loss = 0.02043942
Validation score: 0.640187
Iteration 15, loss = 0.01713707
Validation score: 0.654206
Iteration 16, loss = 0.01459405
Validation score: 0.635514
Iteration 17, loss = 0.01287035
Validation score: 0.640187
Iterat

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 29.4min remaining:    0.0s


Iteration 1, loss = 7.29522349
Validation score: 0.247664
Iteration 2, loss = 3.84663528
Validation score: 0.434579
Iteration 3, loss = 3.07404664
Validation score: 0.537383
Iteration 4, loss = 2.26939009
Validation score: 0.556075
Iteration 5, loss = 1.54583149
Validation score: 0.574766
Iteration 6, loss = 1.00653019
Validation score: 0.556075
Iteration 7, loss = 0.61232715
Validation score: 0.593458
Iteration 8, loss = 0.32599035
Validation score: 0.593458
Iteration 9, loss = 0.17087228
Validation score: 0.612150
Iteration 10, loss = 0.09177163
Validation score: 0.598131
Iteration 11, loss = 0.05337019
Validation score: 0.588785
Iteration 12, loss = 0.03552243
Validation score: 0.588785
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 2), vect__max_features=50000, tfidf__use_idf=True, clf__hidden_layer_sizes=(1024, 512), score=0.6023419203747072, total= 6.0min
[CV] vect__ngram_range=(1, 4), vect__max_features=1

Iteration 3, loss = 2.13359146
Validation score: 0.621495
Iteration 4, loss = 1.25452738
Validation score: 0.630841
Iteration 5, loss = 0.68386318
Validation score: 0.649533
Iteration 6, loss = 0.30442466
Validation score: 0.640187
Iteration 7, loss = 0.11746369
Validation score: 0.649533
Iteration 8, loss = 0.05038857
Validation score: 0.621495
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 1), vect__max_features=50000, tfidf__use_idf=True, clf__hidden_layer_sizes=(4096, 1024), score=0.5981264637002341, total=15.9min
[CV] vect__ngram_range=(1, 2), vect__max_features=2500, tfidf__use_idf=True, clf__hidden_layer_sizes=(4096, 1024) 
Iteration 1, loss = 5.29970598
Validation score: 0.467290
Iteration 2, loss = 3.21261873
Validation score: 0.467290
Iteration 3, loss = 2.53431133
Validation score: 0.500000
Iteration 4, loss = 2.06359254
Validation score: 0.542056
Iteration 5, loss = 1.63052865
Validation score: 0.565

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 83.7min finished


Iteration 1, loss = 5.51267340
Validation score: 0.511682
Iteration 2, loss = 2.96626409
Validation score: 0.530374
Iteration 3, loss = 1.91109901
Validation score: 0.602804
Iteration 4, loss = 1.10886245
Validation score: 0.621495
Iteration 5, loss = 0.53387495
Validation score: 0.642523
Iteration 6, loss = 0.21799897
Validation score: 0.647196
Iteration 7, loss = 0.09365365
Validation score: 0.651869
Iteration 8, loss = 0.05084629
Validation score: 0.651869
Iteration 9, loss = 0.03378924
Validation score: 0.642523
Iteration 10, loss = 0.02722593
Validation score: 0.647196
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


In [69]:
# Cross-validation score reported by the search for its best parameter combination.
best_score = rs_clf.best_score_
print(best_score)

0.5832357761648326


In [70]:
# Parameter combination that the search selected as best.
best_params = rs_clf.best_params_
print(best_params)

{'vect__ngram_range': (1, 2), 'vect__max_features': 50000, 'tfidf__use_idf': True, 'clf__hidden_layer_sizes': (1024, 512)}


In [71]:
# Parameters of the search object itself (including those of the wrapped pipeline).
rs_clf_params = rs_clf.get_params()
print(rs_clf_params)

{'cv': 2, 'error_score': 'raise', 'estimator__memory': None, 'estimator__steps': [('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=True,
       warm_start=False))], '

In [72]:
# predict 
rs_predicted = rs_clf.predict(X_test)
#print(predicted)

In [74]:
# precision is a measure of result relevancy
rs_precision = precision_score(encoded_y_test, rs_predicted, average='samples')
print(rs_precision)

0.8365339578454333


  'precision', 'predicted', average, warn_for)


In [75]:
# recall is a measure of how many truly relevant results are returned
rs_recall = recall_score(encoded_y_test, rs_predicted, average='samples')  
print(rs_recall)

0.7761036299765808


In [76]:
# F1 score is a weighted average of the precision and recall
rs_f1 = f1_score(encoded_y_test, rs_predicted, average='samples') 
print(rs_f1)
#49,46

0.7852556596409056


  'precision', 'predicted', average, warn_for)


In [77]:
# Per-class precision, recall, F1 and support; rows are indexed by the class column index.
print(classification_report(y_true=encoded_y_test, y_pred=rs_predicted))

             precision    recall  f1-score   support

          0       0.92      0.07      0.13       514
          1       0.77      0.62      0.69      3890
          2       0.87      0.11      0.20      1001
          3       0.89      0.96      0.92     13175
          4       1.00      0.16      0.28       331
          5       1.00      0.02      0.04       163
          6       0.87      0.81      0.84      8507
          7       0.93      0.19      0.32       202
          8       0.88      0.12      0.21      1001
          9       1.00      0.14      0.25       169
         10       0.92      0.07      0.13       514
         11       0.96      0.29      0.44      1153
         12       0.99      0.41      0.58       442
         13       0.96      0.34      0.50       809

avg / total       0.88      0.73      0.75     31871



Ergebnisse in Dateien speichern

In [78]:
# Directory that receives the result files of the randomized search.
output = '../MLP'
# exist_ok=True makes this a no-op when the directory already exists and avoids the
# race between a separate existence check and the creation.
os.makedirs(output, exist_ok=True)

# Minute-resolution timestamp that is embedded in the result file names written below.
timestamp = time.strftime('%Y-%m-%d_%H.%M')

In [79]:
# write real labels and predictions to file

# Map the k-hot predictions of the tuned model back to label names and show the
# first prediction next to its true labels.
inverse_prediction = label_encoder.inverse_transform(rs_predicted)
print('PREDICTED:', inverse_prediction[0], 'TRUE:', y_test[0], sep='\n')

# One record per test post: identifier, alphabetically sorted true labels,
# alphabetically sorted predicted labels, and a separator line.
with open(output+'/MLP_disciplines_only_rs_predictions_%s.txt' % timestamp,"w+", encoding="utf8") as preds:
    preds.write("Predictions from classification with Multi-Layer-Perzeptron and vectorization in scikit-learn (disciplines only):\n\n")
    for ident, label, pred in zip(z_test, y_test, inverse_prediction):
        # Every label is followed by ', ' (including the last one).
        true_part = ''.join('%s, ' % element for element in sorted(label))
        pred_part = ''.join('%s, ' % element for element in sorted(pred))
        preds.write(ident + '\nTRUE: ' + true_part + '\nPRED: ' + pred_part + '\n' + '\n*********************\n')
    

PREDICTED:
('histoire et archéologie_d',)
TRUE:
['histoire et archéologie_d']


In [80]:
# write parameters and scores to file

# Write the search configuration, the selected parameters and all scores of the tuned
# model to a timestamped text file.
with open(output+'/MLP_disciplines_only_rs_params_%s.txt' % timestamp,"w+", encoding="utf8") as params:
    params.write("Parameters for classification with Multi-Layer-Perceptron and vectorization in scikit-learn from randomized search (disciplines only):")
    params.write("\nprocessing_time: %s" % rs_processing_time)

    # Complete parameter set of the search object, one "key: value" line each.
    params.write("\nparams:")
    params.writelines("\n%s: %s" % entry for entry in rs_clf_params.items())

    # Parameter combination selected by the search.
    params.write("\nbest params:")
    params.writelines("\n%s: %s" % entry for entry in best_params.items())

    # Cross-validation score of the best combination, then the test-set metrics.
    score_lines = [
        "\nbest_score: %s" % best_score,
        "\nprecision: %s" % rs_precision,
        "\nrecall: %s" % rs_recall,
        "\nf1-score: %s" % rs_f1,
    ]
    params.writelines(score_lines)

In [81]:
# Tabulate the per-candidate cross-validation results, print them and
# store them as a timestamped CSV file.
results = rs_clf.cv_results_
df = pd.DataFrame(results)
print(df)
results_path = output+'/MLP_disciplines_only_rs_results_%s.csv' % timestamp
df.to_csv(results_path, encoding='utf-8')

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0     213.751160     62.983465         2.578432        0.312540   
1     133.250301     11.063823         2.422165        0.125015   
2      34.761969      0.257843         2.101830        0.070339   
3     211.533094      6.914888         1.492365        0.085948   
4     447.410220     88.869675         1.672081        0.109373   
5     320.895678     53.615424         3.476956        0.117200   
6      12.595193      0.562547         0.617239        0.023423   
7      33.308454      1.023553         1.750224        0.078108   
8     946.700818      6.047548         1.937708        0.015638   
9     114.841112      4.500508         2.070546        0.054694   

  param_vect__ngram_range param_vect__max_features param_tfidf__use_idf  \
0                  (1, 3)                     7500                False   
1                  (1, 3)                     2500                False   
2                  (1, 4)            

