# Textklassifikation mit Vektorisierung in scikit-learn und MLPClassifier 
Labels sind auf Themen reduziert (themes_only)

Autorin: Maria Hartmann

In [1]:
# Imports
import os
import time
import csv
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.preprocessing import MultiLabelBinarizer # module to one-hot-encode the labels
from sklearn.pipeline import Pipeline # assemples transormers 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer # module to transform a count matrix to a normalized tf-idf representation
from sklearn.neural_network import MLPClassifier # MultiLayerPerceptron classifier 
from sklearn.model_selection import RandomizedSearchCV # module for paramter optimization
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

np.random.seed(7) # fix random seed for reproducibility

Einlesen des Trainings- und Testdatensatzes

In [2]:
# Locations of the semicolon-separated training and test CSV files.
trainset = '../Datasets/themes_only_trainset.csv'
testset = '../Datasets/themes_only_testset.csv'

# Columns pulled from each file: the text, the raw label string, the filename.
trainset_csv = pd.read_csv(trainset, delimiter=';')
X_train, y_train, z_train = (
    trainset_csv[column].values for column in ('text', 'classes', 'filename')
)

testset_csv = pd.read_csv(testset, delimiter=';')
X_test, y_test, z_test = (
    testset_csv[column].values for column in ('text', 'classes', 'filename')
)

# Split the label string of every blog post into a list of single labels.
y_train = [label_string.split(', ') for label_string in y_train]
y_test = [label_string.split(', ') for label_string in y_test]

In [3]:
# Show the first training sample: filename, label list, then the text.
for first_sample_field in (z_train[0], y_train[0], X_train[0]):
    print(first_sample_field)

ordensgeschichte_6160.txt
['histoire_t', 'époque moderne_t', 'moyen âge_t']
das colloquium historicum wirsbergense e v chw lädt für freitag und samstag oktober zu einem wissenschaftlichen symposium in den kastenhof weismain lkr lichtenfels ein im mittelpunkt der tagung steht mauritius knauer der vor jahren in der bambergischen stadt weismain geboren wurde er trat in die zisterzienserabtei langheim ein nachdem er während des dreißigjährigen kriegs mehrere jahre in heiligenkreuz und wien verbracht hatte wurde er prior von langheim trat er als abt an die spitze des klosters er starb knauer war universalgelehrter er kämpfte für die rechte seines klosters besaß medizinische kenntnisse schrieb theologische werke und belebte kraftvoll die wirtschaft langheims bis heute ist er als verfasser des hundertjährigen kalenders weithin berühmt bei der tagung sollen leben und wirken knauers in größere zusammenhänge eingeordnet werden programm freitag oktober uhr prof dr günter dippold lichtenfels einfü

k-hot-Kodierung der Labels

In [4]:
# k-hot encode the label lists with MultiLabelBinarizer: learn the label
# vocabulary from the training labels, then encode both splits with it.
label_encoder = MultiLabelBinarizer()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)
print(encoded_y_train[0])


[0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0]


In [5]:
# Print the number of label classes, then each class with its column index.
label_names = label_encoder.classes_
print(len(label_names))
for position in range(len(label_names)):
    print(position, label_names[position])

21
0 anthropologie_t
1 asie_t
2 droit_t
3 ethnologie_t
4 europe_t
5 géographie_t
6 histoire_t
7 information_t
8 langage_t
9 moyen âge_t
10 pensée_t
11 psychisme_t
12 religions_t
13 représentations_t
14 sociologie_t
15 éducation_t
16 épistémologie et méthodes_t
17 époque contemporaine_t
18 époque moderne_t
19 études des sciences_t
20 études du politique_t


Vektorisierung und Klassifikation der Daten mit scikit-learn

In [6]:
# Vocabulary size limit passed to the CountVectorizer.
max_features = 10000

# Read the stop-word list (one word per line). The context manager makes sure
# the file handle is closed again instead of being left to the garbage collector.
with open('../Preprocessing/filtered_words.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = stopword_file.read().splitlines()

# Unigram counts restricted to `max_features` terms, followed by tf-idf weighting.
vectorizer = CountVectorizer(ngram_range=(1,1), max_features=max_features, stop_words=stopwords)
tfidf_transformer = TfidfTransformer(use_idf=True)

In [7]:
# First try with the best vectorizer/tf-idf parameters from the kNN classification.
# NOTE: this pipeline is disabled; it is wrapped in a string literal, so the cell
# only evaluates (and echoes) the string and defines no `text_clf`.
# The inline remark inside the string notes that min_df was changed from 0.0 to 0.01.
"""text_clf = Pipeline([#('vect', CountVectorizer(ngram_range=(1,4), max_df=0.9, min_df=0.01)),#min_df=0.0 auf min_df=0.01 geändert
                     ('vect', CountVectorizer(ngram_range=(1,4), max_features=max_features)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', MLPClassifier(hidden_layer_sizes=(1024,512), max_iter=500, validation_fraction=0.1, early_stopping=True, verbose=True, random_state=1)),
                    ])"""

"text_clf = Pipeline([#('vect', CountVectorizer(ngram_range=(1,4), max_df=0.9, min_df=0.01)),#min_df=0.0 auf min_df=0.01 geändert\n                     ('vect', CountVectorizer(ngram_range=(1,4), max_features=max_features)),\n                     ('tfidf', TfidfTransformer(use_idf=True)),\n                     ('clf', MLPClassifier(hidden_layer_sizes=(1024,512), max_iter=500, validation_fraction=0.1, early_stopping=True, verbose=True, random_state=1)),\n                    ])"

In [8]:
# MLP with two hidden layers (2048 and 512 units). Early stopping is enabled
# with a validation fraction of 0.1; random_state is fixed for repeatable runs.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(2048,512),
    tol=0.0001,
    early_stopping=True,
    validation_fraction=0.1,
    verbose=True,
    random_state=1,
)

# Chain the shared vectorizer and tf-idf transformer objects with the classifier.
text_clf = Pipeline(steps=[
    ('vect', vectorizer),
    ('tfidf', tfidf_transformer),
    ('clf', mlp_classifier),
])

In [9]:
# Fit the whole pipeline on the raw training texts and keep the elapsed
# wall-clock time in minutes.
start = time.time()
text_clf.fit(X_train, encoded_y_train)  # fit() returns the pipeline itself
elapsed_seconds = time.time() - start
processing_time = elapsed_seconds / 60

Iteration 1, loss = 4.99724264
Validation score: 0.475745
Iteration 2, loss = 2.26204605
Validation score: 0.606663
Iteration 3, loss = 1.11150565
Validation score: 0.624781
Iteration 4, loss = 0.50408517
Validation score: 0.623027
Iteration 5, loss = 0.24275951
Validation score: 0.614261
Iteration 6, loss = 0.16684479
Validation score: 0.607832
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


In [10]:
# Collect the parameters of the fitted pipeline (including those of its steps)
# as a dict and print them for the record.
clf_params = text_clf.get_params()
print(clf_params)

{'memory': None, 'steps': [('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['und', 'die', 'der'], strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(2048, 512), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=True,
       warm_start=False))], 'vect': CountVectorizer(analyzer=

In [11]:
# Predict the encoded label matrix for the raw test texts with the fitted pipeline.
predicted = text_clf.predict(X_test)
# Disabled: per-class probabilities instead of hard label decisions.
#predicted_proba = text_clf.predict_proba(X_test)

In [12]:
# Precision is a measure of result relevancy: the share of predicted labels
# that are correct. average='samples' computes the score per test sample and
# averages over the samples.
precision = precision_score(encoded_y_test, predicted, average='samples')
print(precision)

0.8621218192746944


  'precision', 'predicted', average, warn_for)


In [13]:
# Recall is a measure of how many truly relevant results are returned: the
# share of true labels that were predicted, averaged over the test samples.
recall = recall_score(encoded_y_test, predicted, average='samples')  
print(recall)

0.8009467040673212


In [14]:
# F1 score is the harmonic mean of precision and recall, computed per test
# sample and averaged over the samples (average='samples').
f1 = f1_score(encoded_y_test, predicted, average='samples') 
print(f1)

0.8133429541213552


  'precision', 'predicted', average, warn_for)


In [15]:
# Directory that receives all result files of the MLP experiments.
output = '../MLP'
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# which avoids the race between a separate existence check and the creation.
os.makedirs(output, exist_ok=True)

In [16]:
# NOTE: disabled code kept as a string literal; the cell only evaluates (and
# echoes) the string and writes no file. When enabled it would write the first
# parameter set, the processing time and the scores to a text file in `output`.
"""# write first parameters and scores to file

#with open(output+'/MLP_themes_only_first_params.txt',"w+", encoding="utf8") as params:
with open(output+'/MLP_themes_only_first_params_max_features.txt',"w+", encoding="utf8") as params:
    params.write("First parameters for classification with MLP (themes only):")
    params.write("\nprocessing_time: %s" % processing_time)
    for key, value in clf_params.items():
        params.write("\n%s: %s" % (key, value))
    params.write("\nactivation function output layer: %s" % text_clf.named_steps.clf.out_activation_)    
    params.write("\nprecision: %s" % precision)
    params.write("\nrecall: %s" % recall)
    params.write("\nf1-score: %s" % f1)"""

'# write first parameters and scores to file\n\n#with open(output+\'/MLP_themes_only_first_params.txt\',"w+", encoding="utf8") as params:\nwith open(output+\'/MLP_themes_only_first_params_max_features.txt\',"w+", encoding="utf8") as params:\n    params.write("First parameters for classification with MLP (themes only):")\n    params.write("\nprocessing_time: %s" % processing_time)\n    for key, value in clf_params.items():\n        params.write("\n%s: %s" % (key, value))\n    params.write("\nactivation function output layer: %s" % text_clf.named_steps.clf.out_activation_)    \n    params.write("\nprecision: %s" % precision)\n    params.write("\nrecall: %s" % recall)\n    params.write("\nf1-score: %s" % f1)'

In [17]:
# Append the pipeline configuration, training facts and scores of this run to
# the shared parameter log (mode "a" keeps the entries of earlier runs).
fitted_steps = text_clf.named_steps
fitted_mlp = fitted_steps.clf

report_chunks = [
    "\n*********************************************************************************************",
    "\nParameters for classification with MLP (themes only):",
    "\n*********************************************************************************************",
    # Textual representation of each pipeline step.
    "\n%s" % fitted_steps.vect,
    "\n%s" % fitted_steps.tfidf,
    "\n%s" % fitted_mlp,
    # Attributes of the fitted network.
    "\nclasses: %s" % fitted_mlp.n_outputs_,
    "\nlayers: %s" % fitted_mlp.n_layers_,
    "\nactivation function output layer: %s" % fitted_mlp.out_activation_,
    "\nepochs: %s" % fitted_mlp.n_iter_,
    "\nprocessing time: %s" % processing_time,
    "\nSCORES:",
    "\nprecision: %s" % precision,
    "\nrecall: %s" % recall,
    "\nf1-score: %s" % f1,
    "\n",
]

with open(output+'/MLP_themes_only_params.txt', "a", encoding="utf8") as params:
    params.writelines(report_chunks)

In [18]:
# Decode the predicted label matrix back into label names, show the first
# prediction next to the true labels, then write all pairs to a text file.
inverse_prediction = label_encoder.inverse_transform(predicted)
print('PREDICTED:')
print(inverse_prediction[0])
print('TRUE:')
print(y_test[0])

with open(output+'/MLP_themes_only_predictions.txt',"w+", encoding="utf8") as preds:
    preds.write("Predictions from classification with Multi-Layer-Perzeptron and vectorization in scikit-learn (themes only):\n\n")
    for ident, label, pred in zip(z_test, y_test, inverse_prediction):
        # Labels are written in sorted order, each followed by ', '.
        true_part = ''.join('%s, ' % name for name in sorted(label))
        pred_part = ''.join('%s, ' % name for name in sorted(pred))
        preds.writelines([
            ident, '\n',
            'TRUE: ', true_part, '\n',
            'PRED: ', pred_part, '\n',
            '\n*********************\n',
        ])

PREDICTED:
('histoire_t', 'époque contemporaine_t')
TRUE:
['époque contemporaine_t', 'histoire_t']


Speicherung der vektorisierten Textdaten

In [19]:
def _strip_txt_suffix(filename):
    """Return `filename` without a trailing '.txt'; other names pass through.

    Only the suffix is removed, so a '.txt' occurring inside the name is kept.
    """
    suffix = '.txt'
    return filename[:-len(suffix)] if filename.endswith(suffix) else filename


def _to_post_url(stem):
    """Turn '<blog>_<post-id>' into '<blog>.hypotheses.org/<post-id>'.

    Only the LAST underscore is treated as the separator, so underscores inside
    the blog name are not rewritten. A stem without an underscore is returned
    unchanged.
    NOTE(review): assumes filenames follow '<blog>_<post-id>.txt', as the sample
    'ordensgeschichte_6160.txt' suggests -- confirm against the dataset.
    """
    blog, separator, post_id = stem.rpartition('_')
    if not separator:
        return stem
    return blog + '.hypotheses.org/' + post_id


# Filenames without extension, and the post URLs derived from them.
z_train = [_strip_txt_suffix(e) for e in z_train]
z_test = [_strip_txt_suffix(e) for e in z_test]
ident_train = [_to_post_url(e) for e in z_train]
ident_test = [_to_post_url(e) for e in z_test]

print(len(ident_train))
print(ident_train[0])

17109
ordensgeschichte.hypotheses.org/6160


In [20]:
# Vectorize the text data of both splits with the vectorizer and tf-idf
# transformer objects that are also steps of `text_clf`.
# NOTE(review): both are expected to be fitted by the earlier `text_clf.fit`
# call -- confirm that the training cell ran before this one.

# Term counts first ...
train_vect = vectorizer.transform(X_train)
test_vect = vectorizer.transform(X_test)

# ... then tf-idf weighting of the counts.
train_tfidf = tfidf_transformer.transform(train_vect)
test_tfidf = tfidf_transformer.transform(test_vect)

print(train_tfidf.shape)

(17109, 10000)


In [21]:
# Check the container type of the vectorized test data, then display the
# summary representation of the training matrix as the cell result.
print(type(test_tfidf))
train_tfidf

<class 'scipy.sparse.csr.csr_matrix'>


<17109x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 2389735 stored elements in Compressed Sparse Row format>

In [22]:
# Persist the vectorized data. Per split, the tf-idf matrix goes into an .npz
# file and a CSV index lists url, labels and the path of that .npz file.
output_file_train = 'Datasets/themes_only_train_scikit-learn_sparse_matrix.npz'
output_file_test = 'Datasets/themes_only_test_scikit-learn_sparse_matrix.npz'

# (matrix, npz path relative to the project root, index CSV path, urls, label lists)
export_jobs = (
    (train_tfidf, output_file_train, '../Datasets/themes_only_train_idents_labels.csv', ident_train, y_train),
    (test_tfidf, output_file_test, '../Datasets/themes_only_test_idents_labels.csv', ident_test, y_test),
)

for matrix, matrix_path, index_path, idents, label_lists in export_jobs:
    # Store the vectorized text.
    scipy.sparse.save_npz('../'+matrix_path, matrix)

    # Store urls and classes; the "filename" column holds the .npz path.
    with open(index_path, 'w', newline='', encoding="utf-8") as index_csv:
        index_writer = csv.writer(index_csv, delimiter = ";")
        index_writer.writerow(["url", "classes", "filename"])
        index_writer.writerows(
            [ident, ", ".join(labels), matrix_path]
            for ident, labels in zip(idents, label_lists)
        )

Parameteroptimierung mit randomisierter Suche (RandomizedSearchCV)

In [6]:
# Base pipeline for the parameter search: vectorizer and tf-idf transformer
# start with default settings; the MLP uses early stopping with a validation
# fraction of 0.1 and a fixed random_state.
search_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MLPClassifier(validation_fraction=0.1, early_stopping=True, verbose=True, random_state=1)),
]
clf = Pipeline(search_steps)

In [7]:
# Parameter tuning with randomized search.

# Read the stop-word list (one word per line); the context manager closes the
# file handle again.
# NOTE(review): `stopwords` is not referenced by `clf` or `rs_parameters` in the
# cells shown here, so the search runs without stop-word filtering -- confirm
# whether that is intended.
with open('../Preprocessing/filtered_words.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = stopword_file.read().splitlines()

# Candidate values; keys address pipeline steps as '<step>__<parameter>'.
rs_parameters = {'vect__ngram_range': [(1,1),(1,2),(1,3),(1,4)],
                 # Disabled alternatives to max_features:
                 #'vect__max_df' : (0.7, 0.8, 0.85, 0.9, 0.95), #1.0
                 #'vect__min_df' : (0.01, 0.025, 0.05, 0.075, 0.1, 0.2), #0.0
                 'vect__max_features': (100000,50000,25000,10000,7500,5000,2500,1000,500,300,100),
                 'tfidf__use_idf': (True, False),
                 'clf__hidden_layer_sizes': ((2048,1024),(2048,512),(1024,512),(512,128),(4096,1024),(4096,512),(2048,1024,512),(1024,512,128))
                }

In [10]:
# Randomized search: 10 sampled parameter settings, each evaluated with 2-fold
# cross-validation in a single process; the elapsed time is kept in minutes.
rs_clf = RandomizedSearchCV(
    estimator=clf,
    param_distributions=rs_parameters,
    n_iter=10,
    cv=2,
    n_jobs=1,
    verbose=10,
    random_state=1,
)
start = time.time()
rs_clf.fit(X_train, encoded_y_train)  # fit() returns the search object itself
elapsed_seconds = time.time() - start
rs_processing_time = elapsed_seconds / 60

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=False, clf__hidden_layer_sizes=(4096, 1024) 
Iteration 1, loss = 8.85934247
Validation score: 0.065421
Iteration 2, loss = 5.28424927
Validation score: 0.186916
Iteration 3, loss = 3.84880149
Validation score: 0.341121
Iteration 4, loss = 2.87227908
Validation score: 0.392523
Iteration 5, loss = 1.98288717
Validation score: 0.434579
Iteration 6, loss = 1.24382519
Validation score: 0.453271
Iteration 7, loss = 0.70544635
Validation score: 0.453271
Iteration 8, loss = 0.37320448
Validation score: 0.443925
Iteration 9, loss = 0.18249447
Validation score: 0.448598
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=False, clf__hidden_layer_sizes=(4096, 1024), score=0.45582047685834504, total=14.9min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 15.0min remaining:    0.0s


[CV] vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=False, clf__hidden_layer_sizes=(4096, 1024) 
Iteration 1, loss = 8.90021524
Validation score: 0.369159
Iteration 2, loss = 5.25294105
Validation score: 0.266355
Iteration 3, loss = 3.83664634
Validation score: 0.341121
Iteration 4, loss = 2.72994203
Validation score: 0.425234
Iteration 5, loss = 1.77956174
Validation score: 0.448598
Iteration 6, loss = 1.03134512
Validation score: 0.457944
Iteration 7, loss = 0.53778869
Validation score: 0.471963
Iteration 8, loss = 0.27010461
Validation score: 0.467290
Iteration 9, loss = 0.14069682
Validation score: 0.467290
Iteration 10, loss = 0.09050792
Validation score: 0.481308
Iteration 11, loss = 0.05512860
Validation score: 0.462617
Iteration 12, loss = 0.04279216
Validation score: 0.490654
Iteration 13, loss = 0.03999546
Validation score: 0.467290
Iteration 14, loss = 0.03706466
Validation score: 0.485981
Iteration 15, loss = 0.03004769
Validation score: 0.481308
Validat

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 39.6min remaining:    0.0s


[CV] vect__ngram_range=(1, 3), vect__max_features=2500, tfidf__use_idf=False, clf__hidden_layer_sizes=(4096, 1024) 
Iteration 1, loss = 8.33965188
Validation score: 0.042056
Iteration 2, loss = 5.34418872
Validation score: 0.224299
Iteration 3, loss = 4.40409877
Validation score: 0.350467
Iteration 4, loss = 3.83880245
Validation score: 0.355140
Iteration 5, loss = 3.39383933
Validation score: 0.387850
Iteration 6, loss = 2.93836412
Validation score: 0.392523
Iteration 7, loss = 2.49116517
Validation score: 0.397196
Iteration 8, loss = 2.05921736
Validation score: 0.392523
Iteration 9, loss = 1.64385238
Validation score: 0.397196
Iteration 10, loss = 1.28200432
Validation score: 0.392523
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 3), vect__max_features=2500, tfidf__use_idf=False, clf__hidden_layer_sizes=(4096, 1024), score=0.40860215053763443, total= 1.5min
[CV] vect__ngram_range=(1, 3), vect__max_features=2

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 41.2min remaining:    0.0s


Iteration 1, loss = 8.29670063
Validation score: 0.168224
Iteration 2, loss = 5.36616712
Validation score: 0.238318
Iteration 3, loss = 4.44817882
Validation score: 0.271028
Iteration 4, loss = 3.84141687
Validation score: 0.289720
Iteration 5, loss = 3.37203429
Validation score: 0.317757
Iteration 6, loss = 2.91761168
Validation score: 0.322430
Iteration 7, loss = 2.47129096
Validation score: 0.355140
Iteration 8, loss = 2.05017780
Validation score: 0.359813
Iteration 9, loss = 1.65427666
Validation score: 0.378505
Iteration 10, loss = 1.26835294
Validation score: 0.397196
Iteration 11, loss = 0.97798107
Validation score: 0.401869
Iteration 12, loss = 0.72802888
Validation score: 0.392523
Iteration 13, loss = 0.54551809
Validation score: 0.406542
Iteration 14, loss = 0.40279123
Validation score: 0.397196
Iteration 15, loss = 0.30280283
Validation score: 0.397196
Iteration 16, loss = 0.22626666
Validation score: 0.397196
Validation score did not improve more than tol=0.000100 for two c

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 43.6min remaining:    0.0s


Iteration 1, loss = 13.39800502
Validation score: 0.014019
Iteration 2, loss = 9.09619269
Validation score: 0.014019
Iteration 3, loss = 6.10113403
Validation score: 0.032710
Iteration 4, loss = 5.42841348
Validation score: 0.065421
Iteration 5, loss = 5.08465631
Validation score: 0.219626
Iteration 6, loss = 4.80180043
Validation score: 0.238318
Iteration 7, loss = 4.49469448
Validation score: 0.378505
Iteration 8, loss = 4.19745652
Validation score: 0.411215
Iteration 9, loss = 3.92348671
Validation score: 0.429907
Iteration 10, loss = 3.67178218
Validation score: 0.439252
Iteration 11, loss = 3.44160106
Validation score: 0.453271
Iteration 12, loss = 3.21605738
Validation score: 0.453271
Iteration 13, loss = 2.99103320
Validation score: 0.462617
Iteration 14, loss = 2.77610044
Validation score: 0.467290
Iteration 15, loss = 2.56192770
Validation score: 0.467290
Iteration 16, loss = 2.35848451
Validation score: 0.462617
Iteration 17, loss = 2.16090893
Validation score: 0.462617
Valid

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 44.2min remaining:    0.0s


Iteration 1, loss = 13.39468641
Validation score: 0.014019
Iteration 2, loss = 9.06917057
Validation score: 0.014019
Iteration 3, loss = 6.07206942
Validation score: 0.182243
Iteration 4, loss = 5.40355225
Validation score: 0.028037
Iteration 5, loss = 5.08669843
Validation score: 0.186916
Iteration 6, loss = 4.86012696
Validation score: 0.196262
Iteration 7, loss = 4.57970335
Validation score: 0.228972
Iteration 8, loss = 4.28835697
Validation score: 0.285047
Iteration 9, loss = 4.00554868
Validation score: 0.341121
Iteration 10, loss = 3.73835296
Validation score: 0.350467
Iteration 11, loss = 3.49258402
Validation score: 0.350467
Iteration 12, loss = 3.26051616
Validation score: 0.369159
Iteration 13, loss = 3.03416642
Validation score: 0.383178
Iteration 14, loss = 2.80746159
Validation score: 0.383178
Iteration 15, loss = 2.59761065
Validation score: 0.383178
Iteration 16, loss = 2.39495518
Validation score: 0.387850
Iteration 17, loss = 2.19990160
Validation score: 0.397196
Itera

[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 45.0min remaining:    0.0s


Iteration 1, loss = 10.85639133
Validation score: 0.014019
Iteration 2, loss = 5.98910346
Validation score: 0.014019
Iteration 3, loss = 5.18175590
Validation score: 0.210280
Iteration 4, loss = 4.42362015
Validation score: 0.373832
Iteration 5, loss = 3.70880872
Validation score: 0.373832
Iteration 6, loss = 3.12227011
Validation score: 0.369159
Iteration 7, loss = 2.58434754
Validation score: 0.383178
Iteration 8, loss = 2.08561412
Validation score: 0.420561
Iteration 9, loss = 1.62625509
Validation score: 0.425234
Iteration 10, loss = 1.21722551
Validation score: 0.439252
Iteration 11, loss = 0.87386463
Validation score: 0.462617
Iteration 12, loss = 0.61450841
Validation score: 0.457944
Iteration 13, loss = 0.41422801
Validation score: 0.462617
Iteration 14, loss = 0.28201531
Validation score: 0.467290
Iteration 15, loss = 0.18891339
Validation score: 0.462617
Iteration 16, loss = 0.13263297
Validation score: 0.471963
Iteration 17, loss = 0.09587231
Validation score: 0.467290
Itera

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 46.8min remaining:    0.0s


Iteration 1, loss = 10.88796735
Validation score: 0.004673
Iteration 2, loss = 6.02847678
Validation score: 0.014019
Iteration 3, loss = 5.15745311
Validation score: 0.168224
Iteration 4, loss = 4.39075341
Validation score: 0.350467
Iteration 5, loss = 3.66226244
Validation score: 0.387850
Iteration 6, loss = 3.03256807
Validation score: 0.397196
Iteration 7, loss = 2.47105476
Validation score: 0.420561
Iteration 8, loss = 1.97291565
Validation score: 0.439252
Iteration 9, loss = 1.51767633
Validation score: 0.448598
Iteration 10, loss = 1.12257954
Validation score: 0.443925
Iteration 11, loss = 0.80581820
Validation score: 0.434579
Iteration 12, loss = 0.56583214
Validation score: 0.467290
Iteration 13, loss = 0.39339775
Validation score: 0.448598
Iteration 14, loss = 0.27489509
Validation score: 0.453271
Iteration 15, loss = 0.19996871
Validation score: 0.453271
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 2

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 48.3min remaining:    0.0s


Iteration 1, loss = 11.33435019
Validation score: 0.018692
Iteration 2, loss = 6.03764422
Validation score: 0.280374
Iteration 3, loss = 5.15689849
Validation score: 0.112150
Iteration 4, loss = 4.30326354
Validation score: 0.406542
Iteration 5, loss = 3.49599438
Validation score: 0.401869
Iteration 6, loss = 2.80338504
Validation score: 0.420561
Iteration 7, loss = 2.17593502
Validation score: 0.439252
Iteration 8, loss = 1.61268850
Validation score: 0.471963
Iteration 9, loss = 1.16728430
Validation score: 0.485981
Iteration 10, loss = 0.79675568
Validation score: 0.490654
Iteration 11, loss = 0.52770556
Validation score: 0.467290
Iteration 12, loss = 0.34477753
Validation score: 0.481308
Iteration 13, loss = 0.22126327
Validation score: 0.471963
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 2), vect__max_features=25000, tfidf__use_idf=True, clf__hidden_layer_sizes=(1024, 512), score=0.4544179523141655, total

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 51.1min remaining:    0.0s


Iteration 1, loss = 11.38288603
Validation score: 0.018692
Iteration 2, loss = 5.99082984
Validation score: 0.261682
Iteration 3, loss = 5.17098169
Validation score: 0.200935
Iteration 4, loss = 4.30646853
Validation score: 0.350467
Iteration 5, loss = 3.45755353
Validation score: 0.387850
Iteration 6, loss = 2.71815401
Validation score: 0.434579
Iteration 7, loss = 2.06409155
Validation score: 0.434579
Iteration 8, loss = 1.49491309
Validation score: 0.448598
Iteration 9, loss = 1.03072098
Validation score: 0.457944
Iteration 10, loss = 0.68624883
Validation score: 0.471963
Iteration 11, loss = 0.43623048
Validation score: 0.457944
Iteration 12, loss = 0.27951448
Validation score: 0.453271
Iteration 13, loss = 0.18021092
Validation score: 0.453271
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 2), vect__max_features=25000, tfidf__use_idf=True, clf__hidden_layer_sizes=(1024, 512), score=0.47171575502571295, tota

Iteration 2, loss = 5.18235452
Validation score: 0.294393
Iteration 3, loss = 4.06946609
Validation score: 0.378505
Iteration 4, loss = 3.42844916
Validation score: 0.387850
Iteration 5, loss = 2.83040279
Validation score: 0.397196
Iteration 6, loss = 2.23557077
Validation score: 0.406542
Iteration 7, loss = 1.68436383
Validation score: 0.415888
Iteration 8, loss = 1.21256868
Validation score: 0.425234
Iteration 9, loss = 0.83616224
Validation score: 0.429907
Iteration 10, loss = 0.55335321
Validation score: 0.425234
Iteration 11, loss = 0.35439361
Validation score: 0.425234
Iteration 12, loss = 0.22415836
Validation score: 0.420561
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
[CV]  vect__ngram_range=(1, 2), vect__max_features=2500, tfidf__use_idf=True, clf__hidden_layer_sizes=(4096, 1024), score=0.45114539504441326, total= 1.8min
[CV] vect__ngram_range=(1, 2), vect__max_features=2500, tfidf__use_idf=True, clf__hidden_layer_sizes=(4096, 

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 81.7min finished


Iteration 1, loss = 7.05014292
Validation score: 0.336449
Iteration 2, loss = 3.86136451
Validation score: 0.411215
Iteration 3, loss = 2.74549708
Validation score: 0.478972
Iteration 4, loss = 1.72115282
Validation score: 0.495327
Iteration 5, loss = 0.92737540
Validation score: 0.514019
Iteration 6, loss = 0.45040736
Validation score: 0.511682
Iteration 7, loss = 0.20937624
Validation score: 0.511682
Iteration 8, loss = 0.11273277
Validation score: 0.500000
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


In [11]:
# Cross-validation score of the best parameter setting found by the search.
best_score = rs_clf.best_score_
print(best_score)

0.47639083683964467


In [12]:
# Parameter setting that achieved `best_score_` in the search.
best_params = rs_clf.best_params_
print(best_params)

{'vect__ngram_range': (1, 4), 'vect__max_features': 7500, 'tfidf__use_idf': True, 'clf__hidden_layer_sizes': (4096, 1024)}


In [13]:
# Full configuration of the search object (search settings plus the nested
# estimator parameters) as a dict, printed for the record.
rs_clf_params = rs_clf.get_params()
print(rs_clf_params)

{'cv': 2, 'error_score': 'raise', 'estimator__memory': None, 'estimator__steps': [('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=True,
       warm_start=False))], '

In [14]:
# Predict the encoded label matrix for the raw test texts via the search object.
rs_predicted = rs_clf.predict(X_test)
#print(predicted)

In [16]:
# Precision is a measure of result relevancy: the share of predicted labels
# that are correct. average='samples' computes the score per test sample and
# averages over the samples.
rs_precision = precision_score(encoded_y_test, rs_predicted, average='samples')
print(rs_precision)

0.7979825433007968


  'precision', 'predicted', average, warn_for)


In [17]:
# Recall is a measure of how many truly relevant results are returned: the
# share of true labels that were predicted, averaged over the test samples.
rs_recall = recall_score(encoded_y_test, rs_predicted, average='samples')  
print(rs_recall)

0.707436631792234


In [18]:
# F1 score is the harmonic mean of precision and recall, computed per test
# sample and averaged over the samples (average='samples').
rs_f1 = f1_score(encoded_y_test, rs_predicted, average='samples') 
print(rs_f1)

0.7282131241187879


  'precision', 'predicted', average, warn_for)


In [19]:
# Per-class precision, recall, F1 and support. No target names are passed, so
# the rows are labelled with the column index of each class in the encoding.
print(classification_report(encoded_y_test, rs_predicted))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       173
          1       1.00      0.49      0.66       265
          2       0.00      0.00      0.00       224
          3       0.00      0.00      0.00       173
          4       0.64      0.50      0.57      2590
          5       0.00      0.00      0.00       161
          6       0.87      0.93      0.90     11321
          7       0.69      0.42      0.52      2248
          8       0.67      0.03      0.05       399
          9       0.80      0.38      0.51      1286
         10       0.85      0.17      0.28       770
         11       0.00      0.00      0.00       202
         12       1.00      0.13      0.23       542
         13       0.71      0.39      0.50      2301
         14       0.86      0.44      0.59      1477
         15       0.94      0.50      0.65       606
         16       0.83      0.77      0.80      7142
         17       0.84      0.76      0.80   

  'precision', 'predicted', average, warn_for)


Ergebnisse in Dateien speichern

In [20]:
# Directory that receives all result files of the MLP experiments.
output = '../MLP'
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# which avoids the race between a separate existence check and the creation.
os.makedirs(output, exist_ok=True)

# Minute-resolution timestamp used in the names of the result files.
timestamp = time.strftime('%Y-%m-%d_%H.%M')

In [21]:
# Decode the label matrix predicted by the search, show the first prediction
# next to the true labels, then write all pairs to a timestamped text file.
def _label_run(names):
    """Return the sorted labels, each followed by ', ', as one string."""
    return ''.join('%s, ' % name for name in sorted(names))


inverse_prediction = label_encoder.inverse_transform(rs_predicted)
print('PREDICTED:')
print(inverse_prediction[0])
print('TRUE:')
print(y_test[0])

with open(output+'/MLP_themes_only_rs_predictions_%s.txt' % timestamp,"w+", encoding="utf8") as preds:
    preds.write("Predictions from classification with Multi-Layer-Perzeptron and vectorization in scikit-learn (themes only):\n\n")
    for ident, label, pred in zip(z_test, y_test, inverse_prediction):
        record_parts = [
            ident, '\n',
            'TRUE: ', _label_run(label), '\n',
            'PRED: ', _label_run(pred), '\n',
            '\n*********************\n',
        ]
        preds.writelines(record_parts)
    

PREDICTED:
('histoire_t',)
TRUE:
['histoire_t', 'époque moderne_t', 'moyen âge_t']


In [22]:
# Write the search configuration, the best parameters and the scores of this
# run to a timestamped text file.
report_chunks = [
    "Parameters for classification with Multi-Layer-Perceptron and vectorization in scikit-learn from randomized search (themes only):",
    "\nprocessing_time: %s" % rs_processing_time,
    "\nparams:",
]
# One "key: value" line per parameter of the search object.
report_chunks.extend("\n%s: %s" % (key, value) for key, value in rs_clf_params.items())
report_chunks.append("\nbest params:")
report_chunks.extend("\n%s: %s" % (key, value) for key, value in best_params.items())
report_chunks += [
    "\nbest_score: %s" % best_score,
    "\nprecision: %s" % rs_precision,
    "\nrecall: %s" % rs_recall,
    "\nf1-score: %s" % rs_f1,
]

with open(output+'/MLP_themes_only_rs_params_%s.txt' % timestamp,"w+", encoding="utf8") as params:
    params.writelines(report_chunks)

In [23]:
# Turn the per-candidate cross-validation results into a DataFrame, print it
# and store it as a timestamped CSV file in the output directory.
results = rs_clf.cv_results_
df = pd.DataFrame(data=results)
print(df)
df.to_csv(output+'/MLP_themes_only_rs_results_%s.csv' % timestamp, encoding='utf-8')



   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0    1183.272370    291.542017         2.687812        0.031244   
1     115.247501     25.221695         1.953352        0.031260   
2      37.715464      6.274170         1.750210        0.015628   
3      96.042729     10.954436         1.070457        0.007830   
4     168.191975      0.093761         1.195456        0.039067   
5     228.214098     18.831045         2.617484        0.007812   
6       4.125466      0.015626         0.562566        0.015629   
7      25.409124     11.345033         1.429849        0.007813   
8     459.773627      0.078133         1.343913        0.000001   
9     100.081618      4.445804         1.531425        0.015628   

  param_vect__ngram_range param_vect__max_features param_tfidf__use_idf  \
0                  (1, 3)                    50000                False   
1                  (1, 3)                     2500                False   
2                  (1, 4)            