# Textklassifikation mit Vektorisierung in scikit-learn und MLPClassifier 
Labels (Themen und Disziplinen) sind nicht reduziert (all_labels)

Autorin: Maria Hartmann

In [13]:
# Imports
import csv
import os
import time

import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.preprocessing import MultiLabelBinarizer # module to one-hot-encode the labels
from sklearn.pipeline import Pipeline # assemples transormers 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer # module to transform a count matrix to a normalized tf-idf representation
from sklearn.neural_network import MLPClassifier # MultiLayerPerceptron classifier 
from sklearn.model_selection import RandomizedSearchCV # module for paramter optimization
from sklearn.metrics import accuracy_score # used when evaluating the randomized-search model
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

np.random.seed(7) # fix random seed for reproducibility

Einlesen des Trainings- und Testdatensatzes

In [2]:
# Locations of the semicolon-separated training and test CSV files
trainset = '../Datasets/all_labels_trainset.csv'
testset = '../Datasets/all_labels_testset.csv'

trainset_csv = pd.read_csv(trainset, delimiter=';')
testset_csv = pd.read_csv(testset, delimiter=';')

# Column 'text' -> X (input texts), column 'filename' -> z (document identifiers)
X_train, z_train = trainset_csv['text'].values, trainset_csv['filename'].values
X_test, z_test = testset_csv['text'].values, testset_csv['filename'].values

# Column 'classes' holds all labels of a blog post as one ', '-separated string;
# split it into a list of labels per blog post
y_train = [label_string.split(', ') for label_string in trainset_csv['classes'].values]
y_test = [label_string.split(', ') for label_string in testset_csv['classes'].values]

In [3]:
# Show the first training sample: filename, label list and text
print(z_train[0])
print(y_train[0])
print(X_train[0])

nummer_212.txt
['histoire_d', "sciences de l'information et de la communication_d", 'bibliothéconomie_d', 'histoire_t', 'histoire intellectuelle_t', 'histoire et sociologie des médias_t', 'histoire culturelle_t']
die gemälde der habsburgischen sammlungen zu wien wurden von der stallburg ins belvedere transferiert und dort von christian von mechel neu angeordnet und aufgehängt


k-hot-Kodierung der Labels

In [4]:
# k-hot-encode the labels with MultiLabelBinarizer:
# the encoder is fitted on the training labels only and then reused to encode the test labels
label_encoder = MultiLabelBinarizer()
encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_test = label_encoder.transform(y_test)
# Show the encoded label vector of the first training sample
print(encoded_y_train[0])


[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0
 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]


In [5]:
# Print the number of distinct labels, followed by every label with its column index
label_names = label_encoder.classes_
print(len(label_names))
for position in range(len(label_names)):
    print(position, label_names[position])

114
0 1914-1918_t
1 1918-1939_t
2 1939-1945_t
3 1945-1989_t
4 administration publique et développement_d
5 anthropologie politique_t
6 approches de corpus_t
7 archives_t
8 archéologie_d
9 arts et humanités_d
10 arts_d
11 asie_t
12 bas moyen âge_t
13 bibliothéconomie_d
14 biomédecine_d
15 chine_t
16 communication_d
17 conflits_t
18 digital humanities_t
19 enquêtes_t
20 europe centrale et orientale_t
21 europe_t
22 france_t
23 guerres_t
24 haut moyen âge_t
25 histoire culturelle_t
26 histoire de l'art_t
27 histoire des religions_t
28 histoire des sciences sociales_d
29 histoire des sciences_t
30 histoire du droit_t
31 histoire et archéologie_d
32 histoire et philosophie des sciences_d
33 histoire et sociologie des médias_t
34 histoire industrielle_t
35 histoire intellectuelle_t
36 histoire politique_t
37 histoire sociale_t
38 histoire urbaine_t
39 histoire économique_t
40 histoire_d
41 histoire_t
42 historiographie_t
43 humanités pluridisciplinaires_d
44 information_t
45 langage_t
46 lan

Vektorisierung und Klassifikation der Daten mit scikit-learn

In [6]:
# Size of the vocabulary kept by the vectorizer
max_features = 10000

# Read the stop word list (one word per line); the context manager makes sure
# the file handle is closed again instead of being left to the garbage collector
with open('../Preprocessing/filtered_words.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = stopword_file.read().splitlines()

# Unigram counts restricted to the max_features vocabulary, then tf-idf weighting
vectorizer = CountVectorizer(ngram_range=(1,1), max_features=max_features, stop_words=stopwords)
tfidf_transformer = TfidfTransformer(use_idf=True)

In [7]:
# NOTE: the triple-quoted string below is a disabled earlier experiment. Because it is
# only a string literal, nothing in it is executed; the cell merely echoes the string.
"""# first try with best params for vect and tfidf from kNN classification
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,4), max_df=0.9, min_df=0.01)),#min_df=0.0 auf min_df=0.01 geändert
                     #('vect', CountVectorizer(ngram_range=(1,4), max_features=max_features)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', MLPClassifier(hidden_layer_sizes=(1024,512), max_iter=500, validation_fraction=0.1, early_stopping=True, verbose=True, random_state=1)),
                    ])"""

"# first try with best params for vect and tfidf from kNN classification\ntext_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,4), max_df=0.9, min_df=0.01)),#min_df=0.0 auf min_df=0.01 geändert\n                     #('vect', CountVectorizer(ngram_range=(1,4), max_features=max_features)),\n                     ('tfidf', TfidfTransformer(use_idf=True)),\n                     ('clf', MLPClassifier(hidden_layer_sizes=(1024,512), max_iter=500, validation_fraction=0.1, early_stopping=True, verbose=True, random_state=1)),\n                    ])"

In [8]:
# Classifier step: MLP with two hidden layers; training stops early based on
# a 10% validation split
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(4096, 1024),
    tol=0.0001,
    early_stopping=True,
    validation_fraction=0.1,
    verbose=True,
    random_state=1,
)

# Full pipeline: token counts -> tf-idf weighting -> MLP
text_clf = Pipeline(steps=[
    ('vect', vectorizer),
    ('tfidf', tfidf_transformer),
    ('clf', mlp_classifier),
])

In [9]:
# train the whole pipeline and record the wall-clock training time in minutes
start = time.time()
text_clf.fit(X_train, encoded_y_train)  # fit() returns the pipeline itself, no re-binding needed
elapsed_seconds = time.time() - start
processing_time = elapsed_seconds / 60

Iteration 1, loss = 19.21366700
Validation score: 0.384570
Iteration 2, loss = 9.46798991
Validation score: 0.458796
Iteration 3, loss = 5.49554281
Validation score: 0.513735
Iteration 4, loss = 2.98014910
Validation score: 0.540620
Iteration 5, loss = 1.62345869
Validation score: 0.542957
Iteration 6, loss = 0.94501900
Validation score: 0.544126
Iteration 7, loss = 0.62290702
Validation score: 0.541788
Iteration 8, loss = 0.48465459
Validation score: 0.538866
Iteration 9, loss = 0.40534940
Validation score: 0.542373
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


In [10]:
# Fetch the parameter dictionary of the fitted pipeline (including its steps) and print it
clf_params = text_clf.get_params()
print(clf_params)

{'memory': None, 'steps': [('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['und', 'die', 'der'], strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(4096, 1024), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=True,
       warm_start=False))], 'vect': CountVectorizer(analyzer

In [11]:
# predict the encoded label matrix for the test texts
predicted = text_clf.predict(X_test)
# probability output is disabled; only the hard predictions are evaluated below
#predicted_proba = text_clf.predict_proba(X_test)

In [14]:
# precision is a measure of result relevancy
# average='samples': the score is computed per sample (blog post) and then averaged over all samples
precision = precision_score(encoded_y_test, predicted, average='samples')
print(precision)

0.8211091373433589


  'precision', 'predicted', average, warn_for)


In [15]:
# recall is a measure of how many truly relevant results are returned
# average='samples': the score is computed per sample (blog post) and then averaged over all samples
recall = recall_score(encoded_y_test, predicted, average='samples')  
print(recall)

0.7192799817449186


In [16]:
# F1 score is the harmonic mean of precision and recall
# average='samples': the score is computed per sample (blog post) and then averaged over all samples
f1 = f1_score(encoded_y_test, predicted, average='samples') 
print(f1)

0.7443742252177321


  'precision', 'predicted', average, warn_for)


In [17]:
# Activation function of the MLP output layer as chosen by the fitted classifier.
# The former chained alias `tfidf_output` was dropped: it was never used and its
# name had nothing to do with the value it held.
out_act = text_clf.named_steps.clf.out_activation_
print(out_act)

logistic


In [18]:
# Directory for all result files of this notebook.
# exist_ok=True creates the directory only if it is missing, without the separate
# (race-prone) existence check.
output = '../MLP'
os.makedirs(output, exist_ok=True)

In [19]:
# write first parameters and scores to file
# NOTE: disabled — the code below is wrapped in a string literal, so nothing is written;
# the cell merely echoes the string.
"""with open(output+'/MLP_all_labels_first_params.txt',"w+", encoding="utf8") as params:
#with open(output+'/MLP_all_labels_first_params_max_features.txt',"w+", encoding="utf8") as params:
    params.write("First parameters for classification with MLP (all labels):")
    params.write("\nprocessing_time: %s" % processing_time)
    for key, value in clf_params.items():
        params.write("\n%s: %s" % (key, value))
    params.write("\nactivation function output layer: %s" % text_clf.named_steps.clf.out_activation_)    
    params.write("\nprecision: %s" % precision)
    params.write("\nrecall: %s" % recall)
    params.write("\nf1-score: %s" % f1)"""

'with open(output+\'/MLP_all_labels_first_params.txt\',"w+", encoding="utf8") as params:\n#with open(output+\'/MLP_all_labels_first_params_max_features.txt\',"w+", encoding="utf8") as params:\n    params.write("First parameters for classification with MLP (all labels):")\n    params.write("\nprocessing_time: %s" % processing_time)\n    for key, value in clf_params.items():\n        params.write("\n%s: %s" % (key, value))\n    params.write("\nactivation function output layer: %s" % text_clf.named_steps.clf.out_activation_)    \n    params.write("\nprecision: %s" % precision)\n    params.write("\nrecall: %s" % recall)\n    params.write("\nf1-score: %s" % f1)'

In [20]:
# write parameters and scores to file
# The report is assembled as a list of text fragments first and then appended
# to the log file in one go (mode "a" keeps the reports of earlier runs).
pipeline_steps = text_clf.named_steps
mlp_step = pipeline_steps.clf

report_fragments = [
    "\n*********************************************************************************************",
    "\nParameters for classification with MLP (all labels):",
    "\n*********************************************************************************************",
]
# textual representation of every pipeline step
report_fragments.extend("\n%s" % step for step in (pipeline_steps.vect, pipeline_steps.tfidf, mlp_step))
# facts about the fitted network and the training run
report_fragments.extend("\n%s: %s" % entry for entry in (
    ("classes", mlp_step.n_outputs_),
    ("layers", mlp_step.n_layers_),
    ("activation function output layer", mlp_step.out_activation_),
    ("epochs", mlp_step.n_iter_),
    ("processing time", processing_time),
))
report_fragments.append("\nSCORES:")
report_fragments.extend("\n%s: %s" % entry for entry in (
    ("precision", precision),
    ("recall", recall),
    ("f1-score", f1),
))
report_fragments.append("\n")

with open(output+'/MLP_all_labels_params.txt',"a", encoding="utf8") as params:
    params.write("".join(report_fragments))

In [23]:
# write real labels and predictions to file

# decode the k-hot prediction matrix back into tuples of label names
inverse_prediction = label_encoder.inverse_transform(predicted)
print('PREDICTED:')
print(inverse_prediction[0])
print('TRUE:')
print(y_test[0])

with open(output+'/MLP_all_labels_predictions.txt',"w+", encoding="utf8") as preds:
    preds.write("Predictions from classification with Multi-Layer-Perzeptron and vectorization in scikit-learn (all labels):\n\n")
    for ident, true_labels, predicted_labels in zip(z_test, y_test, inverse_prediction):
        # labels are written alphabetically sorted, each followed by ', '
        true_line = ''.join('%s, ' % name for name in sorted(true_labels))
        pred_line = ''.join('%s, ' % name for name in sorted(predicted_labels))
        preds.writelines([
            ident, '\n',
            'TRUE: ', true_line, '\n',
            'PRED: ', pred_line, '\n',
            '\n*********************\n',
        ])

PREDICTED:
('approches de corpus_t', 'archives_t', 'enquêtes_t', 'histoire et archéologie_d', 'histoire_t', 'sciences sociales interdisciplinaires_d')
TRUE:
['histoire et archéologie_d', 'sciences sociales interdisciplinaires_d', 'histoire_t', 'approches de corpus_t', 'enquêtes_t', 'archives_t']


Speicherung der vektorisierten Daten

In [24]:
def _strip_txt(filename):
    """Remove every occurrence of '.txt' from the filename."""
    return filename.replace('.txt', '')


def _to_url(stem):
    """Turn '<blog>_<id>' into '<blog>.hypotheses.org/<id>' (every '_' is replaced)."""
    return stem.replace('_', '.hypotheses.org/')


# filenames without extension, and the blog-post URLs derived from them
z_train = list(map(_strip_txt, z_train))
z_test = list(map(_strip_txt, z_test))
ident_train = list(map(_to_url, z_train))
ident_test = list(map(_to_url, z_test))

print(len(ident_train))
print(ident_train[0])

17109
nummer.hypotheses.org/212


In [25]:
# vectorize textdata
# Only transform() is called here: vectorizer and tfidf_transformer are the same objects
# that were fitted as steps of text_clf, so this cell relies on text_clf having been trained.
train_vect, test_vect = (vectorizer.transform(texts) for texts in (X_train, X_test))
train_tfidf, test_tfidf = (tfidf_transformer.transform(counts) for counts in (train_vect, test_vect))
print(train_tfidf.shape)

(17109, 10000)


In [26]:
# Inspect the container type of the vectorized test data; the bare last
# expression makes the notebook display the summary of the training matrix
print(type(test_tfidf))
train_tfidf

<class 'scipy.sparse.csr.csr_matrix'>


<17109x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 2388086 stored elements in Compressed Sparse Row format>

In [27]:
# save filename, classes, textvectors in csv file

def save_vectorized_split(tfidf_matrix, idents, labels, matrix_file, index_file):
    """Store one vectorized data split (replaces the duplicated train/test code).

    Writes the sparse matrix as an .npz file and a ';'-delimited CSV with one row
    per (ident, labels) pair: the ident, the labels joined with ', ' and the path
    of the matrix file.

    Args:
        tfidf_matrix: sparse matrix passed to scipy.sparse.save_npz.
        idents: iterable of document URLs.
        labels: iterable of label lists, paired with ``idents`` by position.
        matrix_file: path of the .npz file, relative to the parent directory ('../').
        index_file: path of the CSV file, relative to the parent directory ('../').
    """
    # stores the vectorized text
    scipy.sparse.save_npz('../' + matrix_file, tfidf_matrix)

    # stores urls and classes; every row also references the matrix file
    with open('../' + index_file, 'w', newline='', encoding="utf-8") as index_csv:
        writer = csv.writer(index_csv, delimiter=";")
        writer.writerow(["url", "classes", "filename"])
        writer.writerows([ident, ", ".join(doc_labels), matrix_file]
                         for ident, doc_labels in zip(idents, labels))


# trainset
output_file_train = 'Datasets/all_labels_train_scikit-learn_sparse_matrix.npz'
save_vectorized_split(train_tfidf, ident_train, y_train, output_file_train,
                      'Datasets/all_labels_train_idents_labels.csv')

# testset
output_file_test = 'Datasets/all_labels_test_scikit-learn_sparse_matrix.npz'
save_vectorized_split(test_tfidf, ident_test, y_test, output_file_test,
                      'Datasets/all_labels_test_idents_labels.csv')

Speicherung der korpusspezifischen Stoppwörter

In [28]:
# write corpus specific stopwords to file

# stop_words_ of the fitted vectorizer holds the terms it ignored. It gets its own name
# so that `stopwords` (the list read from filtered_words.txt) is not silently rebound
# to a different kind of object.
filtered_terms = text_clf.named_steps.vect.stop_words_
print(len(filtered_terms))
#print(filtered_terms)
with open('../Preprocessing/filtered_words_MLP.txt',"w+", encoding="utf8") as stops:
    # sorted: stop_words_ is a set, so without sorting the line order of the
    # file would differ from run to run
    stops.writelines('%s\n' % term for term in sorted(filtered_terms))

269816


Parameteroptimierung mit randomisierter Suche (RandomizedSearchCV)

In [29]:
# Base pipeline for the parameter search: vectorizer and tf-idf start with default
# settings; the MLP uses early stopping on a 10% validation split and a fixed random state
clf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MLPClassifier(validation_fraction=0.1, early_stopping=True, verbose=True, random_state=1)),
                ])

In [30]:
# parameter tuning with RandomSearch

# Re-read the stop word list (one word per line); the context manager closes the
# file handle again instead of leaving it open
with open('../Preprocessing/filtered_words.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = stopword_file.read().splitlines()

# Search space; keys follow the '<pipeline step>__<parameter>' convention
rs_parameters = {'vect__ngram_range': [(1,1),(1,2),(1,3),(1,4)], 
                 #'vect__max_df' : (0.7, 0.8, 0.9), #1.0 
                 #'vect__min_df' : (0.01, 0.05, 0.1), #0.0
                 'vect__stop_words' : (stopwords, None),
                 'vect__max_features': (100000,50000,25000,10000,7500,5000,2500,1000,500,300,100),
                 'tfidf__use_idf': (True, False),
                 'clf__hidden_layer_sizes': ((4096,2048),(2048,1024),(1024,512),(512,256),(256,128),(4096,1024),(2048,512),(512,128))
                }

In [32]:
# train: randomized search over rs_parameters (50 sampled candidates, 3-fold cross-validation)
rs_clf = RandomizedSearchCV(
    estimator=clf,
    param_distributions=rs_parameters,
    n_iter=50,
    cv=3,
    n_jobs=1,
    verbose=10,
    random_state=1,
)
start = time.time()
rs_clf.fit(X_train, encoded_y_train)  # fit() returns the search object itself
elapsed_seconds = time.time() - start
rs_processing_time = elapsed_seconds / 60  # minutes

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] vect__stop_words=['und', 'die', 'der'], vect__ngram_range=(1, 1), vect__max_features=1000, tfidf__use_idf=True, clf__hidden_layer_sizes=(2048, 512) 


KeyboardInterrupt: 

In [None]:
# Cross-validation score of the best parameter combination found by the search
best_score = rs_clf.best_score_
print(best_score)

In [None]:
# Parameter combination that achieved best_score_
best_params = rs_clf.best_params_
print(best_params)

In [None]:
# Parameter dictionary of the search object itself (written to the report file further below)
rs_clf_params = rs_clf.get_params()
print(rs_clf_params)

In [None]:
# predict the encoded label matrix for the test texts with the fitted search object
rs_predicted = rs_clf.predict(X_test)
#print(predicted)

In [None]:
# accuracy_score on multi-label (k-hot) matrices is the subset accuracy:
# a sample only counts as correct if its complete label set is predicted exactly.
# (accuracy_score is imported in the import cell at the top of the notebook.)
rs_accuracy_score = accuracy_score(encoded_y_test, rs_predicted)
print(rs_accuracy_score)

In [None]:
# precision is a measure of result relevancy
# average='samples': the score is computed per sample (blog post) and then averaged over all samples
rs_precision = precision_score(encoded_y_test, rs_predicted, average='samples')
print(rs_precision)

In [None]:
# recall is a measure of how many truly relevant results are returned
# average='samples': the score is computed per sample (blog post) and then averaged over all samples
rs_recall = recall_score(encoded_y_test, rs_predicted, average='samples')  
print(rs_recall)

In [None]:
# F1 score is the harmonic mean of precision and recall
# average='samples': the score is computed per sample (blog post) and then averaged over all samples
rs_f1 = f1_score(encoded_y_test, rs_predicted, average='samples') 
print(rs_f1)
# NOTE(review): '49,46' presumably records an F1 result (in percent) of an earlier run — confirm
#49,46

In [None]:
# Text report of precision, recall and F1 for the randomized-search predictions
print(classification_report(encoded_y_test, rs_predicted))

Ergebnisse in Dateien speichern

In [None]:
# Directory for the result files; exist_ok=True creates it only if it is missing,
# without the separate (race-prone) existence check
output = '../MLP'
os.makedirs(output, exist_ok=True)

# Timestamp (minute resolution) used in the result file names so runs do not overwrite each other
timestamp = time.strftime('%Y-%m-%d_%H.%M')

In [None]:
# write real labels and predictions to file

# decode the k-hot prediction matrix back into tuples of label names
inverse_prediction = label_encoder.inverse_transform(rs_predicted)
print('PREDICTED:')
print(inverse_prediction[0])
print('TRUE:')
print(y_test[0])

with open(output+'/MLP_all_labels_rs_predictions_%s.txt' % timestamp,"w+", encoding="utf8") as preds:
    preds.write("Predictions from classification with Multi-Layer-Perzeptron and vectorization in scikit-learn (all labels):\n\n")
    for ident, true_labels, predicted_labels in zip(z_test, y_test, inverse_prediction):
        # labels are written alphabetically sorted, each followed by ', '
        true_line = ''.join('%s, ' % name for name in sorted(true_labels))
        pred_line = ''.join('%s, ' % name for name in sorted(predicted_labels))
        preds.writelines([
            ident, '\n',
            'TRUE: ', true_line, '\n',
            'PRED: ', pred_line, '\n',
            '\n*********************\n',
        ])
    

In [None]:
# write parameters and scores to file
# The report is assembled as a list of text fragments first and then written in one go.
rs_report = [
    "Parameters for classification with Multi-Layer-Perceptron and vectorization in scikit-learn from randomized search (all labels):",
    "\nprocessing_time: %s" % rs_processing_time,
    "\nparams:",
]
# all parameters of the search object
rs_report.extend("\n%s: %s" % item for item in rs_clf_params.items())
rs_report.append("\nbest params:")
# winning parameter combination
rs_report.extend("\n%s: %s" % item for item in best_params.items())
rs_report.extend("\n%s: %s" % entry for entry in (
    ("best_score", best_score),
    ("precision", rs_precision),
    ("recall", rs_recall),
    ("f1-score", rs_f1),
))

with open(output+'/MLP_all_labels_rs_params_%s.txt' % timestamp,"w+", encoding="utf8") as params:
    params.write("".join(rs_report))

In [None]:
# Turn the cross-validation results of the search into a DataFrame,
# print it and store it as a timestamped CSV file in the output directory
results = rs_clf.cv_results_
df = pd.DataFrame(data=results)
print(df)
df.to_csv(output+'/MLP_all_labels_rs_results_%s.csv' % timestamp, encoding='utf-8')