# Textklassifikation mit Vektorisierung in gensim und MLPClassifier 
Labels (Themen und Disziplinen) sind nicht reduziert (all_labels)

Autorin: Maria Hartmann

In [23]:
# Imports
import os
import csv
import time
import numpy as np
import pandas as pd
import scipy.sparse
import multiprocessing # module for multiprocessing 
from sklearn.base import BaseEstimator
import gensim # module for Doc2Vec
from gensim.models.doc2vec import Doc2Vec
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from gensim.models import KeyedVectors
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.sklearn_api import D2VTransformer
from sklearn.preprocessing import MultiLabelBinarizer # module to one-hot-encode the labels
from sklearn.pipeline import Pipeline # assemples transormers 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer # module to transform a count matrix to a normalized tf-idf representation
from sklearn.neural_network import MLPClassifier # MultiLayerPerceptron classifier 
from sklearn.model_selection import RandomizedSearchCV # module for paramter optimization

# Seed NumPy's global random number generator so repeated runs draw the same NumPy random numbers
np.random.seed(7)

Einlesen des Trainings- und Testdatensatzes

In [2]:
# Locations of the semicolon-separated training and test CSV files
trainset = '../Datasets/all_labels_trainset.csv'
testset = '../Datasets/all_labels_testset.csv'

# Training split: text, label string and filename column of every row
trainset_csv = pd.read_csv(trainset, delimiter=';')
X_train, y_train, z_train = (
    trainset_csv[column].values for column in ('text', 'classes', 'filename')
)

# Test split: the same three columns
testset_csv = pd.read_csv(testset, delimiter=';')
X_test, y_test, z_test = (
    testset_csv[column].values for column in ('text', 'classes', 'filename')
)

# Break each label string into the list of labels of one blog post
y_train = [label_string.split(', ') for label_string in y_train]
y_test = [label_string.split(', ') for label_string in y_test]

# Break each text into words at single spaces
X_train = [document.split(' ') for document in X_train]
X_test = [document.split(' ') for document in X_test]

In [3]:
# Show filename, label list and token list of the first training sample
for column in (z_train, y_train, X_train):
    print(column[0])

nummer_212.txt
['histoire_d', "sciences de l'information et de la communication_d", 'bibliothéconomie_d', 'histoire_t', 'histoire intellectuelle_t', 'histoire et sociologie des médias_t', 'histoire culturelle_t']
['die', 'gemälde', 'der', 'habsburgischen', 'sammlungen', 'zu', 'wien', 'wurden', 'von', 'der', 'stallburg', 'ins', 'belvedere', 'transferiert', 'und', 'dort', 'von', 'christian', 'von', 'mechel', 'neu', 'angeordnet', 'und', 'aufgehängt']


Stoppwortfilterung

In [4]:
def remove_stopwords(X_train, stopword_file='../Preprocessing/filtered_words.txt'):
    """Drop every stopword token from a collection of tokenized texts.

    Parameters
    ----------
    X_train : iterable of iterable of str
        Tokenized texts; each text is a sequence of word strings.
        The input is not modified.
    stopword_file : str, optional
        Path to a UTF-8 text file holding one stopword per line. Defaults to
        the list this notebook has always used, so existing calls behave as
        before.

    Returns
    -------
    list of list of str
        One list per input text, containing its tokens in the original order
        with all stopwords removed.

    Raises
    ------
    OSError
        If ``stopword_file`` cannot be opened.
    """
    # The context manager closes the file handle, which was previously left open.
    with open(stopword_file, 'r', encoding='utf-8') as handle:
        # A set gives constant-time membership tests instead of scanning the
        # whole stopword list once per token.
        stopwords = set(handle.read().splitlines())

    return [[word for word in text if word not in stopwords] for text in X_train]

# Filter the training texts; X_test is not passed through remove_stopwords in the cells shown here
X_train = remove_stopwords(X_train)

k-hot-Kodierung der Labels

In [5]:
# Learn the label vocabulary from the training labels, then k-hot-encode both splits with it
label_encoder = MultiLabelBinarizer().fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)
print(encoded_y_train[0])

[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0
 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]


In [6]:
# Print the number of distinct labels, then every label next to its column index
label_names = label_encoder.classes_
print(len(label_names))
for column_index in range(len(label_names)):
    print(column_index, label_names[column_index])

114
0 1914-1918_t
1 1918-1939_t
2 1939-1945_t
3 1945-1989_t
4 administration publique et développement_d
5 anthropologie politique_t
6 approches de corpus_t
7 archives_t
8 archéologie_d
9 arts et humanités_d
10 arts_d
11 asie_t
12 bas moyen âge_t
13 bibliothéconomie_d
14 biomédecine_d
15 chine_t
16 communication_d
17 conflits_t
18 digital humanities_t
19 enquêtes_t
20 europe centrale et orientale_t
21 europe_t
22 france_t
23 guerres_t
24 haut moyen âge_t
25 histoire culturelle_t
26 histoire de l'art_t
27 histoire des religions_t
28 histoire des sciences sociales_d
29 histoire des sciences_t
30 histoire du droit_t
31 histoire et archéologie_d
32 histoire et philosophie des sciences_d
33 histoire et sociologie des médias_t
34 histoire industrielle_t
35 histoire intellectuelle_t
36 histoire politique_t
37 histoire sociale_t
38 histoire urbaine_t
39 histoire économique_t
40 histoire_d
41 histoire_t
42 historiographie_t
43 humanités pluridisciplinaires_d
44 information_t
45 langage_t
46 lan

Vektorisierung der Texte mit gensim (Doc2Vec) und Klassifikation mit MLPClassifier

In [40]:
# Doc2Vec vectorizer from gensim's scikit-learn wrapper; all explicit settings are collected in one dict
doc2vec_settings = dict(dm=0, window=10, iter=20, size=100, min_count=4, sample=0)
vectorizer = D2VTransformer(**doc2vec_settings)

In [41]:
# Two-step pipeline: the Doc2Vec vectorizer feeds a multi-layer perceptron classifier
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(4096, 1024),
    validation_fraction=0.1,
    early_stopping=True,
    verbose=True,
    random_state=1,
)
text_clf = Pipeline(steps=[('vect', vectorizer), ('clf', mlp_classifier)])

In [42]:
# Fit the whole pipeline on the training data and record the duration in minutes
start = time.time()
text_clf.fit(X_train, encoded_y_train)  # fit() returns the pipeline itself, so text_clf stays bound to it
elapsed_seconds = time.time() - start
processing_time = elapsed_seconds / 60



Iteration 1, loss = 15.39062418
Validation score: 0.386908
Iteration 2, loss = 9.15380830
Validation score: 0.437171
Iteration 3, loss = 7.78852290
Validation score: 0.459965
Iteration 4, loss = 6.81823120
Validation score: 0.496201
Iteration 5, loss = 6.05225084
Validation score: 0.501461
Iteration 6, loss = 5.35493116
Validation score: 0.509059
Iteration 7, loss = 4.69862046
Validation score: 0.541204
Iteration 8, loss = 4.13840602
Validation score: 0.541788
Iteration 9, loss = 3.56103418
Validation score: 0.531853
Iteration 10, loss = 3.06816381
Validation score: 0.551140
Iteration 11, loss = 2.57990222
Validation score: 0.542373
Iteration 12, loss = 2.18571266
Validation score: 0.555231
Iteration 13, loss = 1.83785760
Validation score: 0.560491
Iteration 14, loss = 1.57701640
Validation score: 0.549386
Iteration 15, loss = 1.30962439
Validation score: 0.556400
Iteration 16, loss = 1.14745803
Validation score: 0.555815
Validation score did not improve more than tol=0.000100 for two 

In [43]:
# Collect the parameters of the pipeline including those of its nested steps, and show them
clf_params = text_clf.get_params(deep=True)
print(clf_params)

{'memory': None, 'steps': [('vect', D2VTransformer(alpha=0.025, batch_words=10000, cbow_mean=1, comment=None,
        dbow_words=0, dm=0, dm_concat=0, dm_mean=None, dm_tag_count=1,
        docvecs=None, docvecs_mapfile=None,
        hashfxn=<built-in function hash>, hs=0, iter=20,
        max_vocab_size=None, min_alpha=0.0001, min_count=4, negative=5,
        sample=0, seed=1, size=100, sorted_vocab=1, trim_rule=None,
        window=10, workers=3)), ('clf', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(4096, 1024), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=True,
       warm_start=False))], 'vect': D2VTransformer(alpha=0.025, batch_words=10000, cbow_mean=1, comment=None,
        dbow_words

In [44]:
# Training duration in minutes
print(processing_time)

9.418157331148784


In [45]:
# Predict the labels of the test texts with the fitted pipeline
predicted = text_clf.predict(X_test)
# Disabled alternative: predict_proba instead of hard label decisions
#predicted_proba = text_clf.predict_proba(X_test)

In [46]:
# Precision measures how relevant the returned labels are; computed per test sample and averaged
from sklearn.metrics import precision_score
precision = precision_score(y_true=encoded_y_test, y_pred=predicted, average='samples')
print(precision)

0.8217658127545926


  'precision', 'predicted', average, warn_for)


In [47]:
# Recall measures how many of the truly relevant labels are returned; computed per test sample and averaged
from sklearn.metrics import recall_score
recall = recall_score(y_true=encoded_y_test, y_pred=predicted, average='samples')
print(recall)

0.7562095863463324


In [48]:
# The F1 score is the harmonic mean of precision and recall; computed per test sample and averaged
from sklearn.metrics import f1_score
f1 = f1_score(y_true=encoded_y_test, y_pred=predicted, average='samples')
print(f1)

0.7706134706298624


  'precision', 'predicted', average, warn_for)


In [49]:
# Directory that receives the parameter/score report.
# exist_ok=True keeps the cell safe to re-run and avoids the race between a
# separate os.path.exists() check and the directory creation.
output = '../MLP/gensim_klein'
os.makedirs(output, exist_ok=True)

In [50]:
# Append the model configuration, the training statistics and the scores to the report file
with open(output+'/MLP_gensim_all_labels_params.txt',"a", encoding="utf8") as params:
    mlp_step = text_clf.named_steps.clf
    params.writelines([
        "\n*********************************************************************************************",
        "\nParameters for classification with MLP and vectorization in gensim (all labels):",
        "\n*********************************************************************************************",
        # Full parameter representation of both pipeline steps
        "\n%s" % (text_clf.named_steps.vect,),
        "\n%s" % (mlp_step,),
        # Attributes of the fitted classifier
        "\nclasses: %s" % (mlp_step.n_outputs_,),
        "\nlayers: %s" % (mlp_step.n_layers_,),
        "\nactivation function output layer: %s" % (mlp_step.out_activation_,),
        "\nepochs: %s" % (mlp_step.n_iter_,),
        "\nprocessing time: %s" % (processing_time,),
        # Evaluation results on the test set
        "\nSCORES:",
        "\nprecision: %s" % (precision,),
        "\nrecall: %s" % (recall,),
        "\nf1-score: %s" % (f1,),
        "\n",
    ])

Speicherung der vektorisierten Daten

In [51]:
# Filenames without their '.txt' extension
z_train, z_test = (
    [name.replace('.txt', '') for name in names] for names in (z_train, z_test)
)
# Turn every '_' into '.hypotheses.org/' (e.g. 'nummer_212' -> 'nummer.hypotheses.org/212')
ident_train, ident_test = (
    [stem.replace('_', '.hypotheses.org/') for stem in stems] for stems in (z_train, z_test)
)

print(len(ident_train))
print(ident_train[0])

17109
nummer.hypotheses.org/212


In [52]:
# Turn the tokenized texts of both splits into document vectors with the vectorizer used in the pipeline
train_vect = vectorizer.transform(X_train)
test_vect = vectorizer.transform(X_test)

# Shape and container type of the training vectors, one per line
print(train_vect.shape, type(train_vect), sep='\n')

(17109, 100)
<class 'numpy.ndarray'>


In [53]:
# Convert the vectorized text data to CSR sparse matrices.
# The notebook only imports `scipy.sparse`; the bare name `sparse` used here
# before is never imported and raises NameError on a fresh kernel, so the
# constructor is addressed through its fully qualified name.
train_matrix = scipy.sparse.csr_matrix(train_vect)
test_matrix = scipy.sparse.csr_matrix(test_vect)

train_matrix

<17109x100 sparse matrix of type '<class 'numpy.float32'>'
	with 1710900 stored elements in Compressed Sparse Row format>

In [54]:
# Persist the vectorized texts as .npz files and write one CSV index per split
# that lists identifier, labels and the matrix file the vectors were stored in
csv_header = ["url", "classes", "filename"]

# --- training split ---
# store the vectorized texts
output_file_train = 'Datasets/all_labels_train_gensim_sparse_matrix.npz'
scipy.sparse.save_npz('../'+output_file_train, train_matrix)

# store identifiers and labels
with open('../Datasets/all_labels_gensim_train_idents_labels.csv', 'w', newline='', encoding="utf-8") as traincsv:
    train_writer = csv.writer(traincsv, delimiter=";")
    train_writer.writerow(csv_header)
    train_writer.writerows(
        [ident, ", ".join(labels), output_file_train]
        for ident, labels in zip(ident_train, y_train)
    )

# --- test split ---
# store the vectorized texts
output_file_test = 'Datasets/all_labels_test_gensim_sparse_matrix.npz'
scipy.sparse.save_npz('../'+output_file_test, test_matrix)

# store identifiers and labels
with open('../Datasets/all_labels_gensim_test_idents_labels.csv', 'w', newline='', encoding="utf-8") as testcsv:
    test_writer = csv.writer(testcsv, delimiter=";")
    test_writer.writerow(csv_header)
    test_writer.writerows(
        [ident, ", ".join(labels), output_file_test]
        for ident, labels in zip(ident_test, y_test)
    )