# Textklassifikation mit Bag-of-Words-Vektorisierung in tf-idf-Repräsentation und kNN-Algorithmus 
Labels (Themen und Disziplinen) sind auf höchste Hierarchieebene reduziert

Autorin: Maria Hartmann

In [1]:
# Imports
import os
import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer # module to one-hot-encode the labels
from sklearn.pipeline import Pipeline # assemples transormers 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer # module to transform a count matrix to a normalized tf-idf representation
from sklearn.neighbors import KNeighborsClassifier # k-nearest neighbors classifier (supports multi-label classification)
from sklearn.model_selection import RandomizedSearchCV # module for paramter optimization

np.random.seed(7) # fix random seed for reproducibility

Einlesen des Trainings- und Testdatensatzes

In [2]:
trainset = '../Datasets/reduced_labels_trainset.csv' 
testset = '../Datasets/reduced_labels_testset.csv' 

trainset_csv = pd.read_csv(trainset, delimiter=';')
X_train = trainset_csv['text'].values
y_train = trainset_csv['classes'].values
z_train = trainset_csv['filename'].values

testset_csv = pd.read_csv(testset, delimiter=';')
X_test = testset_csv['text'].values
y_test = testset_csv['classes'].values
z_test = testset_csv['filename'].values

# Splitten der Labels pro Blogbeitrag
y_train = [e.split(', ') for e in y_train]
y_test = [e.split(', ') for e in y_test]

In [3]:
print(z_train[0])
print(y_train[0])
print(X_train[0])

archivalia_575.txt
['pluridisciplinarité_d', 'épistémologie et méthodes_t', 'histoire_t', 'histoire et archéologie_d']
diese titelformulierung der hab in ihrer handschriftendatenbank ist besonders sinnreich http diglib hab de db mss list ms id aug f mit digitalisat ich möchte nicht wissen wieviele forscher sich auf die fehlanzeige des verlinkten opacs der grundsätzlich nichts zu den handschriften ausspuckt während die ältere dokumentation funktioniert verlassen und so unnötig rechercheaufwand betreiben müssen kürzt man die signatur findet man den hinweis auf die münchner schedel ausstellung welt des wissens bevor ich das tat erkannte ich hartmann schedels ziemlich unverwechselbare schriftzüge und bemerkte bei einem blick auf die münchner digitalisate den typischen signaturzettel auf dem titel das bemerkenswerte familienbuch der nürnberger familie grabner erscheint in der liste von schedels büchern bei stauber https archive org stream dieschedelscheb hartgoog page n mode up die handschr

k-hot-Kodierung der Labels

In [4]:
# k-hot-encode labels mit MultiLabelBinarizer
label_encoder = MultiLabelBinarizer()
encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_test = label_encoder.transform(y_test)
print(encoded_y_train[0])

[0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]


In [5]:
print(len(label_encoder.classes_))
for i, element in enumerate(label_encoder.classes_):
    print(i, element)

35
0 administration publique et développement_d
1 anthropologie_t
2 arts et humanités_d
3 asie_t
4 bibliothéconomie_d
5 droit_t
6 ethnologie_t
7 europe_t
8 géographie_t
9 histoire et archéologie_d
10 histoire_t
11 information_t
12 langage_t
13 langue et linguistique_d
14 littérature_d
15 moyen âge_t
16 pensée_t
17 pluridisciplinarité_d
18 psychisme_t
19 psychologie_d
20 religions_t
21 représentations_t
22 sciences de l'information et de la communication_d
23 sciences de la santé et de la santé publique_d
24 sciences politiques_d
25 sociologie et anthropologie_d
26 sociologie_t
27 travail social et politique sociale_d
28 éducation_d
29 éducation_t
30 épistémologie et méthodes_t
31 époque contemporaine_t
32 époque moderne_t
33 études des sciences_t
34 études du politique_t


Vektorisierung und Klassifikation der Daten mit scikit-learn

In [14]:
# best params from randomized search
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(2,2), max_df=0.9, min_df=0.01)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', KNeighborsClassifier(n_neighbors=6, weights='distance')),
                   ])

In [15]:
# train
start = time.time()
text_clf = text_clf.fit(X_train, encoded_y_train)
processing_time = (time.time() - start) / 60

In [16]:
# predict
predicted = text_clf.predict(X_test)
#predicted_proba = text_clf.predict_proba(X_test)

In [17]:
clf_params = text_clf.get_params()
print(clf_params)

{'memory': None, 'steps': [('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=0.01,
        ngram_range=(2, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='distance'))], 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=0.01,
        ngram_range=(2, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_p

In [18]:
# precision is a measure of result relevancy
from sklearn.metrics import precision_score
precision = precision_score(encoded_y_test, predicted, average='samples')
print(precision)

0.6687134063536588


  'precision', 'predicted', average, warn_for)


In [19]:
# recall is a measure of how many truly relevant results are returned
from sklearn.metrics import recall_score
recall = recall_score(encoded_y_test, predicted, average='samples')  
print(recall)

0.4638039211321119


In [20]:
# F1 score is a weighted average of the precision and recall
from sklearn.metrics import f1_score
f1 = f1_score(encoded_y_test, predicted, average='samples') 
print(f1)

0.5214151310224241


  'precision', 'predicted', average, warn_for)


In [21]:
# write corpus specific stopwords to file 

stopwords = text_clf.named_steps.vect.stop_words_
print(len(stopwords))
#print(stopwords)
with open('../Preprocessing/filtered_words_reduced_labels.txt',"w+", encoding="utf8") as stops:
    for element in stopwords:
        stops.write(element)
        stops.write('\n')

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Parameteroptimierung mit Rastersuche (RandomizedSearch)

In [6]:
clf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', KNeighborsClassifier())
               ])

In [7]:
# parameter tuning with RandomSearch
parameters = {'vect__ngram_range': [(1,1),(1,2),(1,3),(1,4),(2,2),(3,3)], 
              'vect__max_df' : (0.8, 0.9, 1.0), 
              'vect__min_df' : (0.0, 0.01, 0.05,),
              'tfidf__use_idf': (True, False),
              'clf__n_neighbors': list(range(1,10,1)),
              'clf__weights' : ('distance', 'uniform')
             }

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
clf = Pipeline([#('vect', CountVectorizer()),
                ('tfidf', TfidfVectorizer()),
                ('clf', KNeighborsClassifier()),
                ])

# parameter tuning with RandomSearch
parameters = {'tfidf__ngram_range': [(1,1),(1,2),(1,3),(1,4)], 
              'tfidf__max_df' : (0.7, 0.8, 0.9, 1.0), 
              'tfidf__min_df' : (0.0, 0.01, 0.05, 0.1),
              'tfidf__use_idf': (True, False),
              'clf__n_neighbors': list(range(1,10,1)),
              'clf__weights' : ('distance', 'uniform')
             }

In [9]:
# train
rs_clf = RandomizedSearchCV(clf, parameters, n_jobs=1, verbose=1, random_state=1, return_train_score=True, cv=3, n_iter=50)
start = time.time()
rs_clf = rs_clf.fit(X_train, encoded_y_train)
rs_processing_time = (time.time() - start) / 60

Fitting 3 folds for each of 50 candidates, totalling 150 fits


KeyboardInterrupt: 

In [17]:
best_score = rs_clf.best_score_
print(best_score)

0.6405985154012508


In [18]:
best_params = rs_clf.best_params_
print(best_params)

{'vect__ngram_range': (1, 1), 'vect__min_df': 0.0, 'vect__max_df': 0.8, 'tfidf__use_idf': True, 'clf__weights': 'uniform', 'clf__n_neighbors': 1}


In [19]:
# predict 
rs_predicted = rs_clf.predict(X_test)
#print(predicted)

In [20]:
# precision is a measure of result relevancy
from sklearn.metrics import precision_score
rs_precision = precision_score(encoded_y_test, rs_predicted, average='samples')
print(rs_precision)
#0.8579

0.7985712245003971


In [21]:
# recall is a measure of how many truly relevant results are returned
from sklearn.metrics import recall_score
rs_recall = recall_score(encoded_y_test, rs_predicted, average='samples')  
print(rs_recall)

0.7983755945887783


In [22]:
# F1 score is a weighted average of the precision and recall
from sklearn.metrics import f1_score
rs_f1 = f1_score(encoded_y_test, rs_predicted, average='samples') 
print(rs_f1)

0.7925753029890477


Ergebnisse in Dateien speichern

In [26]:
output = '../kNN'
if not os.path.exists(output):
    os.makedirs(output)
    
timestamp = time.strftime('%Y-%m-%d_%H.%M')

In [27]:
# write real labels and predictions to file

inverse_prediction = label_encoder.inverse_transform(rs_predicted)
print('PREDICTED:')
print(inverse_prediction[0])
print('TRUE:')
print(y_test[0])

with open(output+'/kNN_reduced_labels_predictions_%s.txt' % timestamp,"w+", encoding="utf8") as preds:
    preds.write("Predictions from classification with k-nearest-neighbors (reduced labels):\n\n")
    for ident, label, pred in zip(z_test, y_test, inverse_prediction):
        label = sorted(label)
        pred = sorted(pred)
        preds.write(ident)
        preds.write('\n')
        preds.write('TRUE: ')
        for element in label:
            preds.write('%s, ' % element)
        preds.write('\n')
        preds.write('PRED: ')
        for element in pred:
            preds.write('%s, ' % element)
        preds.write('\n')
        preds.write('\n*********************\n')

PREDICTED:
('histoire et archéologie_d', 'histoire_t', 'épistémologie et méthodes_t')
TRUE:
['pluridisciplinarité_d', 'épistémologie et méthodes_t', 'histoire_t', 'histoire et archéologie_d']


In [28]:
# write parameters and scores to file

with open(output+'/kNN_reduced_labels_params_%s.txt' % timestamp,"w+", encoding="utf8") as params:
    params.write("Parameters for classification with k-nearest-neighbors from randomized search (reduced labels):")
    params.write("\nprocessing_time: %s" % rs_processing_time)
    for key, value in best_params.items():
        params.write("\n%s: %s" % (key, value))
    params.write("\nbest_score: %s" % best_score)
    params.write("\nprecision: %s" % rs_precision)
    params.write("\nrecall: %s" % rs_recall)
    params.write("\nf1-score: %s" % rs_f1)

In [29]:
results = rs_clf.cv_results_
df = pd.DataFrame(data=results)
print(df)
df.to_csv(output+'/kNN_reduced_labels_rs_results_%s.csv' % timestamp, encoding='utf-8')

    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        3.557203      0.092697         5.869590        0.052982   
1       82.193439     21.199769        18.953067        7.823778   
2       25.521886      9.640995        13.179605        4.641404   
3        3.394821      0.173413         5.469966        0.147285   
4       32.706812      3.828990        12.022873        5.707268   
5       55.633518     15.551100         8.519456        0.251476   
6        3.315675      0.034034         5.075876        0.042897   
7       13.086049      0.210817         7.391932        0.068449   
8       29.587317      0.899688         8.617346        0.293419   
9       52.819775      4.772485        10.803697        1.691523   
10      64.224142     13.353591        18.167801        8.400733   
11      90.067466     30.682572        15.291221        5.749664   
12      13.986063      0.204836         8.659735        0.147837   
13       3.481476      0.060530         5.613819