# Textklassifikation mit Bag-of-Words-Vektorisierung in tf-idf-Repräsentation und kNN-Algorithmus 
Labels (Themen und Disziplinen) sind nicht reduziert (all_labels)

Autorin: Maria Hartmann

In [1]:
# Imports
import os
import time

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer # module to transform a count matrix to a normalized tf-idf representation
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score # evaluation metrics used by the scoring cells
from sklearn.model_selection import RandomizedSearchCV # module for parameter optimization
from sklearn.neighbors import KNeighborsClassifier # k-nearest neighbors classifier (supports multi-label classification)
from sklearn.pipeline import Pipeline # assembles transformers
from sklearn.preprocessing import MultiLabelBinarizer # module to one-hot-encode the labels

# Seed NumPy's global random number generator with a fixed value so that
# anything drawing from np.random produces the same numbers on every run.
np.random.seed(7) # fix random seed for reproducibility

Einlesen des Trainings- und Testdatensatzes

In [2]:
# Locations of the semicolon-separated training and test sets.
trainset = '../Datasets/all_labels_trainset.csv'
testset = '../Datasets/all_labels_testset.csv'

# Read both files up front; each provides the columns 'text', 'classes'
# and 'filename' that are pulled out below.
trainset_csv = pd.read_csv(trainset, delimiter=';')
testset_csv = pd.read_csv(testset, delimiter=';')

# X_* holds the blog texts, z_* the file names of the blog posts.
X_train, z_train = (trainset_csv[column].values for column in ('text', 'filename'))
X_test, z_test = (testset_csv[column].values for column in ('text', 'filename'))

# The 'classes' column stores all labels of a post as one ', '-separated
# string; split it into one list of labels per blog post.
y_train = [labels.split(', ') for labels in trainset_csv['classes'].values]
y_test = [labels.split(', ') for labels in testset_csv['classes'].values]

In [3]:
# Show the first training sample: file name, label list and text, in that order.
for field in (z_train, y_train, X_train):
    print(field[0])

nummer_212.txt
['histoire_d', "sciences de l'information et de la communication_d", 'bibliothéconomie_d', 'histoire_t', 'histoire intellectuelle_t', 'histoire et sociologie des médias_t', 'histoire culturelle_t']
die gemälde der habsburgischen sammlungen zu wien wurden von der stallburg ins belvedere transferiert und dort von christian von mechel neu angeordnet und aufgehängt


k-hot-Kodierung der Labels

In [4]:
# k-hot encode the label lists with MultiLabelBinarizer
label_encoder = MultiLabelBinarizer()
# Fit the encoder on the training labels and encode them in one step.
encoded_y_train = label_encoder.fit_transform(y_train)
# Encode the test labels with the encoder fitted on the training labels.
encoded_y_test = label_encoder.transform(y_test)
# Show the encoded label vector of the first training sample.
print(encoded_y_train[0])

[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0
 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]


In [5]:
# Report how many distinct labels the encoder learned, then list every
# label together with its position in label_encoder.classes_.
label_names = label_encoder.classes_
print(len(label_names))
for position in range(len(label_names)):
    print(position, label_names[position])

114
0 1914-1918_t
1 1918-1939_t
2 1939-1945_t
3 1945-1989_t
4 administration publique et développement_d
5 anthropologie politique_t
6 approches de corpus_t
7 archives_t
8 archéologie_d
9 arts et humanités_d
10 arts_d
11 asie_t
12 bas moyen âge_t
13 bibliothéconomie_d
14 biomédecine_d
15 chine_t
16 communication_d
17 conflits_t
18 digital humanities_t
19 enquêtes_t
20 europe centrale et orientale_t
21 europe_t
22 france_t
23 guerres_t
24 haut moyen âge_t
25 histoire culturelle_t
26 histoire de l'art_t
27 histoire des religions_t
28 histoire des sciences sociales_d
29 histoire des sciences_t
30 histoire du droit_t
31 histoire et archéologie_d
32 histoire et philosophie des sciences_d
33 histoire et sociologie des médias_t
34 histoire industrielle_t
35 histoire intellectuelle_t
36 histoire politique_t
37 histoire sociale_t
38 histoire urbaine_t
39 histoire économique_t
40 histoire_d
41 histoire_t
42 historiographie_t
43 humanités pluridisciplinaires_d
44 information_t
45 langage_t
46 lan

Vektorisierung und Klassifikation der Daten mit scikit-learn

In [88]:
# Classification pipeline configured with the parameter values the author
# selected from the randomized search: word n-gram counts -> tf-idf
# weighting -> distance-weighted k-nearest-neighbours.
pipeline_steps = [
    ('vect', CountVectorizer(ngram_range=(1,4), max_df=0.9, min_df=0.0)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', KNeighborsClassifier(n_neighbors=6, weights='distance')),
]
text_clf = Pipeline(steps=pipeline_steps)


In [89]:
# Disabled alternative pipeline configuration (unigrams, max_features=100000,
# n_neighbors=1). It is wrapped in a string literal so it is never executed;
# as the last expression of the cell, the string itself is echoed as output.
"""# best params from randomized search
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), max_features=100000)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', KNeighborsClassifier(n_neighbors=1, weights='distance')),
                    ])"""

"# best params from randomized search\ntext_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), max_features=100000)),\n                     ('tfidf', TfidfTransformer(use_idf=True)),\n                     ('clf', KNeighborsClassifier(n_neighbors=1, weights='distance')),\n                    ])"

In [90]:
# Fit the complete pipeline on the training texts and their k-hot labels,
# and keep the wall-clock duration of the fit in minutes.
fit_started = time.time()
text_clf = text_clf.fit(X_train, encoded_y_train)
fit_seconds = time.time() - fit_started
processing_time = fit_seconds / 60

In [91]:
# predict
# Run the fitted pipeline on the test texts; the result is compared against
# encoded_y_test by the metric cells that follow.
predicted = text_clf.predict(X_test)
#predicted_proba = text_clf.predict_proba(X_test)

In [92]:
# Collect the parameters of the pipeline and of each of its steps, and print
# them so the exact configuration of this run is documented in the output.
clf_params = text_clf.get_params()
print(clf_params)

{'memory': None, 'steps': [('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=0.0,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='distance'))], 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=0.0,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents=None, token_pat

In [93]:
# Precision is a measure of result relevancy: the share of predicted labels
# that are correct. average='samples' computes it per test document and then
# averages over all documents.
# precision_score comes from the notebook's import cell.
precision = precision_score(encoded_y_test, predicted, average='samples')
print(precision)

0.8106462929569521


  'precision', 'predicted', average, warn_for)


In [94]:
# Recall is a measure of how many truly relevant results are returned: the
# share of true labels that were predicted. average='samples' computes it
# per test document and then averages over all documents.
# recall_score comes from the notebook's import cell.
recall = recall_score(encoded_y_test, predicted, average='samples')
print(recall)

0.7169638008347685


In [95]:
# The F1 score is the harmonic mean of precision and recall.
# average='samples' computes it per test document and then averages over
# all documents.
# f1_score comes from the notebook's import cell.
f1 = f1_score(encoded_y_test, predicted, average='samples')
print(f1)

0.7382758407814421


  'precision', 'predicted', average, warn_for)


In [96]:
# write corpus specific stopwords to file

# Terms the fitted CountVectorizer ignored; scikit-learn exposes them as a
# set on the fitted vectorizer.
stopwords = text_clf.named_steps.vect.stop_words_
print(len(stopwords))
#print(stopwords)
# Sort before writing: the iteration order of a set of strings is not stable
# between interpreter runs, so sorting keeps the file reproducible.
# Mode "w" is sufficient because the file is only written, never read back here.
with open('../Preprocessing/filtered_words.txt', "w", encoding="utf8") as stops:
    stops.writelines('%s\n' % element for element in sorted(stopwords))

3


In [87]:
# Disabled variant of the stop-word export that targets
# 'filtered_words_max_features.txt'. It is wrapped in a string literal so it
# is not executed when the cell runs.
"""# write corpus specific stopwords to file 

stopwords = text_clf.named_steps.vect.stop_words_
print(len(stopwords))
#print(stopwords)
with open('../Preprocessing/filtered_words_max_features.txt',"w+", encoding="utf8") as stops:
    for element in stopwords:
        stops.write(element)
        stops.write('\n')"""

179819


Parameteroptimierung mit Zufallssuche (RandomizedSearchCV)

In [6]:
# Base pipeline for the hyper-parameter search: count vectorizer, tf-idf
# weighting and kNN classifier, each created with its default settings.
search_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
]
clf = Pipeline(steps=search_steps)

In [25]:
# parameter tuning with RandomizedSearchCV

# Load the corpus-specific stop words, one per line. The context manager
# closes the file handle deterministically instead of leaking it.
# The list is only needed when the 'vect__stop_words' entry below is enabled.
with open('../Preprocessing/filtered_words.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = stopword_file.read().splitlines()

# Search space: keys follow the '<pipeline step>__<parameter>' convention,
# values are the candidates the randomized search samples from.
parameters = {'vect__ngram_range': [(1, 1), (1,2),(1,3),(1,4)],
              'vect__max_df' : (0.7, 0.8, 0.9, 1.0),
              'vect__min_df' : (0.0, 0.01, 0.05, 0.1),
              #'vect__stop_words' : (stopwords, None),
              #'vect__max_features': (100000,50000,25000,10000,7500,5000,2500,1000,500,300,100), #200000, 50
              'tfidf__use_idf': (True, False),
              'clf__n_neighbors': list(range(1,10,1)),
              'clf__weights' : ('distance', 'uniform')
             }

In [26]:
# Randomized search over `parameters`: 50 sampled candidates, 3-fold
# cross-validation each, run sequentially with verbose progress output.
search_settings = dict(n_jobs=1, verbose=10, random_state=1,
                       return_train_score=True, cv=3, n_iter=50)
rs_clf = RandomizedSearchCV(clf, parameters, **search_settings)

# Fit the search and keep its wall-clock duration in minutes.
search_started = time.time()
rs_clf = rs_clf.fit(X_train, encoded_y_train)
search_seconds = time.time() - search_started
rs_processing_time = search_seconds / 60

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] vect__stop_words=None, vect__ngram_range=(1, 3), vect__max_features=25000, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=8 
[CV]  vect__stop_words=None, vect__ngram_range=(1, 3), vect__max_features=25000, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=8, score=0.3266701735928459, total=  51.1s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s


[CV] vect__stop_words=None, vect__ngram_range=(1, 3), vect__max_features=25000, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=8 
[CV]  vect__stop_words=None, vect__ngram_range=(1, 3), vect__max_features=25000, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=8, score=0.3357881816587761, total=  51.4s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.5min remaining:    0.0s


[CV] vect__stop_words=None, vect__ngram_range=(1, 3), vect__max_features=25000, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=8 
[CV]  vect__stop_words=None, vect__ngram_range=(1, 3), vect__max_features=25000, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=8, score=0.1704366123093109, total=  51.7s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.8min remaining:    0.0s


[CV] vect__stop_words=None, vect__ngram_range=(1, 2), vect__max_features=10000, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=5 
[CV]  vect__stop_words=None, vect__ngram_range=(1, 2), vect__max_features=10000, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=5, score=0.08995265649658074, total=  23.6s


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  4.5min remaining:    0.0s


[CV] vect__stop_words=None, vect__ngram_range=(1, 2), vect__max_features=10000, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=5 
[CV]  vect__stop_words=None, vect__ngram_range=(1, 2), vect__max_features=10000, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=5, score=0.1707873049272313, total=  25.5s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.3min remaining:    0.0s


[CV] vect__stop_words=None, vect__ngram_range=(1, 2), vect__max_features=10000, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=5 
[CV]  vect__stop_words=None, vect__ngram_range=(1, 2), vect__max_features=10000, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=5, score=0.09468700683850605, total=  25.8s
[CV] vect__stop_words=['die', 'und', 'der'], vect__ngram_range=(1, 1), vect__max_features=500, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=5 

[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  6.1min remaining:    0.0s



[CV]  vect__stop_words=['die', 'und', 'der'], vect__ngram_range=(1, 1), vect__max_features=500, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=5, score=0.2765211292302297, total=  11.4s
[CV] vect__stop_words=['die', 'und', 'der'], vect__ngram_range=(1, 1), vect__max_features=500, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=5 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  6.5min remaining:    0.0s


[CV]  vect__stop_words=['die', 'und', 'der'], vect__ngram_range=(1, 1), vect__max_features=500, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=5, score=0.27406628090478696, total=  11.4s
[CV] vect__stop_words=['die', 'und', 'der'], vect__ngram_range=(1, 1), vect__max_features=500, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=5 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  6.9min remaining:    0.0s


[CV]  vect__stop_words=['die', 'und', 'der'], vect__ngram_range=(1, 1), vect__max_features=500, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=5, score=0.26968262318078207, total=  10.4s
[CV] vect__stop_words=None, vect__ngram_range=(1, 2), vect__max_features=7500, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=9 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  7.3min remaining:    0.0s


[CV]  vect__stop_words=None, vect__ngram_range=(1, 2), vect__max_features=7500, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=9, score=0.3557776608802385, total=  25.2s
[CV] vect__stop_words=None, vect__ngram_range=(1, 2), vect__max_features=7500, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=9 
[CV]  vect__stop_words=None, vect__ngram_range=(1, 2), vect__max_features=7500, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=9, score=0.3624408206207259, total=  25.7s
[CV] vect__stop_words=None, vect__ngram_range=(1, 2), vect__max_features=7500, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=9 
[CV]  vect__stop_words=None, vect__ngram_range=(1, 2), vect__max_features=7500, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=9, score=0.30738207960722425, total=  25.3s
[CV] vect__stop_words=None, vect__ngram_range=(1, 1), vect__max_features=100, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=7 
[CV]  vect__stop_words=

[CV]  vect__stop_words=None, vect__ngram_range=(1, 4), vect__max_features=300, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=3, score=0.26354550236717517, total= 1.2min
[CV] vect__stop_words=None, vect__ngram_range=(1, 4), vect__max_features=300, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=3 
[CV]  vect__stop_words=None, vect__ngram_range=(1, 4), vect__max_features=300, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=3, score=0.272312817815185, total= 1.1min
[CV] vect__stop_words=None, vect__ngram_range=(1, 4), vect__max_features=300, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=3 
[CV]  vect__stop_words=None, vect__ngram_range=(1, 4), vect__max_features=300, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=3, score=0.2689812379449413, total= 1.2min
[CV] vect__stop_words=['die', 'und', 'der'], vect__ngram_range=(1, 4), vect__max_features=2500, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=1 
[CV]  v

[CV]  vect__stop_words=None, vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=8, score=0.33210590917061195, total=  46.6s
[CV] vect__stop_words=None, vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=8 
[CV]  vect__stop_words=None, vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=8, score=0.3340347185691741, total= 1.7min
[CV] vect__stop_words=None, vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=8 
[CV]  vect__stop_words=None, vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=8, score=0.13904962300543575, total=  48.2s
[CV] vect__stop_words=['die', 'und', 'der'], vect__ngram_range=(1, 1), vect__max_features=5000, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=5 
[

[CV] vect__stop_words=None, vect__ngram_range=(1, 2), vect__max_features=50000, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=1 
[CV]  vect__stop_words=None, vect__ngram_range=(1, 2), vect__max_features=50000, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=1, score=0.2907241802560056, total=  26.5s
[CV] vect__stop_words=None, vect__ngram_range=(1, 2), vect__max_features=50000, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=1 
[CV]  vect__stop_words=None, vect__ngram_range=(1, 2), vect__max_features=50000, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=1, score=0.3070313869893039, total=  48.8s
[CV] vect__stop_words=['die', 'und', 'der'], vect__ngram_range=(1, 2), vect__max_features=5000, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=8 
[CV]  vect__stop_words=['die', 'und', 'der'], vect__ngram_range=(1, 2), vect__max_features=5000, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=8, score=0.31386989303

[CV]  vect__stop_words=None, vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1, score=0.11853410485709276, total=  45.6s
[CV] vect__stop_words=None, vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1 
[CV]  vect__stop_words=None, vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1, score=0.12449587936173943, total=  48.2s
[CV] vect__stop_words=['die', 'und', 'der'], vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=8 
[CV]  vect__stop_words=['die', 'und', 'der'], vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=8, score=0.23566543924250394, total=  46.0s
[CV] vect__stop_words=['die', 'und', 'der'], vect__ngram_range=(1, 3), vect__max_features=50000, tfidf__use_idf=False,

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 164.6min finished


In [28]:
# Best cross-validation score found by the randomized search.
# NOTE(review): no `scoring` argument was passed to RandomizedSearchCV, so
# this is presumably the pipeline's default score — confirm which metric.
best_score = rs_clf.best_score_
print(best_score)

0.4357940265357414


In [29]:
# Parameter combination that achieved the best score in the randomized search.
best_params = rs_clf.best_params_
print(best_params)

{'vect__stop_words': None, 'vect__ngram_range': (1, 2), 'vect__max_features': 100000, 'tfidf__use_idf': True, 'clf__weights': 'distance', 'clf__n_neighbors': 1}


In [30]:
# predict 
# Predict the labels of the test texts through the fitted search object.
rs_predicted = rs_clf.predict(X_test)
#print(predicted)

In [31]:
# Precision is a measure of result relevancy: the share of predicted labels
# that are correct, computed per test document and averaged over all
# documents (average='samples').
# precision_score comes from the notebook's import cell.
rs_precision = precision_score(encoded_y_test, rs_predicted, average='samples')
print(rs_precision)

0.7074306984896046


In [32]:
# Recall is a measure of how many truly relevant results are returned: the
# share of true labels that were predicted, computed per test document and
# averaged over all documents (average='samples').
# recall_score comes from the notebook's import cell.
rs_recall = recall_score(encoded_y_test, rs_predicted, average='samples')
print(rs_recall)

0.7131792866891044


In [33]:
# The F1 score is the harmonic mean of precision and recall, computed per
# test document and averaged over all documents (average='samples').
# f1_score comes from the notebook's import cell.
rs_f1 = f1_score(encoded_y_test, rs_predicted, average='samples')
print(rs_f1)

0.7065274884297654


In [34]:
# Per-label precision, recall, F1 score and support for the predictions of
# the randomized search. classification_report comes from the notebook's
# import cell.
print(classification_report(encoded_y_test, rs_predicted))

             precision    recall  f1-score   support

          0       0.92      0.87      0.89       674
          1       0.05      0.67      0.09        24
          2       0.10      0.64      0.17        58
          3       0.49      0.67      0.57        49
          4       0.70      0.64      0.67        25
          5       0.59      0.68      0.63        25
          6       0.85      0.70      0.77      1412
          7       0.85      0.70      0.77      1412
          8       0.65      0.67      0.66        45
          9       0.54      0.59      0.56       389
         10       0.85      0.63      0.72        27
         11       0.92      0.67      0.78        67
         12       0.45      0.59      0.51       212
         13       0.63      0.39      0.48        85
         14       0.88      0.67      0.76        42
         15       0.65      0.37      0.47        30
         16       0.56      0.54      0.55       170
         17       0.71      0.69      0.70   

Ergebnisse in Dateien speichern

In [25]:
# Directory that receives all result files of this notebook.
output = '../kNN'
# exist_ok=True makes the call a no-op when the directory already exists and
# avoids the race between a separate existence check and the creation.
os.makedirs(output, exist_ok=True)

# Minute-resolution timestamp that is embedded in the result file names.
timestamp = time.strftime('%Y-%m-%d_%H.%M')

In [26]:
# write real labels and predictions to file

# Map the predicted k-hot rows back to label names and show the first
# prediction next to its true labels as a quick sanity check.
inverse_prediction = label_encoder.inverse_transform(rs_predicted)
print('PREDICTED:')
print(inverse_prediction[0])
print('TRUE:')
print(y_test[0])

predictions_path = output+'/Blogs_all_labels_predictions_%s.txt' % timestamp
with open(predictions_path, "w+", encoding="utf8") as preds:
    preds.write("Predictions from classification with k-nearest-neighbors (all labels):\n\n")
    # One record per test document: file name, sorted true labels, sorted
    # predicted labels (every label followed by ', '), then a separator line.
    for ident, true_labels, predicted_labels in zip(z_test, y_test, inverse_prediction):
        true_part = ''.join('%s, ' % name for name in sorted(true_labels))
        pred_part = ''.join('%s, ' % name for name in sorted(predicted_labels))
        preds.write(''.join((ident, '\n',
                             'TRUE: ', true_part, '\n',
                             'PRED: ', pred_part, '\n',
                             '\n*********************\n')))
    

PREDICTED:
('approches de corpus_t', 'archives_t', 'enquêtes_t', 'histoire et archéologie_d', 'histoire_t', 'sciences sociales interdisciplinaires_d')
TRUE:
['histoire et archéologie_d', 'sciences sociales interdisciplinaires_d', 'histoire_t', 'approches de corpus_t', 'enquêtes_t', 'archives_t']


In [27]:
# write parameters and scores to file

params_path = output+'/kNN_all_labels_params_%s.txt' % timestamp
with open(params_path, "w+", encoding="utf8") as params:
    # Header and search duration first, then one "name: value" line per best
    # parameter, then the cross-validation score and the test-set metrics.
    report_lines = [
        "Parameters for classification with k-nearest-neighbors from randomized search (all labels):",
        "\nprocessing_time: %s" % rs_processing_time,
    ]
    report_lines.extend("\n%s: %s" % (key, value) for key, value in best_params.items())
    report_lines.extend([
        "\nbest_score: %s" % best_score,
        "\nprecision: %s" % rs_precision,
        "\nrecall: %s" % rs_recall,
        "\nf1-score: %s" % rs_f1,
    ])
    params.writelines(report_lines)

In [28]:
# Turn the cross-validation results of the randomized search into a table,
# print it and store it as a CSV file in the output directory.
results = rs_clf.cv_results_
df = pd.DataFrame(results)
print(df)
results_path = output+'/kNN_all_labels_rs_results_%s.csv' % timestamp
df.to_csv(results_path, encoding='utf-8')

    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        4.175122      0.127517         6.630511        0.161892   
1       53.124664      1.375119        13.902534        0.374614   
2       13.104774      0.184638         7.654144        0.056771   
3        3.570384      0.012411         5.656494        0.066434   
4       28.625589      0.128469         8.433493        0.106650   
5       46.419061      0.162230         8.640185        0.048638   
6        3.578746      0.099932         5.434251        0.077464   
7       13.146901      0.038702         7.575019        0.038552   
8       28.534684      0.476551         8.889240        0.049896   
9       45.575807      0.246583         8.605586        0.128716   
10      48.056591      0.169584        12.329216        0.235883   
11      44.700165      0.677451         9.297399        0.092742   
12      13.450645      0.158506         8.595414        0.083028   
13       3.661496      0.251958         5.614534