# Textklassifikation mit Bag-of-Words-Vektorisierung in tf-idf-Repräsentation und kNN-Algorithmus 
Labels sind auf die Themen reduziert

Autorin: Maria Hartmann

In [1]:
# Imports
import os
import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer # module to one-hot-encode the labels
from sklearn.pipeline import Pipeline # assembles transformers and a final estimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer # module to transform a count matrix to a normalized tf-idf representation
from sklearn.neighbors import KNeighborsClassifier # k-nearest neighbors classifier (supports multi-label classification)
from sklearn.model_selection import RandomizedSearchCV # module for parameter optimization

RANDOM_SEED = 7  # fixed seed so NumPy's global random generator is reproducible
np.random.seed(RANDOM_SEED)

Einlesen des Trainings- und Testdatensatzes

In [2]:
def _read_split(csv_path):
    """Read one ';'-separated dataset file.

    Returns the loaded frame followed by the value arrays of its
    'text', 'classes' and 'filename' columns.
    """
    frame = pd.read_csv(csv_path, delimiter=';')
    return frame, frame['text'].values, frame['classes'].values, frame['filename'].values


trainset = '../Datasets/themes_only_trainset.csv'
testset = '../Datasets/themes_only_testset.csv'

trainset_csv, X_train, y_train, z_train = _read_split(trainset)
testset_csv, X_test, y_test, z_test = _read_split(testset)

# Split the label string of every blog post on ', ' to get one list of labels per post
y_train = list(map(lambda labels: labels.split(', '), y_train))
y_test = list(map(lambda labels: labels.split(', '), y_test))

In [3]:
# Show filename, label list and text of the first training sample (in that order)
for first_item in (z_train[0], y_train[0], X_train[0]):
    print(first_item)

ordensgeschichte_6160.txt
['histoire_t', 'époque moderne_t', 'moyen âge_t']
das colloquium historicum wirsbergense e v chw lädt für freitag und samstag oktober zu einem wissenschaftlichen symposium in den kastenhof weismain lkr lichtenfels ein im mittelpunkt der tagung steht mauritius knauer der vor jahren in der bambergischen stadt weismain geboren wurde er trat in die zisterzienserabtei langheim ein nachdem er während des dreißigjährigen kriegs mehrere jahre in heiligenkreuz und wien verbracht hatte wurde er prior von langheim trat er als abt an die spitze des klosters er starb knauer war universalgelehrter er kämpfte für die rechte seines klosters besaß medizinische kenntnisse schrieb theologische werke und belebte kraftvoll die wirtschaft langheims bis heute ist er als verfasser des hundertjährigen kalenders weithin berühmt bei der tagung sollen leben und wirken knauers in größere zusammenhänge eingeordnet werden programm freitag oktober uhr prof dr günter dippold lichtenfels einfü

k-hot-Kodierung der Labels

In [4]:
# k-hot-encode the label lists with MultiLabelBinarizer.
# The encoder is fitted on the training labels only and then applied to both splits.
label_encoder = MultiLabelBinarizer()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)
print(encoded_y_train[0])

[0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0]


In [5]:
# Print the number of classes known to the encoder, then every class with its column index
class_names = label_encoder.classes_
print(len(class_names))
for position, class_name in enumerate(class_names):
    print(position, class_name)

21
0 anthropologie_t
1 asie_t
2 droit_t
3 ethnologie_t
4 europe_t
5 géographie_t
6 histoire_t
7 information_t
8 langage_t
9 moyen âge_t
10 pensée_t
11 psychisme_t
12 religions_t
13 représentations_t
14 sociologie_t
15 éducation_t
16 épistémologie et méthodes_t
17 époque contemporaine_t
18 époque moderne_t
19 études des sciences_t
20 études du politique_t


Vektorisierung und Klassifikation der Daten mit scikit-learn

In [19]:
# Pipeline configured with the best parameters found by the randomized search:
# word n-gram counts -> tf-idf weighting -> k-nearest-neighbors classifier
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 4), max_df=0.9, min_df=0.0)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', KNeighborsClassifier(n_neighbors=6, weights='distance')),
])

In [20]:
# Train the pipeline on the training texts and measure how long fitting takes.
# time.perf_counter() is used instead of time.time() because it is meant for
# measuring durations and is not affected by adjustments of the system clock.
start = time.perf_counter()
text_clf = text_clf.fit(X_train, encoded_y_train)
processing_time = (time.perf_counter() - start) / 60  # elapsed fitting time in minutes

In [21]:
# Predict the label indicator rows for the test texts with the fitted pipeline
predicted = text_clf.predict(X_test)

In [22]:
# Inspect the full parameter set of the pipeline, including the parameters of every step
clf_params = text_clf.get_params(deep=True)
print(clf_params)

{'memory': None, 'steps': [('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=0.0,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='distance'))], 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=0.0,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents=None, token_pat

In [23]:
# Precision measures how many of the predicted labels are relevant;
# average='samples' computes it per test document and averages over documents
from sklearn.metrics import precision_score
precision = precision_score(y_true=encoded_y_test, y_pred=predicted, average='samples')
print(precision)

0.8472300140252454


In [24]:
# Recall measures how many of the truly relevant labels are returned;
# average='samples' computes it per test document and averages over documents
from sklearn.metrics import recall_score
recall = recall_score(y_true=encoded_y_test, y_pred=predicted, average='samples')
print(recall)

0.7957612591553685


In [25]:
# The F1 score is the harmonic mean of precision and recall;
# average='samples' computes it per test document and averages over documents
from sklearn.metrics import f1_score
f1 = f1_score(y_true=encoded_y_test, y_pred=predicted, average='samples')
print(f1)

0.8063441973329772


Parameteroptimierung mit Zufallssuche (RandomizedSearchCV)

In [6]:
# Pipeline with default step parameters; this is the estimator handed to the parameter search
clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
])

In [7]:
# Search space for the randomized parameter search.
# Keys follow the '<pipeline step>__<parameter>' naming of the pipeline steps.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2), (1, 3), (1, 4)],
    vect__max_df=(0.7, 0.8, 0.9, 1.0),
    vect__min_df=(0.0, 0.01, 0.05, 0.1),
    tfidf__use_idf=(True, False),
    clf__n_neighbors=list(range(1, 10)),
    clf__weights=('distance', 'uniform'),
)

In [8]:
# Run the randomized parameter search: 50 sampled candidates, 3-fold cross-validation each.
rs_clf = RandomizedSearchCV(clf, parameters, n_jobs=1, verbose=10, random_state=1, return_train_score=True, cv=3, n_iter=50)
# The search runs for a long time, so the duration is measured with time.perf_counter(),
# which is meant for measuring durations and is not affected by adjustments of the system clock
# (time.time() is wall-clock time and can jump while the search is running).
start = time.perf_counter()
rs_clf = rs_clf.fit(X_train, encoded_y_train)
rs_processing_time = (time.perf_counter() - start) / 60  # elapsed search time in minutes

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=5 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=5, score=0.3208837453971594, total=  15.7s
[CV] vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   35.3s remaining:    0.0s


[CV]  vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=5, score=0.3259687883570051, total=  14.9s
[CV] vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=5 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.1min remaining:    0.0s


[CV]  vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=5, score=0.31772751183587583, total=  14.9s
[CV] vect__ngram_range=(1, 4), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=6 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.7min remaining:    0.0s


[CV]  vect__ngram_range=(1, 4), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=6, score=0.6344029458179905, total= 2.1min
[CV] vect__ngram_range=(1, 4), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=6 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  4.9min remaining:    0.0s


[CV]  vect__ngram_range=(1, 4), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=6, score=0.6442223391197616, total= 2.1min
[CV] vect__ngram_range=(1, 4), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=6 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  8.2min remaining:    0.0s


[CV]  vect__ngram_range=(1, 4), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=6, score=0.6308960196387866, total= 2.1min
[CV] vect__ngram_range=(1, 2), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 11.5min remaining:    0.0s


[CV]  vect__ngram_range=(1, 2), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.2716114325793442, total=  36.4s


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 12.7min remaining:    0.0s


[CV] vect__ngram_range=(1, 2), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5 
[CV]  vect__ngram_range=(1, 2), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.25670699631772753, total=  36.0s
[CV] vect__ngram_range=(1, 2), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 13.7min remaining:    0.0s


[CV]  vect__ngram_range=(1, 2), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.2780992460108715, total=  35.7s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 14.7min remaining:    0.0s


[CV] vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=1.0, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=1.0, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1, score=0.3741890233210591, total=  15.0s
[CV] vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=1.0, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=1.0, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1, score=0.3841837629317903, total=  14.9s
[CV] vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=1.0, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=1.0, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1, score=0.3743643696300193, total=  14.8s
[CV] vect__ngram_range=(1, 3), vect__min_df=0.1, vect__max_df=1.0, tfidf__use_idf=Tru

[CV]  vect__ngram_range=(1, 2), vect__min_df=0.0, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.36647378572681044, total=  44.9s
[CV] vect__ngram_range=(1, 2), vect__min_df=0.0, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5 
[CV]  vect__ngram_range=(1, 2), vect__min_df=0.0, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.3652463615640891, total=  42.0s
[CV] vect__ngram_range=(1, 2), vect__min_df=0.0, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5 
[CV]  vect__ngram_range=(1, 2), vect__min_df=0.0, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.33684025951253727, total=  42.0s
[CV] vect__ngram_range=(1, 1), vect__min_df=0.1, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=7 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.1, vect__max_df=0.9, tfidf__use_idf=True, c

[CV]  vect__ngram_range=(1, 4), vect__min_df=0.1, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1, score=0.3750657548658601, total= 1.7min
[CV] vect__ngram_range=(1, 4), vect__min_df=0.1, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1 
[CV]  vect__ngram_range=(1, 4), vect__min_df=0.1, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1, score=0.36191478169384533, total= 1.7min
[CV] vect__ngram_range=(1, 4), vect__min_df=0.1, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1 
[CV]  vect__ngram_range=(1, 4), vect__min_df=0.1, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1, score=0.3645449763282483, total= 1.7min
[CV] vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=4 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=False

[CV]  vect__ngram_range=(1, 3), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=7, score=0.2814308258811152, total= 1.2min
[CV] vect__ngram_range=(1, 3), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=7 
[CV]  vect__ngram_range=(1, 3), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=7, score=0.26968262318078207, total= 1.2min
[CV] vect__ngram_range=(1, 3), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=7 
[CV]  vect__ngram_range=(1, 3), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=7, score=0.14518674381904262, total= 1.1min
[CV] vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf

[CV]  vect__ngram_range=(1, 1), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=9, score=0.3405225320007014, total=  17.6s
[CV] vect__ngram_range=(1, 1), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=9 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=9, score=0.34192530247238295, total=  17.2s
[CV] vect__ngram_range=(1, 1), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=9 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=9, score=0.33368402595125374, total=  17.3s
[CV] vect__ngram_range=(1, 4), vect__min_df=0.05, vect__max_df=0.8, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=9 
[CV]  vect__ngram_range=(1, 4), vect__min_df=0.05, vect__max_df=0.8, tfidf__use_idf

[CV]  vect__ngram_range=(1, 3), vect__min_df=0.0, vect__max_df=1.0, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.21760476941960372, total= 1.3min
[CV] vect__ngram_range=(1, 3), vect__min_df=0.0, vect__max_df=1.0, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5 
[CV]  vect__ngram_range=(1, 3), vect__min_df=0.0, vect__max_df=1.0, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.22181308083464843, total= 1.3min
[CV] vect__ngram_range=(1, 3), vect__min_df=0.0, vect__max_df=1.0, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5 
[CV]  vect__ngram_range=(1, 3), vect__min_df=0.0, vect__max_df=1.0, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.18762055058741015, total= 1.3min
[CV] vect__ngram_range=(1, 3), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=3 
[CV]  vect__ngram_range=(1, 3), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=Fa

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 360.0min finished


In [9]:
# Best score reached by any parameter candidate during the search
best_score = rs_clf.best_score_
print(best_score)

0.6365071015255129


In [10]:
# Parameter combination that achieved the best score
best_params = rs_clf.best_params_
print(best_params)

{'vect__ngram_range': (1, 4), 'vect__min_df': 0.0, 'vect__max_df': 0.9, 'tfidf__use_idf': True, 'clf__weights': 'distance', 'clf__n_neighbors': 6}


In [11]:
# Predict the label indicator rows for the test texts with the fitted search object
rs_predicted = rs_clf.predict(X_test)

In [12]:
# Precision measures how many of the predicted labels are relevant;
# average='samples' computes it per test document and averages over documents
from sklearn.metrics import precision_score
rs_precision = precision_score(y_true=encoded_y_test, y_pred=rs_predicted, average='samples')
print(rs_precision)

0.8472300140252454


In [13]:
# Recall measures how many of the truly relevant labels are returned;
# average='samples' computes it per test document and averages over documents
from sklearn.metrics import recall_score
rs_recall = recall_score(y_true=encoded_y_test, y_pred=rs_predicted, average='samples')
print(rs_recall)

0.7957612591553685


In [14]:
# The F1 score is the harmonic mean of precision and recall;
# average='samples' computes it per test document and averages over documents
from sklearn.metrics import f1_score
rs_f1 = f1_score(y_true=encoded_y_test, y_pred=rs_predicted, average='samples')
print(rs_f1)

0.8063441973329772


Ergebnisse in Dateien speichern

In [15]:
# Directory that receives all result files of this notebook.
output = '../kNN'
# exist_ok=True creates the directory if needed and is a no-op when it already exists,
# which avoids the race between a separate existence check and the creation call.
os.makedirs(output, exist_ok=True)

# Timestamp (minute resolution) that is embedded in the names of the result files
timestamp = time.strftime('%Y-%m-%d_%H.%M')

In [16]:
# write real labels and predictions to file

# Map the predicted indicator rows back to label names and show the first test sample
inverse_prediction = label_encoder.inverse_transform(rs_predicted)
print('PREDICTED:')
print(inverse_prediction[0])
print('TRUE:')
print(y_test[0])

predictions_path = output+'/kNN_themes_only_predictions_%s.txt' % timestamp
with open(predictions_path, "w+", encoding="utf8") as preds:
    preds.write("Predictions from classification with k-nearest-neighbors (themes only):\n\n")
    for ident, true_labels, pred_labels in zip(z_test, y_test, inverse_prediction):
        # Labels are written in sorted order, each one followed by ', '
        true_part = ''.join('%s, ' % element for element in sorted(true_labels))
        pred_part = ''.join('%s, ' % element for element in sorted(pred_labels))
        preds.write(ident)
        preds.write('\nTRUE: %s\nPRED: %s\n\n*********************\n' % (true_part, pred_part))
    

PREDICTED:
('europe_t', 'histoire_t', 'information_t', 'études du politique_t')
TRUE:
['époque contemporaine_t', 'histoire_t']


In [17]:
# write parameters and scores to file

# Collect the report line by line; the lines are joined with '\n' without a trailing newline
report_lines = [
    "Parameters for classification with k-nearest-neighbors from randomized search (themes only):",
    "processing_time: %s" % rs_processing_time,
]
report_lines.extend("%s: %s" % (key, value) for key, value in best_params.items())
report_lines.append("best_score: %s" % best_score)
report_lines.append("precision: %s" % rs_precision)
report_lines.append("recall: %s" % rs_recall)
report_lines.append("f1-score: %s" % rs_f1)

with open(output+'/kNN_themes_only_params_%s.txt' % timestamp,"w+", encoding="utf8") as params:
    params.write("\n".join(report_lines))

In [18]:
# Put the cross-validation results of the search into a table, print it and save it as CSV
results = rs_clf.cv_results_
df = pd.DataFrame(results)
print(df)
results_path = output+'/kNN_themes_only_rs_results_%s.csv' % timestamp
df.to_csv(results_path, encoding='utf-8')

    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        5.937869      0.256111         9.315847        0.174959   
1       99.661865      0.096567        24.448782        0.572094   
2       23.684389      0.346529        12.450089        0.102243   
3        5.731281      0.018306         9.265559        0.074607   
4       53.257790      0.129040        13.532091        0.103403   
5       87.081455      0.400836        14.249159        0.134989   
6        7.081272      1.216908         9.227212        0.598827   
7       23.583461      0.083341        12.321155        0.064735   
8       53.314050      0.136759        14.185892        0.040481   
9       87.125706      0.165135        13.859358        0.082812   
10      93.143182      0.516101        22.061245        0.115885   
11      92.090188      1.482404        16.383680        0.475747   
12      27.309221      1.246412        15.783903        0.137735   
13       5.906338      0.083003         8.927844