# Textklassifikation mit Bag-of-Words-Vektorisierung in tf-idf-Repräsentation und kNN-Algorithmus 
Labels sind auf die Disziplinen reduziert

Autorin: Maria Hartmann

In [2]:
# Imports
import os
import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer # module to k-hot-encode the (multi-label) targets
from sklearn.pipeline import Pipeline # assembles transformers and a final estimator
from sklearn.feature_extraction.text import CountVectorizer # turns raw text into a token-count matrix
from sklearn.feature_extraction.text import TfidfTransformer # module to transform a count matrix to a normalized tf-idf representation
from sklearn.neighbors import KNeighborsClassifier # k-nearest neighbors classifier (supports multi-label classification)
from sklearn.model_selection import RandomizedSearchCV # module for parameter optimization

np.random.seed(7) # fix NumPy's global random seed for reproducibility

Einlesen des Trainings- und Testdatensatzes

In [4]:
# Semicolon-separated CSV files holding the training and test split
trainset = '../Datasets/disciplines_only_trainset.csv'
testset = '../Datasets/disciplines_only_testset.csv'

trainset_csv = pd.read_csv(trainset, delimiter=';')
testset_csv = pd.read_csv(testset, delimiter=';')

# X_* = document text, y_* = raw label string, z_* = source file name
X_train, y_train, z_train = (trainset_csv[column].values for column in ('text', 'classes', 'filename'))
X_test, y_test, z_test = (testset_csv[column].values for column in ('text', 'classes', 'filename'))

# Split the label string of every blog post into a list of labels
y_train = [labels.split(', ') for labels in y_train]
y_test = [labels.split(', ') for labels in y_test]

In [5]:
# Show file name, labels and text of the first training document (one per line)
print(z_train[0], y_train[0], X_train[0], sep='\n')

archivewk1_2023.txt
['histoire et archéologie_d']
stadtarchiv goch bestand völcker janssen niederrheinisches volksblatt gocher zeitung vom lokalteil foto lazarett wilhelm anton hospital nachdem ein weiterer verwundetentransport eingetroffen ist werden verwundete im wilhelm anton hospital gepflegt ein neuer verwundetentransport ist gestern abend wieder hier angekommen damit ist die zahl der verwundeten die im wilhelm anton hospital aufnahme gefunden haben auf gestiegen mit der einrichtung des van gulikschen hauses in der brückenstraße für die aufnahme von verwundeten ist begonnen worden und dürfte im laufe der nächsten woche als lazarett fertig eingerichtet werden


k-hot-Kodierung der Labels

In [6]:
# k-hot encode the labels with MultiLabelBinarizer:
# learn the label vocabulary on the training labels, then encode both splits
label_encoder = MultiLabelBinarizer().fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)
print(encoded_y_train[0])

[0 0 0 1 0 0 0 0 0 0 0 0 0 0]


In [7]:
# Number of distinct labels, followed by the column index of each label
print(len(label_encoder.classes_))
for position in range(len(label_encoder.classes_)):
    print(position, label_encoder.classes_[position])

14
0 administration publique et développement_d
1 arts et humanités_d
2 bibliothéconomie_d
3 histoire et archéologie_d
4 langue et linguistique_d
5 littérature_d
6 pluridisciplinarité_d
7 psychologie_d
8 sciences de l'information et de la communication_d
9 sciences de la santé et de la santé publique_d
10 sciences politiques_d
11 sociologie et anthropologie_d
12 travail social et politique sociale_d
13 éducation_d


Vektorisierung und Klassifikation der Daten mit scikit-learn

In [22]:
# Pipeline configured with the best parameters found by the randomized search:
# token counts (1- to 4-grams) -> tf-idf weighting -> distance-weighted kNN
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1,4), max_df=0.9, min_df=0.0)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', KNeighborsClassifier(n_neighbors=6, weights='distance')),
])

In [23]:
# Fit the pipeline on the training data and record the wall-clock duration in minutes
start = time.time()
text_clf.fit(X_train, encoded_y_train)  # fit() returns the pipeline itself, so no re-assignment is needed
elapsed_seconds = time.time() - start
processing_time = elapsed_seconds / 60

In [24]:
# Predict the k-hot label vectors for the held-out test documents
predicted = text_clf.predict(X_test)

In [25]:
# Full parameter dictionary of the fitted pipeline (including every step's settings)
clf_params = text_clf.get_params()
print(clf_params)

{'memory': None, 'steps': [('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=0.0,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='distance'))], 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=0.0,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents=None, token_pat

In [26]:
# Precision measures how many of the predicted labels are relevant;
# average='samples' computes it per document and averages over all documents
from sklearn.metrics import precision_score
precision = precision_score(y_true=encoded_y_test, y_pred=predicted, average='samples')
print(precision)

0.8646960118629518


In [27]:
# Recall measures how many of the truly relevant labels are returned;
# average='samples' computes it per document and averages over all documents
from sklearn.metrics import recall_score
recall = recall_score(y_true=encoded_y_test, y_pred=predicted, average='samples')
print(recall)

0.8334152813548739


In [28]:
# The F1 score is the harmonic mean of precision and recall;
# average='samples' computes it per document and averages over all documents
from sklearn.metrics import f1_score
f1 = f1_score(y_true=encoded_y_test, y_pred=predicted, average='samples')
print(f1)

0.8354697424119106


Parameteroptimierung mit Zufallssuche (RandomizedSearchCV)

In [8]:
# Untuned pipeline (library defaults) that serves as the estimator for the parameter search
clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
])

In [9]:
# Search space for the randomized search; keys follow the '<step>__<parameter>' convention
parameters = dict(
    vect__ngram_range=[(1, upper) for upper in range(1, 5)],
    vect__max_df=(0.7, 0.8, 0.9, 1.0),
    vect__min_df=(0.0, 0.01, 0.05, 0.1),
    tfidf__use_idf=(True, False),
    clf__n_neighbors=list(range(1, 10)),
    clf__weights=('distance', 'uniform'),
)

In [10]:
# Randomized search: 50 sampled parameter settings, each evaluated with 3-fold cross-validation.
# The wall-clock duration of the whole search is recorded in minutes.
rs_clf = RandomizedSearchCV(
    estimator=clf,
    param_distributions=parameters,
    n_iter=50,
    cv=3,
    n_jobs=1,
    verbose=10,
    random_state=1,
    return_train_score=True,
)
start = time.time()
rs_clf.fit(X_train, encoded_y_train)  # fit() returns the search object itself
rs_processing_time = (time.time() - start) / 60

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=5 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=5, score=0.42096944151738674, total=  14.6s
[CV] vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   52.6s remaining:    0.0s


[CV]  vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=5, score=0.40909889337783245, total=  15.9s
[CV] vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=5 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.6min remaining:    0.0s


[CV]  vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=5, score=0.4215703495520815, total=  15.8s
[CV] vect__ngram_range=(1, 4), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=6 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.3min remaining:    0.0s


[CV]  vect__ngram_range=(1, 4), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=6, score=0.7014401123990165, total= 2.3min
[CV] vect__ngram_range=(1, 4), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=6 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  5.7min remaining:    0.0s


[CV]  vect__ngram_range=(1, 4), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=6, score=0.6919023362023538, total= 2.1min
[CV] vect__ngram_range=(1, 4), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=6 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  9.2min remaining:    0.0s


[CV]  vect__ngram_range=(1, 4), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=6, score=0.6857544352713859, total= 2.2min
[CV] vect__ngram_range=(1, 2), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 12.8min remaining:    0.0s


[CV]  vect__ngram_range=(1, 2), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.35493501931858096, total=  36.3s


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 14.0min remaining:    0.0s


[CV] vect__ngram_range=(1, 2), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5 
[CV]  vect__ngram_range=(1, 2), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.3953978570173898, total=  38.6s


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 15.2min remaining:    0.0s


[CV] vect__ngram_range=(1, 2), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5 
[CV]  vect__ngram_range=(1, 2), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.29878798524503775, total=  37.5s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 16.3min remaining:    0.0s


[CV] vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=1.0, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=1.0, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1, score=0.43888303477344576, total=  16.2s
[CV] vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=1.0, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=1.0, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1, score=0.4415949411558054, total=  15.0s
[CV] vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=1.0, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=1.0, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1, score=0.45459336026699454, total=  15.2s
[CV] vect__ngram_range=(1, 3), vect__min_df=0.1, vect__max_df=1.0, tfidf__use_idf=T

[CV]  vect__ngram_range=(1, 2), vect__min_df=0.0, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.46505093080435544, total=  41.3s
[CV] vect__ngram_range=(1, 2), vect__min_df=0.0, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5 
[CV]  vect__ngram_range=(1, 2), vect__min_df=0.0, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.45986298963639555, total=  42.6s
[CV] vect__ngram_range=(1, 2), vect__min_df=0.0, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5 
[CV]  vect__ngram_range=(1, 2), vect__min_df=0.0, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.49235903741436854, total=  41.7s
[CV] vect__ngram_range=(1, 1), vect__min_df=0.1, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=uniform, clf__n_neighbors=7 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.1, vect__max_df=0.9, tfidf__use_idf=True, 

[CV]  vect__ngram_range=(1, 4), vect__min_df=0.1, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1, score=0.41921320688443975, total= 1.7min
[CV] vect__ngram_range=(1, 4), vect__min_df=0.1, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1 
[CV]  vect__ngram_range=(1, 4), vect__min_df=0.1, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1, score=0.4064640786931319, total= 1.7min
[CV] vect__ngram_range=(1, 4), vect__min_df=0.1, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1 
[CV]  vect__ngram_range=(1, 4), vect__min_df=0.1, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1, score=0.42034076936588793, total= 1.7min
[CV] vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=4 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=Fals

[CV]  vect__ngram_range=(1, 3), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=7, score=0.2156656129258869, total= 1.1min
[CV] vect__ngram_range=(1, 3), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=7 
[CV]  vect__ngram_range=(1, 3), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=7, score=0.30440892323906554, total= 1.1min
[CV] vect__ngram_range=(1, 3), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=7 
[CV]  vect__ngram_range=(1, 3), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=7, score=0.26295450553311084, total= 1.1min
[CV] vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=1 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.05, vect__max_df=0.9, tfidf__use_idf

[CV]  vect__ngram_range=(1, 1), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=9, score=0.47506146821215317, total=  17.3s
[CV] vect__ngram_range=(1, 1), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=9 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=9, score=0.4563499033901282, total=  16.8s
[CV] vect__ngram_range=(1, 1), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=9 
[CV]  vect__ngram_range=(1, 1), vect__min_df=0.0, vect__max_df=0.9, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=9, score=0.47988758124011943, total=  16.8s
[CV] vect__ngram_range=(1, 4), vect__min_df=0.05, vect__max_df=0.8, tfidf__use_idf=True, clf__weights=distance, clf__n_neighbors=9 
[CV]  vect__ngram_range=(1, 4), vect__min_df=0.05, vect__max_df=0.8, tfidf__use_idf

[CV]  vect__ngram_range=(1, 3), vect__min_df=0.0, vect__max_df=1.0, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.3744292237442922, total= 1.3min
[CV] vect__ngram_range=(1, 3), vect__min_df=0.0, vect__max_df=1.0, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5 
[CV]  vect__ngram_range=(1, 3), vect__min_df=0.0, vect__max_df=1.0, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.36184788336553664, total= 1.3min
[CV] vect__ngram_range=(1, 3), vect__min_df=0.0, vect__max_df=1.0, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5 
[CV]  vect__ngram_range=(1, 3), vect__min_df=0.0, vect__max_df=1.0, tfidf__use_idf=False, clf__weights=uniform, clf__n_neighbors=5, score=0.38433163534164766, total= 1.3min
[CV] vect__ngram_range=(1, 3), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=False, clf__weights=distance, clf__n_neighbors=3 
[CV]  vect__ngram_range=(1, 3), vect__min_df=0.01, vect__max_df=0.8, tfidf__use_idf=Fal

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 252.8min finished


In [11]:
# Mean cross-validated score of the best parameter setting found by the search
best_score = rs_clf.best_score_
print(best_score)

0.6930327868852459


In [12]:
# Parameter setting that achieved the best cross-validated score
best_params = rs_clf.best_params_
print(best_params)

{'vect__ngram_range': (1, 4), 'vect__min_df': 0.0, 'vect__max_df': 0.9, 'tfidf__use_idf': True, 'clf__weights': 'distance', 'clf__n_neighbors': 6}


In [13]:
# Predict the test set through the search object
rs_predicted = rs_clf.predict(X_test)

In [14]:
# Precision measures how many of the predicted labels are relevant;
# average='samples' computes it per document and averages over all documents
# NOTE(review): an older annotation in this cell read "0.9991 best_params"; it does not
# match the saved cell output - confirm which run it referred to
from sklearn.metrics import precision_score
rs_precision = precision_score(y_true=encoded_y_test, y_pred=rs_predicted, average='samples')
print(rs_precision)

0.8646960118629518


In [15]:
# Recall measures how many of the truly relevant labels are returned;
# average='samples' computes it per document and averages over all documents
# NOTE(review): an older annotation in this cell read "0.9984 best_params"; it does not
# match the saved cell output - confirm which run it referred to
from sklearn.metrics import recall_score
rs_recall = recall_score(y_true=encoded_y_test, y_pred=rs_predicted, average='samples')
print(rs_recall)

0.8334152813548739


In [16]:
# The F1 score is the harmonic mean of precision and recall;
# average='samples' computes it per document and averages over all documents
# NOTE(review): an older annotation in this cell read "0.9988 best_params"; it does not
# match the saved cell output - confirm which run it referred to
from sklearn.metrics import f1_score
rs_f1 = f1_score(y_true=encoded_y_test, y_pred=rs_predicted, average='samples')
print(rs_f1)

0.8354697424119106


Ergebnisse in Dateien speichern

In [17]:
# Directory that receives all result files of this notebook
output = '../kNN'
# exist_ok=True makes the call idempotent and avoids the check-then-create race of
# "if not os.path.exists(...): os.makedirs(...)"
os.makedirs(output, exist_ok=True)

# Minute-resolution timestamp used as a suffix in the result file names
timestamp = time.strftime('%Y-%m-%d_%H.%M')

In [18]:
# Write the true labels and the predictions of the tuned model to a text file

# Decode the k-hot predictions back into label names
inverse_prediction = label_encoder.inverse_transform(rs_predicted)

# Show the first test document as a quick sanity check
print('PREDICTED:', inverse_prediction[0], 'TRUE:', y_test[0], sep='\n')

# One record per test document: file name, sorted true labels, sorted predicted labels.
# Every label is followed by ', ' (including the last one of a line).
with open(output+'/kNN_disciplines_only_predictions_%s.txt' % timestamp,"w+", encoding="utf8") as preds:
    preds.write("Predictions from classification with k-nearest-neighbors (disciplines only):\n\n")
    for ident, label, pred in zip(z_test, y_test, inverse_prediction):
        true_part = ''.join('%s, ' % element for element in sorted(label))
        pred_part = ''.join('%s, ' % element for element in sorted(pred))
        preds.writelines([
            ident, '\n',
            'TRUE: ', true_part, '\n',
            'PRED: ', pred_part, '\n',
            '\n*********************\n',
        ])

PREDICTED:
('histoire et archéologie_d', 'pluridisciplinarité_d')
TRUE:
['pluridisciplinarité_d', 'histoire et archéologie_d']


In [21]:
# Write the search duration, the best parameters and the resulting scores to a text file

with open(output+'/kNN_disciplines_only_params_%s.txt' % timestamp,"w+", encoding="utf8") as params:
    # Collect the report pieces in order, then write them in one go
    report_parts = [
        "Parameters for classification with k-nearest-neighbors from randomized search (disciplines only):",
        "\nprocessing_time: %s" % rs_processing_time,
    ]
    report_parts.extend("\n%s: %s" % (key, value) for key, value in best_params.items())
    report_parts.append("\nbest_score: %s" % best_score)
    report_parts.append("\nprecision: %s" % rs_precision)
    report_parts.append("\nrecall: %s" % rs_recall)
    report_parts.append("\nf1-score: %s" % rs_f1)
    params.writelines(report_parts)

In [20]:
# Turn the per-candidate cross-validation results into a table, print it and save it as CSV
results = rs_clf.cv_results_
df = pd.DataFrame(results)
print(df)
results_file = output+'/kNN_disciplines_only_rs_results_%s.csv' % timestamp
df.to_csv(results_file, encoding='utf-8')

    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        5.832048      0.192454         9.706409        0.406635   
1      101.823498      0.785690        31.289585        3.289126   
2       24.537419      0.617027        13.039641        0.317276   
3        5.775571      0.090621         9.812640        0.405096   
4       54.060066      0.887256        14.669857        1.426723   
5       86.546893      0.559607        14.265831        0.061782   
6        5.798417      0.099002         8.409352        0.089288   
7       26.095290      1.206969        13.118725        0.695301   
8       55.302185      1.736518        14.591272        0.418776   
9       89.113790      3.427033        14.378681        1.075087   
10      91.629176      1.059045        21.800640        0.192640   
11      86.021698      0.833466        15.356696        0.286809   
12      26.001849      0.337593        15.951108        0.685936   
13       5.894198      0.309806         9.177021