# Einlesen der Daten

In [1]:
import pandas as pd
from sklearn.utils import shuffle

df = pd.read_csv('tagged_dataset.csv', encoding='UTF-8').dropna()
df = df[~df['genre'].isin(['NEWS-P4'])]  # remove invalid genre (only one document)
df = shuffle(df, random_state=42)
#df['n_chars'] = df.tokens.apply(lambda x: len(x.split())) 
df.shape

(629, 9)

In [2]:
df.head()

Unnamed: 0.1,Unnamed: 0,genre,lemmas,period,pos_tags,region,title,tokens,year
500,500,NEWS,Aufforderung an die Herr Pfarrer und Schullehr...,P4,"NN APPR ART NN NN KON NN $. PPER $, PRELS ART ...",OOD,Badisches,Aufforderung an die Herrn Pfarrer und Schulleh...,1832
248,248,SERM,"so , mein Zuhörer , haben wir dies Augenblick ...",P5,"ADV $, PPOSAT NN $, VAFIN PPER PDAT NN APPR AR...",NoD,Sonntag,"So , meine Zuhörer , haben wir diesen Augenbli...",1861
557,557,NEWS,Breslau von+die @card@ . Januar . gestern sein...,P4,NE APPRART CARD $. NN $. ADV VAFIN APPRART NN ...,OMD,Neue,Breslau vom 19 . Januar . Gestern war zur Feie...,1821
217,217,SERM,"eine höchst bedeutungsvoll Fest sein es , mein...",P5,"ART ADV ADJA NN VAFIN PPER $, PPOSAT NN $, PRE...",NoD,Gegenwärtige,"Ein höchst bedeutungsvolles Fest ist es , mein...",1853
538,538,LEGA,die Polizei = Verordnung für Berlin . systemat...,P4,ART NN $( NN APPR NE $. ADJD VVPP APPR NE NE $...,NoD,DiePolizei=VerordnungfürBerlin,Die Polizei = Verordnungen für Berlin . System...,1850


# Trennen von Trainings- und Testdaten

In [3]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, random_state=42, test_size=0.2)

# Feature Extraction

In [4]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack
from stop_words import get_stop_words

# tfidf values for tokens (n_grams)
cv_token = TfidfVectorizer(stop_words=get_stop_words('de'), max_features=100)
X_token_train = cv_token.fit_transform(df_train.tokens)
X_token_test = cv_token.transform(df_test.tokens)

# tfidf values for lemmas (n_grams)
cv_lemma = TfidfVectorizer(max_features=6000)#stop_words=get_stop_words('de'),  
X_lemma_train = cv_lemma.fit_transform(df_train.lemmas)
X_lemma_test = cv_lemma.transform(df_test.lemmas)

# Only count pos tags
cv_pos = CountVectorizer()
X_pos_train = cv_pos.fit_transform(df_train.pos_tags)
X_pos_test = cv_pos.transform(df_test.pos_tags)

# Concatenate features horizontally
features = np.hstack([
    np.array(list(cv_token.vocabulary_.keys())),
    np.array(list(cv_lemma.vocabulary_.keys())),
    np.array(list(cv_pos.vocabulary_.keys()))
])

X_train = hstack([
    X_token_train,
    X_lemma_train,
    X_pos_train
])
X_test = hstack([
    X_token_test,
    X_lemma_test,
    X_pos_test
])

In [5]:
#train_lemma_pos = []
#for index, row in df_train.iterrows():
#    sample_lemma_pos = []
#    for lemma, pos in zip(row.tokens, row.pos_tags):
#        sample_lemma_pos.append("_".join((lemma, pos)))
#    train_lemma_pos.append(" ".join(sample_lemma_pos))
#
#test_lemma_pos = []
#for index, row in df_test.iterrows():
#    sample_lemma_pos = []
#    for lemma, pos in zip(row.tokens, row.pos_tags):
#        sample_lemma_pos.append("_".join((lemma, pos)))
#    test_lemma_pos.append(" ".join(sample_lemma_pos))
#
#merged_cv = TfidfVectorizer()
#X_train = merged_cv.fit_transform(train_lemma_pos)
#X_test = merged_cv.transform(test_lemma_pos)

In [6]:
y_train = df_train.genre
y_test = df_test.genre

In [7]:
features.shape

(6150,)

In [8]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((503, 6150), (503,), (126, 6150), (126,))

In [9]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        DRAM       1.00      1.00      1.00         9
        HUMA       0.50      0.11      0.18        18
        LEGA       0.65      0.69      0.67        16
        NARR       0.50      0.86      0.63         7
        NEWS       0.86      0.70      0.77        53
        SCIE       0.38      0.85      0.52        13
        SERM       0.83      1.00      0.91        10

    accuracy                           0.68       126
   macro avg       0.67      0.74      0.67       126
weighted avg       0.72      0.68      0.67       126



In [10]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

        DRAM       1.00      1.00      1.00         9
        HUMA       0.64      0.39      0.48        18
        LEGA       0.86      0.75      0.80        16
        NARR       0.67      0.86      0.75         7
        NEWS       0.89      0.96      0.93        53
        SCIE       0.60      0.69      0.64        13
        SERM       0.91      1.00      0.95        10

    accuracy                           0.83       126
   macro avg       0.79      0.81      0.79       126
weighted avg       0.82      0.83      0.82       126



# Check feature weights of logreg model to get highly correlated features for each class

In [91]:
n = 10
for class_ind, class_coef in enumerate(logreg.coef_):
    print('Class:', logreg.classes_[class_ind])
    print('Positive features')
    positive_features = np.flip(np.argsort(class_coef)[-n:])
    for ind in positive_features:
        print(features[ind], class_coef[ind])
    print()
    print('Negative features')
    negative_features = np.argsort(class_coef)[:n]
    for ind in negative_features:
        print(features[ind], class_coef[ind])
    print()
    print('#'*40)

Class: DRAM
Positive features
deutschen 2.906600954330881
gibt 1.8768686249087378
mutter 1.5450681232860786
viele 1.510233465633998
wegen 1.3856997124137145
seit 1.2923487252888761
sei 1.244900592196551
sehen 1.150804129548595
bald 1.0005089768352733
fu 0.8786443948747108

Negative features
jahre -1.6894873237345962
alte -1.129915568158518
große -1.123023466867393
seyn -0.8919906474551318
herr -0.8850382269963954
schon -0.8816901786342032
einzelnen -0.8718448481385498
wohl -0.8279963415790786
zwei -0.8129910947501675
tag -0.7934239046229704

########################################
Class: HUMA
Positive features
gott 1.8795639257345078
herrn 1.678565881622185
wasser 1.2952886717343095
wer 1.0981184654156566
wort 1.0935045748553873
ja 1.0627937415708977
hand 0.9342812250228008
tag 0.8374884853422764
ganz 0.8169209603975234
weiß 0.8065272083605995

Negative features
geist -1.4121976069818587
muß -1.137299419198156
tage -1.0824044196211868
stehen -0.7754124243062941
weit -0.740319003172470

In [None]:
from sklearn.model_selection import GridSearchCV

linsvm_params = {
    'C': [0.1, 0.5, 1, 1.5, 2, 3, 4]
}

gridsearch_linsvm = GridSearchCV(
    LinearSVC(),
    cv=5,
    param_grid=linsvm_params,
    scoring='f1_macro',
    n_jobs=-1
)

gridsearch_linsvm.fit(X_train, y_train)
gridsearch_linsvm.best_params_, gridsearch_linsvm.best_score_

In [41]:
from sklearn.svm import LinearSVC

linsvm = LinearSVC()
linsvm.fit(X_train, y_train)
y_pred = linsvm.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        DRAM       0.90      1.00      0.95         9
        HUMA       0.75      0.50      0.60        18
        LEGA       1.00      0.50      0.67        16
        NARR       0.40      0.86      0.55         7
        NEWS       0.74      1.00      0.85        53
        SCIE       1.00      0.08      0.14        13
        SERM       0.88      0.70      0.78        10

    accuracy                           0.74       126
   macro avg       0.81      0.66      0.65       126
weighted avg       0.80      0.74      0.70       126





In [None]:
from sklearn.svm import SVC

rbfsvm = SVC(kernel='poly')
rbfsvm.fit(X_train, y_train)
y_pred = rbfsvm.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [42]:
from sklearn.tree import DecisionTreeClassifier

dectree = DecisionTreeClassifier()
dectree.fit(X_train, y_train)
y_pred = dectree.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        DRAM       1.00      0.89      0.94         9
        HUMA       0.71      0.56      0.63        18
        LEGA       0.76      0.81      0.79        16
        NARR       0.50      0.71      0.59         7
        NEWS       0.94      0.85      0.89        53
        SCIE       0.45      0.69      0.55        13
        SERM       0.89      0.80      0.84        10

    accuracy                           0.78       126
   macro avg       0.75      0.76      0.75       126
weighted avg       0.81      0.78      0.79       126



In [43]:
from sklearn.ensemble import RandomForestClassifier

randforest = RandomForestClassifier()
randforest.fit(X_train, y_train)
y_pred = randforest.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        DRAM       0.88      0.78      0.82         9
        HUMA       0.54      0.39      0.45        18
        LEGA       0.87      0.81      0.84        16
        NARR       0.40      0.29      0.33         7
        NEWS       0.89      0.91      0.90        53
        SCIE       0.60      0.69      0.64        13
        SERM       0.62      1.00      0.77        10

    accuracy                           0.76       126
   macro avg       0.68      0.69      0.68       126
weighted avg       0.76      0.76      0.75       126





In [44]:
from sklearn.linear_model import SGDClassifier

sgdsvm = SGDClassifier(loss='modified_huber', max_iter=15, random_state=42)

sgdsvm.fit(X_train, y_train)
y_pred = sgdsvm.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        DRAM       0.80      0.89      0.84         9
        HUMA       0.56      0.28      0.37        18
        LEGA       0.71      0.75      0.73        16
        NARR       0.50      0.43      0.46         7
        NEWS       0.87      0.75      0.81        53
        SCIE       0.46      0.92      0.62        13
        SERM       0.75      0.90      0.82        10

    accuracy                           0.71       126
   macro avg       0.66      0.70      0.66       126
weighted avg       0.73      0.71      0.70       126





In [40]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

adaboost = AdaBoostClassifier(
    #base_estimator=SGDClassifier(loss='modified_huber', max_iter=5, random_state=42),
    #algorithm='SAMME.R',
    #n_estimators=100
)

adaboost.fit(X_train, y_train)
y_pred = adaboost.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        DRAM       0.00      0.00      0.00         9
        HUMA       0.00      0.00      0.00        18
        LEGA       0.28      1.00      0.44        16
        NARR       0.14      0.43      0.21         7
        NEWS       0.94      0.83      0.88        53
        SCIE       0.00      0.00      0.00        13
        SERM       0.00      0.00      0.00        10

    accuracy                           0.50       126
   macro avg       0.19      0.32      0.22       126
weighted avg       0.44      0.50      0.44       126



  'precision', 'predicted', average, warn_for)


In [20]:
from xgboost.sklearn import XGBClassifier

grad_boost = XGBClassifier()
grad_boost.fit(X_train, y_train)
y_pred = grad_boost.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

KeyboardInterrupt: 

# Boostrap Validation

In [39]:
from sklearn.metrics import f1_score
from sklearn.utils import resample
from sklearn.base import BaseEstimator
from scipy.sparse import csr_matrix

def bootstrap_validation(clf1: BaseEstimator, clf2: BaseEstimator,
                         X: csr_matrix, y: csr_matrix,
                         n_samples: int,
                         sample_size: int,
                         scorer: callable = lambda y_true, y_pred: f1_score(y_true, y_pred, average='macro')) -> float:
    
    y_pred_1 = clf1.predict(X)
    clf1_score = scorer(y, y_pred_1)
    
    y_pred_2 = clf2.predict(X)
    clf2_score = scorer(y, y_pred_2)
    
    initial_difference = clf1_score - clf2_score
    
    sample_differences = []
    for _ in range(n_samples):
        
        # create boostrap sample
        X_sample, y_sample = resample(X, y, replace=True, n_samples=sample_size)
        # calculate performance difference and store it
        y_pred_1 = clf1.predict(X_sample)
        clf1_score = scorer(y_sample, y_pred_1)
    
        y_pred_2 = clf2.predict(X_sample)
        clf2_score = scorer(y_sample, y_pred_2)
        sample_differences.append(clf1_score - clf2_score)
        
    
    # calculate p value based on performance differences
    s_values = []
    s = 0
    for sample_difference in sample_differences:
        if sample_difference > 2 * initial_difference:
            s += 1
    p_value = s / n_samples
    return s, p_value



In [40]:
from sklearn.linear_model import SGDClassifier

# train simple sgd with hinge loss => linear support vector machine
# train simple sgd with log loss => logistic regression

sgd_hinge = SGDClassifier(loss='hinge')
sgd_hinge.fit(X_train, y_train)

sgd_log = SGDClassifier(loss='log')
sgd_log.fit(X_train, y_train)
bootstrap_validation(sgd_hinge, sgd_log, X_test, y_test, 50, 10)

0.4533333333333333 0.35333333333333333
0.5 0.480952380952381
0.71 0.5416666666666666
0.3928571428571429 0.2704761904761905
0.3444444444444444 0.3444444444444444
0.6011904761904762 0.7166666666666667
0.4 0.4333333333333333
0.373015873015873 0.25142857142857145
0.798095238095238 0.9314285714285713
0.38888888888888884 0.5111111111111111
0.3888888888888889 0.5222222222222223
0.7083333333333334 0.6416666666666666
0.3333333333333333 0.3642857142857143
0.4375 0.4375
0.44761904761904764 0.3722222222222222
0.75 0.6666666666666666
0.3333333333333333 0.39285714285714285
0.6507936507936508 0.6507936507936508
0.3095238095238095 0.5238095238095238
0.74 0.8266666666666665
0.3333333333333333 0.2333333333333333
0.5599999999999999 0.6476190476190476
0.6714285714285715 0.6166666666666666
0.44761904761904764 0.6517857142857143
0.42777777777777776 0.31666666666666665
0.45999999999999996 0.6266666666666667
0.5666666666666667 0.5777777777777778
0.4740740740740741 0.41190476190476194
0.71 0.8314285714285713
0

(15, 0.3)

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

grad_boost_sklean = GradientBoostingClassifier()
grad_boost_sklean.fit(X_train, y_train)
y_pred = grad_boost_sklean.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        DRAM       0.88      0.78      0.82         9
        HUMA       0.93      0.78      0.85        18
        LEGA       0.94      1.00      0.97        16
        NARR       0.67      0.86      0.75         7
        NEWS       0.93      0.96      0.94        53
        SCIE       0.92      0.92      0.92        13
        SERM       1.00      0.90      0.95        10

    accuracy                           0.91       126
   macro avg       0.90      0.89      0.89       126
weighted avg       0.92      0.91      0.91       126



In [None]:
from keras.preprocessing.text import Tokenizer

tok = Tokenizer(num_words=20000)
tok.fit_on_texts(df_train.text)

Xk_train = tok.texts_to_matrix(df_train.text)
Xk_test = tok.texts_to_matrix(df_test.text)

In [None]:
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
yk_train = le.fit_transform(y_train)
yk_test = le.transform(y_test)

yk_train = to_categorical(yk_train)

from keras.models import Sequential
from keras.layers import Input, Dense, Dropout, Reshape

def build_model(num_words, n_classes, hiddenlayer_size=512, n_hiddenlayer=1):
    model = Sequential()
    model.add(Dense(hiddenlayer_size, input_shape=(num_words, ), activation='relu'))
    model.add(Dropout(0.5))
    for i in range(n_hiddenlayer):
        model.add(Dense(hiddenlayer_size, activation='relu'))
        model.add(Dropout(0.5))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

In [None]:
import numpy as np

model = build_model(20000,
                    len(np.unique(y_train)),
                    n_hiddenlayer=3,
                    hiddenlayer_size=1024)

In [None]:
history = model.fit(Xk_train, yk_train,
                    batch_size=128,
                    validation_split=0.1,
                    epochs=5)

In [None]:
yk_pred = model.predict_classes(X_test)
print(classification_report(yk_test, yk_pred))

In [None]:
le.classes_

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


NUM_WORDS = 5000
MAX_SEQ_LEN = 3000
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(df_train.text)

train_sequences = tokenizer.texts_to_sequences(df_train.text)
test_sequences = tokenizer.texts_to_sequences(df_test.text)

train_sequences = pad_sequences(train_sequences, maxlen=MAX_SEQ_LEN)
test_sequences = pad_sequences(test_sequences, maxlen=MAX_SEQ_LEN)

In [None]:
train_sequences.shape

In [None]:
from flair.embeddings import WordEmbeddings
from flair.data import Sentence
from tqdm import tqdm_notebook

def create_embedding_matrix(sequences, tokenizer: Tokenizer):
    X = []
    embedder = WordEmbeddings('de')
    for sequence in train_sequences:
        text = []
        for entry in sequence:
            text.append(tokenizer.index_word.get(entry, 'UNKOWN'))
        text_mat = []
        print(text[:4])
        flair_data = Sentence(" ".join(text))
        embedder.embed(flair_data)
        for token in flair_data:
            text_mat.append(token.embedding.cpu().detach().numpy())
        X.append(text_mat)
    return X

In [None]:
df_train.iloc[0].text

In [None]:
X_train[0]

In [59]:
from keras.models import Model
from keras.layers import *

def build_multiinput_model(embedding_dim, pos_input_shape, char_input_shape, num_classes):
    
    """
    Conceptional draft
    """
    
    # 1. Input 
    embedding_input = Input(shape=(embedding_dim,))
    emebedding_layer = Embedding(input_dim=embedding_dim, output_dim=100)(embedding_input)
    embedding_conv_dropout = SpatialDropout1D(0.5)
    embedding_conv = Conv1D(filters=128, kernel_size=(5,))(emebedding_layer)
    
    # 2. Input pos 
    pos_input = Input(shape=pos_input_shape)
    pos_dense = Dense(512)(pos_input)
    pos_dropout = Dropout(0.5)(pos_dense)
    
    # 3.Input char
    char_input = Input(shape=char_input_shape)
    char_embedding = Embedding(input_dim=embedding_dim, output_dim=100)(char_input) # meh
    char_conv_dropout = SpatialDropout1D(0.5)
    char_conv = Conv1D(filters=128, kernel_size=(5,))(char_embedding)

    # 3. Concat input the three input layers
    concat_layer = Concatenate()([embedding_conv, char_conv])
    bi_lstm = Bidirectional(LSTM(16, return_sequences=True))(concat_layer)
    flatten_layer = Flatten()(bi_lstm)
    hidden_dense = Dense(512, activation='relu')(flatten_layer)
    outpout_layer = Dense(num_classes, activation='softmax')
    model = Model(inputs=[embedding_input, pos_input, char_input], outpouts=[outpout_layer])
    mode.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

In [60]:
model = build_multiinput_model(300, (5000,), (100,), 10)

ValueError: A `Concatenate` layer requires inputs with matching shapes except for the concat axis. Got inputs shapes: [(None, 296, 128), (None, 96, 128)]