# Einlesen der Daten

In [1]:
import pandas as pd
from sklearn.utils import shuffle

df = pd.read_csv('tagged_dataset.csv', encoding='UTF-8').dropna()
df = df[~df['genre'].isin(['NEWS-P4'])]  # remove invalid genre (only one document)
df = shuffle(df, random_state=42)
df['n_chars'] = df.tokens.apply(lambda x: len(x.split())) 
df.shape

(629, 10)

In [2]:
df.head()

Unnamed: 0.1,Unnamed: 0,genre,lemmas,period,pos_tags,region,title,tokens,year,n_chars
500,500,NEWS,Aufforderung an die Herr Pfarrer und Schullehr...,P4,"NN APPR ART NN NN KON NN $. PPER $, PRELS ART ...",OOD,Badisches,Aufforderung an die Herrn Pfarrer und Schulleh...,1832,494
248,248,SERM,"so , mein Zuhörer , haben wir dies Augenblick ...",P5,"ADV $, PPOSAT NN $, VAFIN PPER PDAT NN APPR AR...",NoD,Sonntag,"So , meine Zuhörer , haben wir diesen Augenbli...",1861,2449
557,557,NEWS,Breslau von+die @card@ . Januar . gestern sein...,P4,NE APPRART CARD $. NN $. ADV VAFIN APPRART NN ...,OMD,Neue,Breslau vom 19 . Januar . Gestern war zur Feie...,1821,51
217,217,SERM,"eine höchst bedeutungsvoll Fest sein es , mein...",P5,"ART ADV ADJA NN VAFIN PPER $, PPOSAT NN $, PRE...",NoD,Gegenwärtige,"Ein höchst bedeutungsvolles Fest ist es , mein...",1853,2450
538,538,LEGA,die Polizei = Verordnung für Berlin . systemat...,P4,ART NN $( NN APPR NE $. ADJD VVPP APPR NE NE $...,NoD,DiePolizei=VerordnungfürBerlin,Die Polizei = Verordnungen für Berlin . System...,1850,2631


# Trennen von Trainings- und Testdaten

In [3]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, random_state=42, test_size=0.2)

# Feature Extraction

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack
from stop_words import get_stop_words

# tfidf values for tokens (n_grams)
cv_token = TfidfVectorizer(stop_words=get_stop_words('de'), ngram_range=(1, 5))
X_token_train = cv_token.fit_transform(df_train.tokens)
X_token_test = cv_token.transform(df_test.tokens)

# tfidf values for lemmas (n_grams)
cv_lemma = TfidfVectorizer(stop_words=get_stop_words('de'))
X_lemma_train = cv_lemma.fit_transform(df_train.lemmas)
X_lemma_test = cv_lemma.transform(df_test.lemmas)

# Only count pos tags
cv_pos = CountVectorizer()
X_pos_train = cv_pos.fit_transform(df_train.pos_tags)
X_pos_test = cv_pos.transform(df_test.pos_tags)


X_train = hstack([X_token_train, X_lemma_train, X_pos_train])
X_test = hstack([X_token_test, X_lemma_test, X_pos_test])

In [11]:
#train_lemma_pos = []
#for index, row in df_train.iterrows():
#    sample_lemma_pos = []
#    for lemma, pos in zip(row.tokens, row.pos_tags):
#        sample_lemma_pos.append("_".join((lemma, pos)))
#    train_lemma_pos.append(" ".join(sample_lemma_pos))
#
#test_lemma_pos = []
#for index, row in df_test.iterrows():
#    sample_lemma_pos = []
#    for lemma, pos in zip(row.tokens, row.pos_tags):
#        sample_lemma_pos.append("_".join((lemma, pos)))
#    test_lemma_pos.append(" ".join(sample_lemma_pos))
#
#merged_cv = TfidfVectorizer()
#X_train = merged_cv.fit_transform(train_lemma_pos)
#X_test = merged_cv.transform(test_lemma_pos)

In [12]:
y_train = df_train.genre
y_test = df_test.genre

In [13]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((503, 1493997), (503,), (126, 1493997), (126,))

In [14]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        DRAM       0.00      0.00      0.00         9
        HUMA       0.00      0.00      0.00        18
        LEGA       0.36      0.62      0.45        16
        NARR       0.09      1.00      0.17         7
        NEWS       1.00      0.13      0.23        53
        SCIE       0.00      0.00      0.00        13
        SERM       0.50      0.70      0.58        10

    accuracy                           0.25       126
   macro avg       0.28      0.35      0.21       126
weighted avg       0.51      0.25      0.21       126



  'precision', 'predicted', average, warn_for)


In [15]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

        DRAM       1.00      1.00      1.00         9
        HUMA       0.67      0.44      0.53        18
        LEGA       0.92      0.75      0.83        16
        NARR       0.67      0.86      0.75         7
        NEWS       0.89      0.96      0.93        53
        SCIE       0.60      0.69      0.64        13
        SERM       0.91      1.00      0.95        10

    accuracy                           0.83       126
   macro avg       0.81      0.82      0.80       126
weighted avg       0.83      0.83      0.83       126



In [None]:
from sklearn.model_selection import GridSearchCV

linsvm_params = {
    'C': [0.1, 0.5, 1, 1.5, 2, 3, 4]
}

gridsearch_linsvm = GridSearchCV(
    LinearSVC(),
    cv=5,
    param_grid=linsvm_params,
    scoring='f1_macro',
    n_jobs=-1
)

gridsearch_linsvm.fit(X_train, y_train)
gridsearch_linsvm.best_params_, gridsearch_linsvm.best_score_

In [16]:
from sklearn.svm import LinearSVC

linsvm = LinearSVC()
linsvm.fit(X_train, y_train)
y_pred = linsvm.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        DRAM       1.00      0.89      0.94         9
        HUMA       1.00      0.06      0.11        18
        LEGA       1.00      0.50      0.67        16
        NARR       0.33      0.86      0.48         7
        NEWS       0.68      0.98      0.80        53
        SCIE       0.67      0.15      0.25        13
        SERM       0.82      0.90      0.86        10

    accuracy                           0.68       126
   macro avg       0.78      0.62      0.59       126
weighted avg       0.78      0.68      0.62       126





In [None]:
from sklearn.svm import SVC

rbfsvm = SVC(kernel='poly')
rbfsvm.fit(X_train, y_train)
y_pred = rbfsvm.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [17]:
from sklearn.tree import DecisionTreeClassifier

dectree = DecisionTreeClassifier()
dectree.fit(X_train, y_train)
y_pred = dectree.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        DRAM       1.00      0.89      0.94         9
        HUMA       0.82      0.50      0.62        18
        LEGA       0.93      0.81      0.87        16
        NARR       0.40      0.57      0.47         7
        NEWS       0.90      0.85      0.87        53
        SCIE       0.42      0.77      0.54        13
        SERM       0.78      0.70      0.74        10

    accuracy                           0.76       126
   macro avg       0.75      0.73      0.72       126
weighted avg       0.81      0.76      0.77       126



In [18]:
from sklearn.ensemble import RandomForestClassifier

randforest = RandomForestClassifier()
randforest.fit(X_train, y_train)
y_pred = randforest.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

        DRAM       0.64      0.78      0.70         9
        HUMA       0.86      0.33      0.48        18
        LEGA       0.85      0.69      0.76        16
        NARR       0.50      0.43      0.46         7
        NEWS       0.74      0.98      0.85        53
        SCIE       0.62      0.62      0.62        13
        SERM       0.83      0.50      0.62        10

    accuracy                           0.73       126
   macro avg       0.72      0.62      0.64       126
weighted avg       0.75      0.73      0.71       126



In [36]:
from sklearn.linear_model import SGDClassifier

sgdsvm = SGDClassifier(loss='modified_huber', max_iter=15, random_state=42)

sgdsvm.fit(X_train, y_train)
y_pred = sgdsvm.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        DRAM       0.80      0.89      0.84         9
        HUMA       0.56      0.28      0.37        18
        LEGA       0.71      0.75      0.73        16
        NARR       0.50      0.43      0.46         7
        NEWS       0.87      0.75      0.81        53
        SCIE       0.46      0.92      0.62        13
        SERM       0.75      0.90      0.82        10

    accuracy                           0.71       126
   macro avg       0.66      0.70      0.66       126
weighted avg       0.73      0.71      0.70       126



In [40]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

adaboost = AdaBoostClassifier(
    #base_estimator=SGDClassifier(loss='modified_huber', max_iter=5, random_state=42),
    #algorithm='SAMME.R',
    #n_estimators=100
)

adaboost.fit(X_train, y_train)
y_pred = adaboost.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        DRAM       0.00      0.00      0.00         9
        HUMA       0.00      0.00      0.00        18
        LEGA       0.28      1.00      0.44        16
        NARR       0.14      0.43      0.21         7
        NEWS       0.94      0.83      0.88        53
        SCIE       0.00      0.00      0.00        13
        SERM       0.00      0.00      0.00        10

    accuracy                           0.50       126
   macro avg       0.19      0.32      0.22       126
weighted avg       0.44      0.50      0.44       126



  'precision', 'predicted', average, warn_for)


In [20]:
from xgboost.sklearn import XGBClassifier

grad_boost = XGBClassifier()
grad_boost.fit(X_train, y_train)
y_pred = grad_boost.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

KeyboardInterrupt: 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

grad_boost_sklean = GradientBoostingClassifier()
grad_boost_sklean.fit(X_train, y_train)
y_pred = grad_boost_sklean.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
from keras.preprocessing.text import Tokenizer

tok = Tokenizer(num_words=20000)
tok.fit_on_texts(df_train.text)

Xk_train = tok.texts_to_matrix(df_train.text)
Xk_test = tok.texts_to_matrix(df_test.text)

In [None]:
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
yk_train = le.fit_transform(y_train)
yk_test = le.transform(y_test)

yk_train = to_categorical(yk_train)

from keras.models import Sequential
from keras.layers import Input, Dense, Dropout, Reshape

def build_model(num_words, n_classes, hiddenlayer_size=512, n_hiddenlayer=1):
    model = Sequential()
    model.add(Dense(hiddenlayer_size, input_shape=(num_words, ), activation='relu'))
    model.add(Dropout(0.5))
    for i in range(n_hiddenlayer):
        model.add(Dense(hiddenlayer_size, activation='relu'))
        model.add(Dropout(0.5))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

In [None]:
import numpy as np

model = build_model(20000,
                    len(np.unique(y_train)),
                    n_hiddenlayer=3,
                    hiddenlayer_size=1024)

In [None]:
history = model.fit(Xk_train, yk_train,
                    batch_size=128,
                    validation_split=0.1,
                    epochs=5)

In [None]:
yk_pred = model.predict_classes(X_test)
print(classification_report(yk_test, yk_pred))

In [None]:
le.classes_

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


NUM_WORDS = 5000
MAX_SEQ_LEN = 3000
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(df_train.text)

train_sequences = tokenizer.texts_to_sequences(df_train.text)
test_sequences = tokenizer.texts_to_sequences(df_test.text)

train_sequences = pad_sequences(train_sequences, maxlen=MAX_SEQ_LEN)
test_sequences = pad_sequences(test_sequences, maxlen=MAX_SEQ_LEN)

In [None]:
train_sequences.shape

In [None]:
from flair.embeddings import WordEmbeddings
from flair.data import Sentence
from tqdm import tqdm_notebook

def create_embedding_matrix(sequences, tokenizer: Tokenizer):
    X = []
    embedder = WordEmbeddings('de')
    for sequence in train_sequences:
        text = []
        for entry in sequence:
            text.append(tokenizer.index_word.get(entry, 'UNKOWN'))
        text_mat = []
        print(text[:4])
        flair_data = Sentence(" ".join(text))
        embedder.embed(flair_data)
        for token in flair_data:
            text_mat.append(token.embedding.cpu().detach().numpy())
        X.append(text_mat)
    return X

In [None]:
df_train.iloc[0].text

In [None]:
X_train[0]

In [59]:
from keras.models import Model
from keras.layers import *

def build_multiinput_model(embedding_dim, pos_input_shape, char_input_shape, num_classes):
    
    """
    Conceptional draft
    """
    
    # 1. Input 
    embedding_input = Input(shape=(embedding_dim,))
    emebedding_layer = Embedding(input_dim=embedding_dim, output_dim=100)(embedding_input)
    embedding_conv_dropout = SpatialDropout1D(0.5)
    embedding_conv = Conv1D(filters=128, kernel_size=(5,))(emebedding_layer)
    
    # 2. Input pos 
    pos_input = Input(shape=pos_input_shape)
    pos_dense = Dense(512)(pos_input)
    pos_dropout = Dropout(0.5)(pos_dense)
    
    # 3.Input char
    char_input = Input(shape=char_input_shape)
    char_embedding = Embedding(input_dim=embedding_dim, output_dim=100)(char_input) # meh
    char_conv_dropout = SpatialDropout1D(0.5)
    char_conv = Conv1D(filters=128, kernel_size=(5,))(char_embedding)

    # 3. Concat input the three input layers
    concat_layer = Concatenate()([embedding_conv, char_conv])
    bi_lstm = Bidirectional(LSTM(16, return_sequences=True))(concat_layer)
    flatten_layer = Flatten()(bi_lstm)
    hidden_dense = Dense(512, activation='relu')(flatten_layer)
    outpout_layer = Dense(num_classes, activation='softmax')
    model = Model(inputs=[embedding_input, pos_input, char_input], outpouts=[outpout_layer])
    mode.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

In [60]:
model = build_multiinput_model(300, (5000,), (100,), 10)

ValueError: A `Concatenate` layer requires inputs with matching shapes except for the concat axis. Got inputs shapes: [(None, 296, 128), (None, 96, 128)]