# Notebook for LSTM - Model Testing

See the "RNN all data" notebook for the original experiments.

This notebook contains similar tests, but uses the "vector_all_data_lemma_300d-3-5.tsv" embedding matrix.


In [None]:
# imports
import csv
import numpy as np
import pandas as pd
import import_ipynb
import spacy
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Load the spaCy pipeline "de_core_news_lg" once for the whole notebook;
# the components named in `exclude` are not loaded.
nlp = spacy.load(
    "de_core_news_lg", exclude=["tok2vec", "ner", "parser", "attribute_ruler"]
)
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import make_sampling_table, pad_sequences
from tensorflow.keras import Model, Sequential, Input
from tensorflow.keras.layers import (
    Dot,
    Embedding,
    Flatten,
    Dense,
    GlobalAveragePooling1D,
    LSTM,
    concatenate,
    Dropout,
    Bidirectional,
)
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt

In [None]:
#data
def _load_party_tweets(party_file, party_id):
    """Read the 'text' column of one party's cleaned CSV and label every row.

    Args:
        party_file: File stem under ../cleaned-data/ (e.g. 'AfD').
        party_id: Integer class id assigned to every tweet in that file.

    Returns:
        DataFrame with columns 'tweet' and 'party'.
    """
    tweets = pd.read_csv('../cleaned-data/' + party_file + '.csv', quoting=csv.QUOTE_NONE)['text']
    return pd.DataFrame({'tweet': tweets.to_numpy(), 'party': party_id}, columns=['tweet', 'party'])


# Class ids: 0 AfD, 1 Union (CDU and CSU share one id), 2 FDP, 3 GRÜNE, 4 LINKE, 5 SPD
afd = _load_party_tweets('AfD', 0)
cdu = _load_party_tweets('CDU', 1)
csu = _load_party_tweets('CSU', 1)
fdp = _load_party_tweets('FDP', 2)
gru = _load_party_tweets('GRÜNE', 3)
lin = _load_party_tweets('LINKE', 4)
spd = _load_party_tweets('SPD', 5)

# DataFrame.append was removed in pandas 2.0; concatenate all frames in one call instead.
data = pd.concat([afd, cdu, csu, fdp, gru, lin, spd], ignore_index=True)
data = data.dropna()

In [None]:
# loading vocab and matrix
words = []
em_matrix = np.genfromtxt(
    fname="../word_embedding/embeddings/vector_all_data_lemma_300d-3-5.tsv",
    delimiter="\t",
)

# Build the lookup from CSV column 0 (key) to CSV column 2 (integer value).
vocab = {}
with open("../vocab/all_lemma_vocab_token.csv", mode='r') as infile:
    reader = csv.reader(infile)
    next(reader)  # skip header
    for row in reader:
        vocab[row[0]] = int(row[2])

# adding unknown-token for new words
# NOTE(review): the UNK id is len(vocab); confirm em_matrix has a row for this id.
vocab['UNK'] = len(vocab)

In [None]:
# transforming data
# 1.) lemmatizing
# 2.) padding to length 50
def _tweet_to_ids(tweet):
    """Run spaCy on one tweet and map each token lemma to its vocab id.

    Lemmas that are not in `vocab` are mapped to the 'UNK' id.
    """
    unk_id = vocab['UNK']
    return [vocab.get(token.lemma_, unk_id) for token in nlp(str(tweet))]


# Pass the callable itself. Wrapping it in a list makes Series.apply return a
# one-column DataFrame instead of a Series, which is not what is wanted here.
data['vectors'] = data['tweet'].apply(_tweet_to_ids)
data['vectors'] = data['vectors'].apply(
    lambda ids: pad_sequences([ids], maxlen=50, dtype=int, padding='post', value=0)[-1]
)
# Measured after padding, so this is the padded length of each row.
data['len'] = data['vectors'].apply(len)
data_len = data['len'].max()
data

In [None]:
# preparing label: transforming integer to vector
# Cast to int explicitly so one-hot encoding receives integer class ids even if the
# 'party' column is stored with object dtype. num_classes is pinned to the six
# parties so y always has six columns, matching the 6-unit output layer.
y = tf.keras.utils.to_categorical(data['party'].to_numpy(dtype=int), num_classes=6)
# preparing data (dataframe to np.array)
X = np.array(data['vectors'].tolist())
X

In [None]:
# splitting into train/test/validation
# 70% train; the remaining 30% is split again into 60% test and 40% validation
# (18% and 12% of all rows respectively).
SPLIT_SEED = 42  # fixed seed so the same split is produced after a kernel restart
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.4, random_state=SPLIT_SEED
)

In [None]:
# Deep learning model 1 (see RNN_all_data notebook)
class RnnModel():
    """Stacked bidirectional LSTM classifier with a 6-way softmax output.

    The compiled Keras model is available as ``self.model``.
    """

    def __init__(self, embedding_matrix, embedding_dim, max_len):
        """Build and compile the network.

        Args:
            embedding_matrix: Initial weights for the Embedding layer; its
                first dimension is used as the layer's input dimension.
            embedding_dim: Output dimension of the Embedding layer.
            max_len: Length of each input sequence of token ids.
        """
        inp1 = Input(shape=(max_len,))
        x = Embedding(embedding_matrix.shape[0], embedding_dim, weights=[embedding_matrix])(inp1)
        x = Bidirectional(LSTM(128, return_sequences=True))(x)
        x = Bidirectional(LSTM(64))(x)
        x = Dense(128, activation="relu")(x)
        x = Dropout(0.1)(x)
        x = Dense(64, activation="relu")(x)
        x = Dense(6, activation="softmax")(x)
        model = Model(inputs=inp1, outputs=x)
        # A softmax over six mutually exclusive classes trained on one-hot targets
        # needs categorical cross-entropy; binary cross-entropy would treat each
        # output unit as an independent yes/no problem.
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])
        self.model = model


In [None]:
# Deep learning model 2 (see RNN_all_data notebook)
class RnnModel2():
    """Stacked bidirectional LSTM classifier (256/128 units) with a 6-way softmax output.

    The compiled Keras model is available as ``self.model``.
    """

    def __init__(self, embedding_matrix, embedding_dim, max_len):
        """Build and compile the network.

        Args:
            embedding_matrix: Initial weights for the Embedding layer; its
                first dimension is used as the layer's input dimension.
            embedding_dim: Output dimension of the Embedding layer.
            max_len: Length of each input sequence of token ids.
        """
        inp1 = Input(shape=(max_len,))
        x = Embedding(embedding_matrix.shape[0], embedding_dim, weights=[embedding_matrix])(inp1)
        x = Bidirectional(LSTM(256, return_sequences=True))(x)
        x = Bidirectional(LSTM(128))(x)
        x = Dense(256, activation="relu")(x)
        x = Dropout(0.1)(x)
        x = Dense(64, activation="relu")(x)
        x = Dense(6, activation="softmax")(x)
        model = Model(inputs=inp1, outputs=x)
        # A softmax over six mutually exclusive classes trained on one-hot targets
        # needs categorical cross-entropy; binary cross-entropy would treat each
        # output unit as an independent yes/no problem.
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])
        self.model = model

In [None]:
# test 1 for lemma matrix
# Train RnnModel2 on the lemma embedding matrix (300-d vectors, sequences of 50 ids).
m2 = RnnModel2(em_matrix, 300, 50)
h2 = m2.model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=5,
    verbose=1,
)

In [None]:
# visualizing test 1
class_ids = list(range(6))
party_names = ["AfD", "Union", "FDP", "Grüne", "Linke", "SPD"]
fig, axes = plt.subplots(1, 2, figsize=(16,7))

# Block to evaluate test data (left panel)
yhat_test = np.argmax(m2.model.predict(X_test), axis=1)
y_label_test = np.argmax(y_test, axis=1)

# labels= keeps the matrix 6x6 even if a class is missing from both the true and
# predicted labels, so it always fits the six row/column names.
mat = confusion_matrix(y_label_test, yhat_test, labels=class_ids)
df = pd.DataFrame(mat, index=party_names, columns=party_names)
sn.heatmap(df, annot=True, cmap='Blues', fmt='g', ax=axes[0]).set_title('Test Data');

# Block to evaluate training data (right panel)
yhat_train = np.argmax(m2.model.predict(X_train), axis=1)
y_label_train = np.argmax(y_train, axis=1)

mat = confusion_matrix(y_label_train, yhat_train, labels=class_ids)
df = pd.DataFrame(mat, index=party_names, columns=party_names)
sn.heatmap(df, annot=True, cmap='Blues', fmt='g', ax=axes[1]).set_title('Train Data');
plt.show()

# Number of samples per class in each split, then overall accuracy per split.
print('Party: \t Test \t Train')
for class_id, row_name in zip(class_ids, ["Afd", "Union", "FDP", "Grüne", "Linke", "SPD"]):
    print(row_name + ':\t', np.count_nonzero(y_label_test == class_id), "\t", np.count_nonzero(y_label_train == class_id))
print('\nAcc:\t', "{:2.2f}%".format(accuracy_score(y_label_test,yhat_test)*100), "{:2.2f}%".format(accuracy_score(y_label_train,yhat_train)*100))

In [None]:
# Deep learning model 4 (see RNN_all_data notebook)
class RnnModel4():
    """Stacked bidirectional LSTM classifier (128/64 units, three dense layers) with a 6-way softmax output.

    The compiled Keras model is available as ``self.model``.
    """

    def __init__(self, embedding_matrix, embedding_dim, max_len):
        """Build and compile the network.

        Args:
            embedding_matrix: Initial weights for the Embedding layer; its
                first dimension is used as the layer's input dimension.
            embedding_dim: Output dimension of the Embedding layer.
            max_len: Length of each input sequence of token ids.
        """
        inp1 = Input(shape=(max_len,))
        x = Embedding(embedding_matrix.shape[0], embedding_dim, weights=[embedding_matrix])(inp1)
        x = Bidirectional(LSTM(128, return_sequences=True))(x)
        x = Bidirectional(LSTM(64))(x)
        x = Dense(128, activation="relu")(x)
        x = Dropout(0.1)(x)
        x = Dense(128, activation="relu")(x)
        x = Dense(64, activation="relu")(x)
        x = Dense(6, activation="softmax")(x)
        model = Model(inputs=inp1, outputs=x)
        # A softmax over six mutually exclusive classes trained on one-hot targets
        # needs categorical cross-entropy; binary cross-entropy would treat each
        # output unit as an independent yes/no problem.
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])
        self.model = model

In [None]:
# test 2 for lemma matrix
# Train RnnModel4 on the lemma embedding matrix (300-d vectors, sequences of 50 ids).
m5 = RnnModel4(em_matrix, 300, 50)
h5 = m5.model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=5,
    verbose=1,
)

In [None]:
# visualizing test 2
class_ids = list(range(6))
party_names = ["AfD", "Union", "FDP", "Grüne", "Linke", "SPD"]
fig, axes = plt.subplots(1, 2, figsize=(16,7))

# Block to evaluate test data (left panel)
yhat_test = np.argmax(m5.model.predict(X_test), axis=1)
y_label_test = np.argmax(y_test, axis=1)

# labels= keeps the matrix 6x6 even if a class is missing from both the true and
# predicted labels, so it always fits the six row/column names.
mat = confusion_matrix(y_label_test, yhat_test, labels=class_ids)
df = pd.DataFrame(mat, index=party_names, columns=party_names)
sn.heatmap(df, annot=True, cmap='Blues', fmt='g', ax=axes[0]).set_title('Test Data');

# Block to evaluate training data (right panel)
yhat_train = np.argmax(m5.model.predict(X_train), axis=1)
y_label_train = np.argmax(y_train, axis=1)

mat = confusion_matrix(y_label_train, yhat_train, labels=class_ids)
df = pd.DataFrame(mat, index=party_names, columns=party_names)
sn.heatmap(df, annot=True, cmap='Blues', fmt='g', ax=axes[1]).set_title('Train Data');
plt.show()

# Number of samples per class in each split, then overall accuracy per split.
print('Party: \t Test \t Train')
for class_id, row_name in zip(class_ids, ["Afd", "Union", "FDP", "Grüne", "Linke", "SPD"]):
    print(row_name + ':\t', np.count_nonzero(y_label_test == class_id), "\t", np.count_nonzero(y_label_train == class_id))
print('\nAcc:\t', "{:2.2f}%".format(accuracy_score(y_label_test,yhat_test)*100), "{:2.2f}%".format(accuracy_score(y_label_train,yhat_train)*100))

In [None]:
# Deep learning model 3 (see RNN_all_data notebook)
# NOTE(review): the comment says "model 3" but the class is named RnnModel5 - confirm which is intended.
class RnnModel5():
    """Stacked bidirectional LSTM classifier (512/256 units, three dense layers) with a 6-way softmax output.

    The compiled Keras model is available as ``self.model``.
    """

    def __init__(self, embedding_matrix, embedding_dim, max_len):
        """Build and compile the network.

        Args:
            embedding_matrix: Initial weights for the Embedding layer; its
                first dimension is used as the layer's input dimension.
            embedding_dim: Output dimension of the Embedding layer.
            max_len: Length of each input sequence of token ids.
        """
        inp1 = Input(shape=(max_len,))
        x = Embedding(embedding_matrix.shape[0], embedding_dim, weights=[embedding_matrix])(inp1)
        x = Bidirectional(LSTM(512, return_sequences=True))(x)
        x = Bidirectional(LSTM(256))(x)
        x = Dense(256, activation="relu")(x)
        x = Dropout(0.1)(x)
        x = Dense(256, activation="relu")(x)
        x = Dense(64, activation="relu")(x)
        x = Dense(6, activation="softmax")(x)
        model = Model(inputs=inp1, outputs=x)
        # A softmax over six mutually exclusive classes trained on one-hot targets
        # needs categorical cross-entropy; binary cross-entropy would treat each
        # output unit as an independent yes/no problem.
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])
        self.model = model

In [None]:
# test 3 for lemma matrix
# Train RnnModel5 on the lemma embedding matrix (300-d vectors, sequences of 50 ids).
m6 = RnnModel5(em_matrix, 300, 50)
h6 = m6.model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=5,
    verbose=1,
)

In [None]:
# visualizing test 3
class_ids = list(range(6))
party_names = ["AfD", "Union", "FDP", "Grüne", "Linke", "SPD"]
fig, axes = plt.subplots(1, 2, figsize=(16,7))

# Block to evaluate test data (left panel)
yhat_test = np.argmax(m6.model.predict(X_test), axis=1)
y_label_test = np.argmax(y_test, axis=1)

# labels= keeps the matrix 6x6 even if a class is missing from both the true and
# predicted labels, so it always fits the six row/column names.
mat = confusion_matrix(y_label_test, yhat_test, labels=class_ids)
df = pd.DataFrame(mat, index=party_names, columns=party_names)
sn.heatmap(df, annot=True, cmap='Blues', fmt='g', ax=axes[0]).set_title('Test Data');

# Block to evaluate training data (right panel)
yhat_train = np.argmax(m6.model.predict(X_train), axis=1)
y_label_train = np.argmax(y_train, axis=1)

mat = confusion_matrix(y_label_train, yhat_train, labels=class_ids)
df = pd.DataFrame(mat, index=party_names, columns=party_names)
sn.heatmap(df, annot=True, cmap='Blues', fmt='g', ax=axes[1]).set_title('Train Data');
plt.show()

# Number of samples per class in each split, then overall accuracy per split.
print('Party: \t Test \t Train')
for class_id, row_name in zip(class_ids, ["Afd", "Union", "FDP", "Grüne", "Linke", "SPD"]):
    print(row_name + ':\t', np.count_nonzero(y_label_test == class_id), "\t", np.count_nonzero(y_label_train == class_id))
print('\nAcc:\t', "{:2.2f}%".format(accuracy_score(y_label_test,yhat_test)*100), "{:2.2f}%".format(accuracy_score(y_label_train,yhat_train)*100))

In [None]:
# export test 1
# The folder name encodes model tag, epoch count and the last-epoch train/validation accuracy in percent.
train_acc_pct = h2.history["accuracy"][-1] * 100
val_acc_pct = h2.history["val_accuracy"][-1] * 100
S = "lemma_{mod}_ep{epo}_acc{acc}_valacc{valacc}".format(
    mod='Rnn2',
    epo=5,
    acc="{:2.2f}".format(train_acc_pct),
    valacc="{:2.2f}".format(val_acc_pct),
)
m2.model.save_weights('models/' + S + '/model_weights')

In [None]:
# export test 2
# The folder name encodes model tag, epoch count and the last-epoch train/validation accuracy in percent.
train_acc_pct = h5.history["accuracy"][-1] * 100
val_acc_pct = h5.history["val_accuracy"][-1] * 100
S = "lemma_{mod}_ep{epo}_acc{acc}_valacc{valacc}".format(
    mod='Rnn4',
    epo=5,
    acc="{:2.2f}".format(train_acc_pct),
    valacc="{:2.2f}".format(val_acc_pct),
)
m5.model.save_weights('models/' + S + '/model_weights')

In [None]:
# export test 3
# The folder name encodes model tag, epoch count and the last-epoch train/validation accuracy in percent.
train_acc_pct = h6.history["accuracy"][-1] * 100
val_acc_pct = h6.history["val_accuracy"][-1] * 100
S = "lemma_{mod}_ep{epo}_acc{acc}_valacc{valacc}".format(
    mod='Rnn5',
    epo=5,
    acc="{:2.2f}".format(train_acc_pct),
    valacc="{:2.2f}".format(val_acc_pct),
)
m6.model.save_weights('models/' + S + '/model_weights')