In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive; the cells
# below read their input from and write their artifacts to paths under it.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam,RMSprop
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import load_model
import numpy as np 
import numpy as np 
import os
import pandas as pd

import re
from nltk import tokenize
import keras.utils as ku
from sklearn.model_selection import train_test_split
import pickle

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

from tensorflow.keras import mixed_precision
# Install 'mixed_float16' as the global Keras dtype policy.
# NOTE(review): presumably this must run before any layer is created for the
# policy to apply to the model built further down -- confirm.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
#import data
# Load the pickled speeches DataFrame from the mounted Google Drive.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a
# trusted source.
text_df = pd.read_pickle('/content/gdrive/MyDrive/files/df_parteien.pkl')

In [None]:
#choose party
# Name of the party to train on. It is used as a column name when filtering
# text_df and as part of the file names of the saved model and tokenizer.
partei = 'SPD'

In [None]:
#select only speeches from party
# Order the rows by the 'Sitzung' column, then keep the rows whose value in the
# chosen party's column equals 1.
text_df = (
    text_df
    .sort_values(by='Sitzung')
    .loc[lambda frame: frame[partei] == 1]
)

In [None]:
# Ordered (pattern, replacement) rules applied by clean_text. The order matters:
# later rules rely on the output of earlier ones (e.g. line breaks are turned
# into spaces before the space-collapsing rules run).
_CLEAN_TEXT_RULES = (
    # --- undo hyphenation / line breaks -------------------------------------
    (r'\-\\n', ''),                       # hyphen + literal backslash-n sequence
    (r'\-\n', ''),                        # hyphen + real line break
    (r'\n', ' '),                         # remaining line breaks become spaces
    (r'([a-z])\-  ([a-z])', r'\1\2'),     # word split by "-" and two spaces
    # --- drop salutations and other boilerplate -----------------------------
    (r'(Herr|Frau)\n?\s?Präsidenti?n?[\!\.\s\,]', ''),
    (r'Sehr geehrter?[\!\.\s\,]', ''),
    # "Meine" is an optional *word* here. The former character class
    # "[Meine ]+" swallowed trailing letters of the preceding word
    # ("Liebe Damen ..." -> "Lieb ...") and could not match at the very start
    # of the text.
    (r'(?:Meine\s+)?Damen\s?\n?und\s?\n?Herren[\!\.\s\,]', ''),
    (r'Kollege \w+', ''),
    (r'Vielen Dank\,', ''),
    # --- normalise punctuation ----------------------------------------------
    (r'[\!\:\?\;]', '.'),                 # every sentence-like delimiter -> "."
    (r'[\-\–\(\)\[\]]', ''),              # dashes and brackets
    (r'\\.', '.'),                        # backslash + any character -> "."
    (r'\.\s\s', '.'),
    (r'\\:', ''),                         # no-op by now: ":" was rewritten above
    (r'[\*\…\'\"\“\”\„\ʼ\’]', ' '),       # quotes, asterisks, ellipsis
    (r'\d+', ' '),                        # numbers
    # --- tidy whitespace ----------------------------------------------------
    # Collapse runs of spaces of any length. The former fixed "3 spaces" and
    # "2 spaces" replacements left double spaces behind for runs of 5 or more.
    (r' {2,}', ' '),
    (r'\s\.', '.'),
    (r'\s\,', ','),
    # Re-insert the space after a full stop that is directly followed by a
    # letter. "[^\W\d_]" matches any Unicode letter; the former "[A-z]" also
    # matched "[", "\", "]", "^", "_" and "`" and missed umlauts.
    (r'(\.)([^\W\d_])', r'\1 \2'),
)

# DOTALL lets the "." in the backslash rule match a newline as well.
_CLEAN_TEXT_FLAGS = re.DOTALL | re.MULTILINE


def clean_text(doc):
    """Normalise one raw speech into plain, sentence-delimited text.

    The rules in ``_CLEAN_TEXT_RULES`` are applied in order. They undo
    hyphenation at line breaks, remove salutations and other boilerplate
    phrases, map ``! : ? ;`` to ``.``, drop brackets, dashes, quotes and
    numbers, and tidy the spacing around punctuation.

    Args:
        doc: The raw speech text (``str``).

    Returns:
        The cleaned text as a ``str``.
    """
    for pattern, replacement in _CLEAN_TEXT_RULES:
        # Stripping after every rule keeps the original behaviour for
        # boilerplate phrases at the very start or end of the text.
        doc = re.sub(pattern, replacement, doc, flags=_CLEAN_TEXT_FLAGS).strip()
    return doc
    
# Add the cleaned speech text as a new column. assign() returns a new frame, so
# this does not write into the filtered selection produced by the party filter
# (the former ".loc[:, 'clean_text'] = ..." assignment could trigger pandas'
# SettingWithCopyWarning there).
text_df = text_df.assign(clean_text=text_df['Reden_clean'].apply(clean_text))

# Work on a 10 % sample of the speeches: with test_size=0.9 the remaining 90 %
# end up in features_test. random_state fixes the sample.
df_small, features_test = train_test_split(text_df, test_size=0.9, random_state=42)

In [None]:
#divide corpus into sentences
from nltk.tokenize import sent_tokenize

# All cleaned speeches of the sample, one string per speech.
corpus_1 = list(df_small.loc[:,'clean_text'])

# Lower-cased sentences of all speeches. Only sentences longer than 4 and
# shorter than 500 characters are kept.
corpus = [
    sen.lower()
    for text in corpus_1
    # The speeches are German, so use the German Punkt model instead of the
    # English default, which does not know German abbreviations.
    for sen in sent_tokenize(text, language='german')
    if 4 < len(sen) < 500
]

In [None]:
#tokenize words

# The filter list names the characters the tokenizer strips; '.' and ',' are
# not in it.
tokenizer = Tokenizer(
    filters='!"#$%&()*+-/:;<=>?@[\\]^_`{|}~\t\n',
)
tokenizer.fit_on_texts(corpus)

# Vocabulary size plus one extra slot.
total_words = 1 + len(tokenizer.word_index)

# Show the word -> index mapping, then the vocabulary size on its own line.
print(tokenizer.word_index, total_words, sep='\n')

In [None]:
#text to sequences
# Encode every sentence once, then emit each of its prefixes that contains at
# least two tokens.
encoded_sentences = tokenizer.texts_to_sequences(corpus)
input_sequences = [
    encoded[:end]
    for encoded in encoded_sentences
    for end in range(2, len(encoded) + 1)
]

# pad sequences
# Left-pad every prefix to the length of the longest one.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# create predictors and label
# The last token of each row is the label, everything before it the predictor.
x = input_sequences[:, :-1]
labels = input_sequences[:, -1]

y = tf.keras.utils.to_categorical(labels, num_classes=total_words)



In [None]:
#train model
# Next-word model: embedding -> two bidirectional LSTMs -> two LSTMs ->
# dense ReLU layer -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(total_words, 750, input_length=max_sequence_len-1))
model.add(Dropout(rate=0.3))
model.add(Bidirectional(LSTM(375, return_sequences = True)))
model.add(Dropout(rate=0.3))
model.add(Bidirectional(LSTM(225, return_sequences = True)))
model.add(LSTM(225, return_sequences = True))
model.add(LSTM(225))
model.add(Dropout(rate=0.3))
model.add(Dense(375, activation='relu'))
# The notebook runs under the global 'mixed_float16' policy. The output softmax
# is pinned to float32 so that probabilities and the loss are not computed in
# half precision, as the TensorFlow mixed-precision guide recommends.
model.add(Dense(total_words, activation='softmax', dtype='float32'))
adam = Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['Precision', 'Recall'])

# summary() prints by itself and returns None; wrapping it in print() added a
# stray "None" line.
model.summary()

# Stop once the training loss has not improved for 50 epochs.
early = EarlyStopping(monitor='loss', patience=50)
# Write the model to Drive during training.
# NOTE(review): 'monitor' presumably only influences saving together with
# save_best_only=True -- confirm whether "best" or "latest" is wanted here.
checkpoint = ModelCheckpoint(monitor='loss',filepath='/content/gdrive/MyDrive/files/'+partei+'.h5')
# 'early' was created but never handed to fit(); it is now active.
history = model.fit(x, y, epochs=650, batch_size=2048, callbacks=[checkpoint, early], verbose=1)


In [None]:
#open model
# Load the checkpoint with the same Keras implementation the model was built
# with (tensorflow.keras). The former import came from the standalone `keras`
# package and shadowed the tensorflow.keras loader imported at the top.
from tensorflow.keras.models import load_model
model = load_model('/content/gdrive/MyDrive/files/'+partei+'.h5')

In [None]:
#save model
# Serialise the in-memory model object with pickle next to the .h5 checkpoint.
model_pickle_path = '/content/gdrive/MyDrive/files/'+partei+'1.pkl'
with open(model_pickle_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
#save tokenizer and max_sequence_len_
# Each (target path, object) pair is pickled into its own file on Drive.
artifacts = (
    ('/content/gdrive/MyDrive/files/tokenizer_'+partei, tokenizer),
    ('/content/gdrive/MyDrive/files/max_sequence_len_'+partei, max_sequence_len),
)
for target_path, payload in artifacts:
    with open(target_path, 'wb') as f:
        pickle.dump(payload, f)

In [None]:
#test result
# Greedy generation: repeatedly feed the text produced so far into the model
# and append the most probable next word.
seed_text = ""
next_words = 60

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # predict() returns one probability row for the single input sequence. Take
    # its argmax as a plain int instead of comparing against a length-1 array.
    # verbose=0 suppresses the per-call progress bar.
    predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
    # Direct index -> word lookup replaces the linear scan over word_index.
    # An index without a word (e.g. 0, which the tokenizer never assigns)
    # yields an empty string, as the scan did.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word
display(seed_text)