# Phases C & D - Text generation using DL

In [25]:
import pandas as pd

# Dividing to content per class
dataMapPath = 'dataSet/data-mapping.csv'
dataMap = pd.read_csv(dataMapPath)
classContents = {}
dataClasses = pd.unique(dataMap['Class'])
for c in dataClasses:
    df = dataMap.loc[dataMap['Class'] == c]
    classContents[c] = df['Content'][:4]

In [14]:
vocabulary_size = 400
unknown_token = "UNKNOWNTOKEN"
sentence_start_token = "SENTENCESTART"
sentence_end_token = "SENTENCEEND"
new_line_token = "NEWLINE"
separator_token = "SEPARATOR"

In [15]:
from keras.preprocessing.text import text_to_word_sequence

# Preprocessing the different texts
text2word_dict = {}
for cls in classContents:
    text2word_list = list()
    for cont in classContents[cls]:
        text = cont
        text = text.replace('\\', '').replace('\n', ' ' + new_line_token + ' ')
        text = text.replace('--', ' ' + separator_token + ' ')
        end_start_token = sentence_end_token + ' ' + sentence_start_token
        text = text.replace('.', ' ' + end_start_token + ' ')
        text2word = text_to_word_sequence(text, lower=False, )
        text2word_list += text2word
    text2word_dict[cls] = text2word_list

In [16]:
from keras.preprocessing.text import Tokenizer
token = Tokenizer(num_words=vocabulary_size, char_level=False)
mtx_dict = {}
tokens_dict ={}
for c in text2word_dict:
    txt = text2word_dict[c]
    token.fit_on_texts(txt)
    tokens_dict[c] = token
    text_mtx = token.texts_to_matrix(txt, mode='binary')
    mtx_dict[c] = text_mtx

In [17]:
input_output_dict = {}
for c in mtx_dict:
    txt_mtx = mtx_dict[c]
    inpt = txt_mtx[:-1]
    outpt = txt_mtx[1:]
    input_output_dict[c] = (inpt, outpt)

## Creating & training the DL models

In [18]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten
# from keras.layers.wrappers import TimeDistributed
from keras.layers.embeddings import Embedding
# from keras.layers.recurrent import LSTM
from keras.layers.recurrent import SimpleRNN


model_dict = {}
for c in input_output_dict:
    inpt, outpt = input_output_dict[c]
    model = Sequential()
    model.add(Embedding(input_dim=inpt.shape[1],output_dim= 42, input_length=inpt.shape[1]))
    model.add(SimpleRNN(256, activation='relu'))
    model.add(Dense(outpt.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop',metrics=["accuracy"])
    model.fit(inpt, y=outpt, batch_size=300, epochs=10, verbose=1, validation_split=0.2)
    model_dict[c] = model

Train on 1368 samples, validate on 343 samples
Epoch 1/10


 300/1368 [=====>........................] - ETA: 11s - loss: 5.4324 - acc: 0.0000e+00










Epoch 2/10


 300/1368 [=====>........................] - ETA: 10s - loss: 13.3243 - acc: 0.0533










Epoch 3/10


 300/1368 [=====>........................] - ETA: 10s - loss: 13.1094 - acc: 0.0600










Epoch 4/10


 300/1368 [=====>........................] - ETA: 10s - loss: 13.2706 - acc: 0.0533










Epoch 5/10


 300/1368 [=====>........................] - ETA: 10s - loss: 13.0557 - acc: 0.0500










Epoch 6/10


 300/1368 [=====>........................] - ETA: 10s - loss: 14.0765 - acc: 0.0200










Epoch 7/10


 300/1368 [=====>........................] - ETA: 10s - loss: 13.2706 - acc: 0.0533










Epoch 8/10


 300/1368 [=====>........................] - ETA: 11s - loss: 13.7004 - acc: 0.0367










Epoch 9/10


 300/1368 [=====>........................] - ETA: 10s - loss: 13.8616 - acc: 0.0300










Epoch 10/10


 300/1368 [=====>........................] - ETA: 10s - loss: 13.6467 - acc: 0.0467












Train on 1402 samples, validate on 351 samples
Epoch 1/10


 300/1402 [=====>........................] - ETA: 11s - loss: 4.4325 - acc: 0.0000e+00










Epoch 2/10


 300/1402 [=====>........................] - ETA: 11s - loss: 12.0348 - acc: 0.0333










Epoch 3/10


 300/1402 [=====>........................] - ETA: 11s - loss: 12.0886 - acc: 0.0367










Epoch 4/10


 300/1402 [=====>........................] - ETA: 11s - loss: 11.6050 - acc: 0.0467










Epoch 5/10


 300/1402 [=====>........................] - ETA: 11s - loss: 11.8199 - acc: 0.0433










Epoch 6/10


 300/1402 [=====>........................] - ETA: 11s - loss: 11.6588 - acc: 0.0667










Epoch 7/10


 300/1402 [=====>........................] - ETA: 11s - loss: 11.7662 - acc: 0.0233










Epoch 8/10


 300/1402 [=====>........................] - ETA: 11s - loss: 11.6588 - acc: 0.0267










Epoch 9/10


 300/1402 [=====>........................] - ETA: 11s - loss: 12.0348 - acc: 0.0300










Epoch 10/10


 300/1402 [=====>........................] - ETA: 11s - loss: 11.9811 - acc: 0.0333












Train on 947 samples, validate on 237 samples
Epoch 1/10











Epoch 2/10











Epoch 3/10











Epoch 4/10











Epoch 5/10











Epoch 6/10











Epoch 7/10











Epoch 8/10











Epoch 9/10











Epoch 10/10













Train on 1660 samples, validate on 416 samples
Epoch 1/10


 300/1660 [====>.........................] - ETA: 14s - loss: 4.3736 - acc: 0.0133












Epoch 2/10


 300/1660 [====>.........................] - ETA: 13s - loss: 12.3572 - acc: 0.0067












Epoch 3/10


 300/1660 [====>.........................] - ETA: 13s - loss: 11.7662 - acc: 0.0067












Epoch 4/10


 300/1660 [====>.........................] - ETA: 13s - loss: 11.6588 - acc: 0.0067












Epoch 5/10


 300/1660 [====>.........................] - ETA: 13s - loss: 12.1960 - acc: 0.0033












Epoch 6/10


 300/1660 [====>.........................] - ETA: 13s - loss: 12.1423 - acc: 0.0000e+00












Epoch 7/10


 300/1660 [====>.........................] - ETA: 13s - loss: 12.0886 - acc: 0.0000e+00












Epoch 8/10


 300/1660 [====>.........................] - ETA: 13s - loss: 12.0348 - acc: 0.0000e+00












Epoch 9/10


 300/1660 [====>.........................] - ETA: 13s - loss: 11.4976 - acc: 0.0033












Epoch 10/10


 300/1660 [====>.........................] - ETA: 13s - loss: 11.8199 - acc: 0.0033














Train on 1132 samples, validate on 284 samples
Epoch 1/10











Epoch 2/10











Epoch 3/10











Epoch 4/10











Epoch 5/10











Epoch 6/10











Epoch 7/10











Epoch 8/10











Epoch 9/10











Epoch 10/10













## Using the trained DL models to generate new text sequences

In [19]:
import numpy as np


def get_next_word(text, token, model, fullmtx, fulltext):
    tmp = text_to_word_sequence(text, lower=False, split=" ")
    tmp = token.texts_to_matrix(tmp, mode='binary')
    p = model.predict(tmp)
    bestmatch = np.min(np.argmax(p))
    candidates = np.where(fullmtx[:,bestmatch]>0)
    try:
        next_idx = np.min(candidates)
        return fulltext[next_idx]
    except ValueError:
        return unknown_token

In [20]:
gen_txt_dict = {}
for c in dataClasses:
    mtx = mtx_dict[c]
    m = model_dict[c]
    fulltxt = text2word_dict[c]
    listoftext = np.unique(fulltxt)
    token = tokens_dict[c]
    seq_txt = ''
    gen_txt = ''
    last_gen = ''
    for txt in listoftext:
        gen_txt = get_next_word(txt, token, m, mtx, fulltxt)
        if last_gen != '':
            fulltxt.append(last_gen)
        if gen_txt != unknown_token:
            last_gen = gen_txt
            fulltxt.remove(last_gen)
            seq_txt += ' ' + last_gen
    gen_txt_dict[c] = seq_txt

In [21]:
for c in gen_txt_dict:
    if gen_txt_dict[c] != '':
        print("Subject: %s" % c)
        print(gen_txt_dict[c])
        print()

Subject: business
 SENTENCESTART The trio are chief executive Willie Walsh chief financial officer Brian Dunne and chief operations officer Seamus Kearney SENTENCEEND SENTENCESTART The three have refused to confirm reports they plan to launch a private airline in competition with Aer Lingus SENTENCEEND SENTENCESTART They announced in November they would quit in May but did not give a reason SENTENCEEND SENTENCESTART That decision had followed an announcement by Irish Prime Minister Bertie Ahern who is still considering the future of the airline which ruled out a proposed management buy out of Aer Lingus SENTENCEEND SENTENCESTART Mr Walsh denied they had been forced out early because of the reports claiming they were set to launch a competitor airline SENTENCEEND SENTENCESTART What I do after I leave Aer Lingus is still too early to say Mr Walsh told AP news agency on Wednesday SENTENCEEND SENTENCESTART I have opportunities open to me SENTENCEEND SENTENCESTART Brian and Seamus are in th

## Using the 'Naive Bayes' classifier on the generated sequences

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

X = dataMap['Content']
y = dataMap['Class_Num']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
vect = CountVectorizer(stop_words='english')
X_train_dtm = vect.fit_transform(X_train)

In [23]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [24]:
for c in gen_txt_dict:
    if gen_txt_dict[c] != '':
        txt_dtm = vect.transform([gen_txt_dict[c]])
        pred_class_num = nb.predict(txt_dtm)
        df = dataMap.loc[dataMap['Class_Num'] == pred_class_num[0],'Class']
        pred_class = np.unique(df)
        print("Generated text Predicted class: %s" % pred_class[0])
        print("Generated text actual class: %s" % c)
        print(c == pred_class[0])
        print()

Generated text Predicted class: business
Generated text actual class: business
True

Generated text Predicted class: tech
Generated text actual class: tech
True

Generated text Predicted class: sport
Generated text actual class: sport
True

Generated text Predicted class: politics
Generated text actual class: politics
True

Generated text Predicted class: entertainment
Generated text actual class: entertainment
True

