# Part 2 Text generation language model

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.externals import joblib


from time import time 
 
import pandas as pd
import numpy as np
from collections import Counter
import time
import string
import re
from os import listdir
from numpy import array
from numpy import asarray
from numpy import zeros

import gensim
from gensim.models import Word2Vec 
import random

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords


import spacy
import talos
import pickle

from sklearn.pipeline import Pipeline
from random import randint
from pickle import load


import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import pyplot


from sklearn  import preprocessing
from sklearn.model_selection   import cross_val_score, GridSearchCV, cross_validate, RandomizedSearchCV
from sklearn.metrics           import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing     import LabelEncoder



from tensorflow import keras
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding, LSTM, Bidirectional
from keras.layers.convolutional import Conv1D, MaxPooling1D

The second part of this work is to train a text generation language model, where a model trained on the news article titles of “sports” topic

Training dataset loading, assigning new column names

In [3]:
df = pd.read_csv('train.csv', header=None, na_values=None, names = ['label', 'article_title', 'article_text'])

Mapping label numbers to the news topic category

In [4]:
df['category_name']=df['label'].map({1:'world', 2:'sports',3:'business', 4:'sci/tech' })

Extracting rows with the “sports” category from the training dataset.

In [5]:
sport_training = df[df['category_name'] == 'sports'].reset_index(drop=True)

Next, extracted data has been lowercased and cleaned from punctuations, non-alphabetical words and URLs with clean_doc2 function and stored to the "sport_clean_data" list

In [6]:
def remove_urls (txt):
    txt = re.sub(r'http\S+', '', txt)
    txt = re.sub(r'target\S+', '', txt)
    txt = re.sub(r'qtype\S+', '', txt)
    txt = re.sub(r'qcat\S+', '', txt)
    txt = re.sub(r'&lt;\S+', '', txt)
    return(txt)

In [7]:
def clean_doc2(doc):
    # split into tokens by white space
    tokens = doc.split()
    tokens = [word.lower() for word in tokens]
    tokens = [remove_urls(word) for word in tokens]
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    return tokens

In [8]:
sport_clean_data = []
for i in range(0, len(sport_training)):
    clean_t=clean_doc2(sport_training['article_title'][i])
    sport_clean_data.append(clean_t)

Then, cleaned text has been tokenized with 60,651 total and 9,580 unique tokens

In [9]:
sport_tokens=[]
for w in sport_clean_data:
    for l in w:
        sport_tokens.append(l)

In [10]:
print(f'Total Tokens: {len(sport_tokens)}')
print(f'Unique Tokens: {len(set(sport_tokens))}')

Total Tokens: 60462
Unique Tokens: 9579


Finally, tokenized text has been organized into sequences with 51 words each, where first 50 words processed as predictors and the last token as label.

In [11]:
# function to load text data
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text.split('\n')

In [6]:
#loading sequences
#sequences = load_doc('sequences.txt')

In [11]:
length = 50 + 1
sequences = list()
for i in range(length, len(sport_tokens)+1):
    seq = sport_tokens[i - length:i]
    line = ' '.join(seq)
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 60412


In [13]:
sequences[0:5]

['wizards guard blake out week ap japan celebrates suzuki breaking record on world stage no one else compares to els mismatch turns competitive as heninhardenne faces rookie serena blames headache for linz upset ap sales sun surge ahead supreme court stays dalmiya election oklahoma rallies behind white hamm gets to keep',
 'guard blake out week ap japan celebrates suzuki breaking record on world stage no one else compares to els mismatch turns competitive as heninhardenne faces rookie serena blames headache for linz upset ap sales sun surge ahead supreme court stays dalmiya election oklahoma rallies behind white hamm gets to keep gold',
 'blake out week ap japan celebrates suzuki breaking record on world stage no one else compares to els mismatch turns competitive as heninhardenne faces rookie serena blames headache for linz upset ap sales sun surge ahead supreme court stays dalmiya election oklahoma rallies behind white hamm gets to keep gold medal',
 'out week ap japan celebrates suz

In [15]:
# function to save text data
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [16]:
# saving sequences to file
#save_doc(sequences, 'sequences.txt')

The last preprocessing step is to encode obtained 60,601 word sequences to integer numbers and labels to one-hot vectors.

In [14]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
tokenized_sequences = tokenizer.texts_to_sequences(sequences)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size:', vocab_size)

Vocabulary size: 9580


In [21]:
#saving tokenizer
#joblib.dump(tokenizer, 'model_gen_tokenizer.pkl')

['model_gen_tokenizer.pkl']

In [15]:
tokenized_sequences = array(tokenized_sequences)
X, y = tokenized_sequences[:, :-1], tokenized_sequences[:, -1]
y = keras.utils.to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

After preprocessing steps, input and label vectors X, y might be processed through a Deep Neural Network model.
The text generation language model was trained with Bidirectional LSTM with 256 neurons and fully connected Dense layer with 128 neurons.

Dropout regularisation layer with 0.2 rate was added to prevent overfitting

The embedding vector space was feeded with 9,581 vocabulary size and the lenght of the input sequences equal 50. Each word was represented in the 10-dimensional vector space. 

The output layer determined with the 9,581 nodes (vocabulary size) and softmax activation function, which represents outputs as probabilities

Finally, the model had been training with 60 epochs and the batch size of 256 for about 2.5 hours

In [24]:
model_gen4 = Sequential()
model_gen4.add(Embedding(vocab_size, 50, input_length=seq_length))
model_gen4.add(Bidirectional(LSTM(128)))
model_gen4.add(Dropout(0.2))
model_gen4.add(Dense(128, activation='relu'))
model_gen4.add(Dense(vocab_size, activation='softmax'))

model_gen4.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model_gen4.summary()

start = time.time()

model_gen4.fit(X, y, batch_size = 256, epochs = 60, workers = 8)

elapsed = time.time() - start
print('\n Completed training in {} seconds.'.format(elapsed),end='\n\n')

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            479000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               183296    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_2 (Dense)              (None, 9580)              1235820   
Total params: 1,931,012
Trainable params: 1,931,012
Non-trainable params: 0
_________________________________________________________________

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/6

In [25]:
#saving model
model_gen4.save('model_gen4.h5')

# Text generation

The language modelling model was built with a fixed length of the input text equal to 50 tokens. Therefore, in order to generate a text with this model, an input text needs to be truncated or zero-padded to the required length. In this work, the length of the sequence might be calculated as the length of any predictor of the model.

In [None]:
#loading model
model_gen4 = keras.models.load_model('model_gen4.h5')

In [None]:
#loading tokenizer
tokenizer = joblib.load('model_gen_tokenizer.pkl')

In [16]:
#loading sequences
sequences = load_doc('sequences.txt')

In [23]:
seq_length = len(sequences[0].split()) - 1

The input text for language modelling model was randomly selected with randint function

In [171]:
# select a seed text

random.seed(2)
seed_text = sequences[randint(0, len(sequences))]
print(seed_text + '\n')

game with toe injury journalist at centre of kenteris row is stabbed straight for ab gold helmet winners red sox starring in ads no wake forest no arizona offence comes alive dolphins get win steelers have plenty in reserve spurrier keeps quiet about s carolina job ap xm strikes back cardinals



Finally, a random text was processed through function below and then trained language modelling model generated 100 samples of words

In [24]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [172]:
# generate new text
generated = generate_seq(model_gen4, tokenizer, seq_length, seed_text, 100)
print(generated)

show their power with homers against eagles totti rout soccer bc men game in testing gamecocks gunners doping lions sign australia close through in shaky gold open semifinal fifa khan lb clinches gold chiefs in four oklahoma state former black signs journey from sven of bomb or nhl miller takes sweep of year as vikings hold their club expos early better of the capriati innings gets vote to reverse the trophy win colts brees grabs mvp award ap college angels swap mother joins test at hawaii ap pacers misses a winsa team notebook coolhead would oneyear do lead to talk


# Testing with ML and DL algorithms

In order to test the performance of the trained earlier ML and DL models on the generated samples, words in the text were organised to 4 sentences with 25 tokens each

In [173]:
splitted = generated.split()
sent=[]
#number of sentences
sent_num = 4
#number of words each sentence
word_num = 25

for word in range(0, sent_num):
    s = splitted[word*word_num:(word*word_num+(word_num-1))]
    line = ' '.join(s)
    sent.append(line)

In [174]:
sent

['show their power with homers against eagles totti rout soccer bc men game in testing gamecocks gunners doping lions sign australia close through in',
 'gold open semifinal fifa khan lb clinches gold chiefs in four oklahoma state former black signs journey from sven of bomb or nhl miller',
 'sweep of year as vikings hold their club expos early better of the capriati innings gets vote to reverse the trophy win colts brees',
 'mvp award ap college angels swap mother joins test at hawaii ap pacers misses a winsa team notebook coolhead would oneyear do lead to']

# TF-IDF, Linear SVM

Loading TF-IDF vectorizer and best performed Linear SVM algorithm using joblib package

In [46]:
vectorizer = joblib.load('tfidf_vectroizer.pkl')
model_tfidf = joblib.load('best_model_tfidf.pkl')

The next step is to apply loaded vectoriser and ML model to the generated samples. Since "sports" topic labeled with the number "2" in the training dataset, y_test with four "2" labels has been created

In [166]:
X_test_tfidf = vectorizer.transform(sent)
y_test = array(pd.Series([2]*sent_num))

In [179]:
y_pred = model_tfidf.predict(X_test_tfidf)
print('Test dataset accuracy score:', (accuracy_score(y_test, y_pred))*100,'%','\n')

Test dataset accuracy score: 100.0 % 



As a result, the model with TF-IDF vectorisation and Linear SVM algorithm performed with the accuracy score 100%, correctly predicting 4/4 sentences

# Word2vec, Logistic Regression

First, input text should be transformed into list of lists format

In [168]:
sent_w2v = [row.split() for row in sent]

Second, Word2vec model was uploaded and normalised

In [50]:
model_w2v = Word2Vec.load("word2vec.model")
model_w2v.init_sims(replace = True)

Next, the test data was created by word averaging process
Word averaging has been performed with defined by https://gist.github.com/susanli2016/dae5c9ff3cea5744822384881fc619dd#file-word_averaging
function

In [51]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.vectors_norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, post) for post in text_list ])

In [169]:
X_test_w2v = word_averaging_list(model_w2v.wv, np.array(sent_w2v))

Finally, the best performed Logistic Regression ML model was uploaded and applied to the generated samples

In [53]:
best_mod_w2v = joblib.load('best_w2v.pkl')

In [177]:
y_pred_w2v = best_mod_w2v.predict(X_test_w2v)
print('Test dataset accuracy score:', (accuracy_score(y_test, y_pred_w2v))*100,'%','\n')

Test dataset accuracy score: 100.0 % 



The model with Word2vec vectorisation and Logistic Regression algorithm achieved the accuracy score 100%, correctly predicting 4/4 sentences

# C-LSTM model

Uploading keras tokenizer pkl file and applying to the generated samples. 

In [81]:
tokenizer_C_LSTM = joblib.load('C_LSTM_tokenizer.pkl')

In [82]:
encoded_docs_C_LSTM = tokenizer_C_LSTM.texts_to_sequences(sent)

Before uploading to the C-LSTM model, encoded test dataset was zero-padded with 90 (maximum sentence length on the training dataset). 

In [83]:
Xtest = pad_sequences(encoded_docs_C_LSTM, maxlen = 90, padding='post')

Additionally, to calculate the performance, y_test was one-hot encoded with uploaded Label encoder and Keras to_categorical tool

In [34]:
le = joblib.load('label_encoder.pkl')

In [162]:
y_test = array(pd.Series([2]*sent_num))
le1 = le.transform(y_test)
ytest = keras.utils.to_categorical(le1,num_classes=4)

In [163]:
ytest

array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]], dtype=float32)

Loading Keras C-LSTM model and evaluating on the generated 100 text samples

In [37]:
model_C_LSTM = keras.models.load_model('model_CNN8.h5')

In [178]:
loss, acc = model_C_LSTM.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100), '%')    

Test Accuracy: 100.000000 %


C-LSTM model performed with the accuracy score 100%

Both ML and DL algorithms have been performed with the accuracy score of 100%, correctly predicting 4/4 sentences. Such high accuracy is explained by the fact that the language modelling model and ML and DL algorithms were trained on the same training dataset. In addition, such results may be obtained due to small size of the data (100 tokens).