## In this notebook I try a character-level language model that predicts where space characters belong, given the characters of the words as input

In [1]:
import pandas as pd
import numpy as np
import operator
import string
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

### Reading the data

In [2]:
# Load the training corpus: one entry per line of the file, with the
# trailing newline characters removed.
with open('training_data.txt', encoding='utf-8') as f:
    train = [row.rstrip('\n') for row in f]

In [3]:
# Load the sentences whose spaces are missing: one entry per line of the
# file, with the trailing newline characters removed.
with open('missing_spaces.txt', encoding='utf-8') as f:
    test = [row.rstrip('\n') for row in f]

In [4]:
# Wrap each list of lines in a single-column ('text') DataFrame so that
# derived columns can be added next to the raw text.
train_df, test_df = (
    pd.DataFrame(rows, columns=['text']) for rows in (train, test)
)

## Limiting the data to 100K rows

In [5]:
# Keep at most the first 100K sentences.
train_df = train_df.iloc[:100000]

# Shuffle the rows. A fixed seed makes the order (and everything derived
# from it, e.g. the training text) reproducible across kernel restarts.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Number of whitespace-separated tokens in each sentence.
train_df['len'] = train_df['text'].str.split().str.len()

# Single-word lines contain no space to learn from, so drop them.
# .copy() gives an independent frame, so the column assignments made in
# later cells do not write into a slice of the unfiltered frame.
train_df = train_df[train_df['len'] > 1].copy()

### Normalize the Latin (accented) characters to English characters

In [7]:
import unicodedata as ud
def strip_accents(text):
    """Return ``text`` without its combining accent marks.

    The string is decomposed with NFKD normalization, then every character
    whose Unicode category is ``Mn`` (non-spacing mark) is dropped.
    """
    decomposed = ud.normalize('NFKD', text)
    kept = [ch for ch in decomposed if ud.category(ch) != 'Mn']
    return ''.join(kept)

### Preprocess each sentence: limit it to its first 6 words, remove punctuation, replace the remaining non-ASCII characters with `<`, and replace numerical expressions with NUM

In [8]:
from nltk.corpus import stopwords
from transliterate import translit, get_available_language_codes
# English stop-word set from NLTK. Not referenced by the code visible in this
# notebook.
stopWords = set(stopwords.words('english'))
# Pattern for numerical expressions; preprocessing() replaces every match
# with 'NUM'.
# NOTE(review): inside the character class `,-,` is a range from ',' to ',',
# so '-' itself is not matched, and the unescaped '.' matches any character.
# Confirm this is the intended pattern.
rn = re.compile(r'(\w?\d+.?\w?\d+|\d)[/,-,%]?(\w?\d+.?\w?\d+|\d)?')
# Empty translation table. Not referenced by the code visible in this notebook.
table = str.maketrans("","")
def preprocessing(text):
    """Normalize one raw sentence for the character-level model.

    Steps, in order:
      1. keep only the first 6 whitespace-separated words;
      2. transliterate to Latin characters with ``translit``; if that raises,
         fall back to ``strip_accents``;
      3. delete every character in ``string.punctuation``;
      4. replace each remaining non-ASCII character with ``'<'``;
      5. lower-case the words and collapse whitespace to single spaces;
      6. replace numerical expressions (pattern ``rn``) with ``'NUM'``.

    :param text: raw sentence
    :return: cleaned sentence, stripped of leading/trailing whitespace
    """
    # Limit the sentence to 6 words; slicing already covers shorter inputs.
    text = ' '.join(text.split()[:6])
    # Normalize Latin, Russian, ... characters to English characters.
    try:
        text = translit(text, reversed=True)
    except Exception:
        # Only ordinary errors trigger the fallback; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit while this function is
        # applied over the whole frame.
        text = strip_accents(text)
    # Remove punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Replace any other non-ASCII character with '<'.
    text = ''.join(ch if ord(ch) < 128 else '<' for ch in text)
    text = ' '.join(word.lower() for word in text.split())
    # Replace numerical expressions with NUM (done after lower-casing, so the
    # marker stays upper-case).
    text = rn.sub('NUM', text)
    return text.replace('  ', ' ').strip()

In [9]:
# Target text: the cleaned sentence, spaces included.
train_df['ytext'] = train_df['text'].map(preprocessing)

In [10]:
# Input text: the cleaned sentence with every space character removed.
train_df['Ntext'] = train_df['ytext'].map(lambda cleaned: cleaned.replace(' ', ''))

In [11]:
# Inspect the frame, which now also carries the 'ytext' and 'Ntext' columns
# added above.
train_df

Unnamed: 0,text,len,ytext,Ntext
0,Use by the Voyager program,5,Use by the Voyager,UsebytheVoyager
1,"Vanuatu's undeveloped road system, with fewer ...",59,Vanuatus undeveloped road system,Vanuatusundevelopedroadsystem
3,Coupeville – County Seat,4,Coupeville County Seat,CoupevilleCountySeat
4,John Adams persuaded the committee to select T...,103,John Adams persuaded the,JohnAdamspersuadedthe
5,Mosel (19–24 May 1943),4,Mosel NUM May NUM,MoselNUMMayNUM
6,Elvis Presley holds his last concert at Market...,12,Elvis Presley holds his,ElvisPresleyholdshis
7,"October 2 – William Berger, Austrian-born Amer...",10,October NUM William,OctoberNUMWilliam
8,Integral 9-man lock-out chamber.,4,Integral NUMman lockout chamber,IntegralNUMmanlockoutchamber
9,"On ""The Great Food Truck Race"", a Vietnamese s...",23,On The Great Food,OnTheGreatFood
10,Transfer of the East Jerusalem Palestinian pop...,28,Transfer of the East,TransferoftheEast


In [12]:
# One long training string: all cleaned sentences separated by single spaces.
cleaned_sentences = train_df['ytext'].tolist()
raw_text = ' '.join(cleaned_sentences)

In [None]:
from keras.models import Sequential
from keras.layers import Input
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Model
import numpy as np
import random
import sys

import IPython


class CharacterLevelLSTM:
    """
    Character-level next-character model:
    Embedding -> LSTM -> Dropout -> LSTM -> Dropout -> softmax Dense.

    The built Keras model is returned by ``build`` and also kept on
    ``self.model`` (``None`` until ``build`` has been called).
    """
    def __init__(self, input_shape, hidden_len, output_len, return_sequence=True):
        """
        :param input_shape: shape of the training input array; only
            ``input_shape[1]`` (the length of one input row) is used
        :param hidden_len: number of units in each LSTM layer
        :param output_len: vocabulary size; used as the Embedding input
            dimension and as the width of the softmax output
        :param return_sequence: stored for reference only; ``build`` does not
            read it
        """
        self.input_shape = input_shape
        self.hidden_len = hidden_len
        self.output_len = output_len
        self.return_sequence = return_sequence
        # Filled in by build() so callers can reach the network through the
        # wrapper object as well as through build()'s return value.
        self.model = None

    def build(self, dropout=0.2):
        """
        Stacked LSTM with specified dropout rate
        :param dropout: dropout rate
        :return: model built with softmax activation, cross entropy loss and rmsprop optimizer
        """
        # 2 layer LSTM with specified number of nodes in the hidden layer.
        inputs = Input(shape=(self.input_shape[1],))
        encoded_inputs = Embedding(self.output_len, self.output_len + 1)(inputs)
        # The first LSTM returns the whole sequence so the second one can
        # consume it; the second keeps only its final state.
        ls = LSTM(self.hidden_len, return_sequences=True)(encoded_inputs)
        dr = Dropout(dropout)(ls)
        ls2 = LSTM(self.hidden_len, return_sequences=False)(dr)
        dr2 = Dropout(dropout)(ls2)
        output = Dense(self.output_len, activation='softmax')(dr2)
        model = Model(inputs=inputs, outputs=output)
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
        self.model = model
        return model

    def sample(self, a, temperature=1.0):
        """
        Draw one index from a probability array after temperature scaling.
        :param a: prob array
        :param temperature: affects diversity (values below 1 sharpen the
            distribution, values above 1 flatten it)
        :return: the sampled index
        """
        # Work in float64 whatever precision the predictions came in.
        a = np.asarray(a, dtype=np.float64)
        # Zero probabilities give log(0) = -inf, which correctly turns into a
        # zero weight below; only the warning is silenced.
        with np.errstate(divide='ignore'):
            scaled = np.log(a) / temperature
        # Subtracting the maximum leaves the distribution unchanged but keeps
        # exp() from underflowing at small temperatures.
        weights = np.exp(scaled - np.max(scaled))
        dist = weights / np.sum(weights)
        return np.random.choice(len(dist), p=dist)


def get_data(text=None, max_len=20, step=3):
    """
    Builds the training arrays for next-character prediction.

    The lower-cased text is cut into overlapping windows of ``max_len``
    characters taken every ``step`` characters; the character that follows
    each window is its target.

    :param text: training text; defaults to the notebook-level ``raw_text``
    :param max_len: number of characters in one input window
    :param step: distance between the starts of consecutive windows
    :return: (text, max_len, vocabulary size, char_to_indices,
        indices_to_chars, x, y) where ``x`` has shape (windows, max_len) and
        holds character indices, and ``y`` has shape (windows, vocabulary
        size) and is the 1-of-k encoding of each target character
    """
    if text is None:
        text = raw_text
    text = text.lower()

    # vocab -- sorted so that a character always gets the same index; the
    # iteration order of a bare set can change between kernel runs, which
    # would make saved weights unusable.
    chars = sorted(set(text))
    print("total chars: ", len(chars))
    char_to_indices = {char: idx for idx, char in enumerate(chars)}
    indices_to_chars = dict(enumerate(chars))

    # separate into array of sentences (max_len chars each)
    starts = range(0, len(text) - max_len, step)
    sentences = [text[i: i + max_len] for i in starts]
    next_chars = [text[i + max_len] for i in starts]
    print("total # of sentences: ", len(sentences))

    # Inputs are sequences of character indices, one index per time step,
    # which is what the model's Embedding layer consumes.
    x = np.zeros((len(sentences), max_len), dtype=np.int32)
    # Expected outputs: 1-of-k encoding (all zeros except for a single one at
    # the index of the target character in the vocab).
    y = np.zeros((len(sentences), len(chars)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            x[i, t] = char_to_indices[char]
        y[i, char_to_indices[next_chars[i]]] = 1

    return text, max_len, len(chars), char_to_indices, indices_to_chars, x, y


def train():
    """
    Trains the network and outputs the generated text.
    Runs a single iteration: one epoch with batch size 128 and a 20%
    validation split, followed by 400 generated characters for each of four
    sampling temperatures.
    :return: the trained model
    """
    text, max_len, input_len, char_to_indices, indices_to_chars, x, y = get_data()
    print(x.shape, y.shape)
    # two layered LSTM 100 hidden nodes and a dropout rate of 0.5
    lstm = CharacterLevelLSTM(x.shape, 100, input_len)
    print("Building Model...")
    model = lstm.build(dropout=0.5)

    # train model and output generated text
    for iteration in range(1, 2):
        print("==============================================================")
        print("Iteration: ", iteration)
        model.fit(x, y, validation_split=0.2, batch_size=128, epochs=1, verbose=1)

        # Every temperature starts from the same randomly chosen seed window.
        start_index = random.randint(0, len(text) - max_len - 1)
        for T in [0.2, 0.5, 1.0, 1.2]:
            print("------------Temperature", T)
            generated = ''
            sentence = text[start_index:start_index + max_len]
            generated += sentence
            print("Generating with seed: " + sentence)
            sys.stdout.write(generated)

            # generate 400 chars
            for i in range(400):
                # format input: one row of character indices, encoded the
                # same way as the training windows
                seed = np.array([[char_to_indices[char] for char in sentence]],
                                dtype=np.int32)

                # get predictions
                # verbose = 0, no logging
                predictions = model.predict(seed, verbose=0)[0]
                next_index = lstm.sample(predictions, T)
                next_char = indices_to_chars[next_index]
                # append to the running output on the same line
                sys.stdout.write(next_char)
                sys.stdout.flush()

                # use current output as input to predict the next character
                # in the sequence
                generated += next_char
                sentence = sentence[1:] + next_char
            print()
    return model

if __name__ == '__main__':
    # Train once and keep the result under both names: `ob` (the name used
    # here originally) and `model` (the name the weight-saving cell uses).
    ob = train()
    model = ob

In [None]:
# train() keeps its data local, so recover the array shape and vocabulary
# size needed to rebuild the network at notebook level.
text, max_len, input_len, char_to_indices, indices_to_chars, x, y = get_data()

# `ob` is the trained model returned by train().
ob.save_weights('./model.h5', overwrite=True)

# Rebuild the same architecture and load weights into it for inference.
inf = CharacterLevelLSTM(x.shape, 100, input_len)
inference_model = inf.build(dropout=0.5)
# NOTE(review): this loads './tpu_model.h5', not the './model.h5' written
# above -- confirm which weights file is intended.
inference_model.load_weights('./tpu_model.h5')

In [None]:
# Vocabulary lookups and the training input array; `inf` / `inference_model`
# were built from x.shape, so x.shape[1] is the input length they expect.
_text, max_len, input_len, char_to_indices, indices_to_chars, x, _y = get_data()
seq_len = x.shape[1]
# Positions that precede the characters seen so far are filled with the
# index of the space character.
pad_index = char_to_indices[' ']

sentence = 'watchthisspacefestiv'

for T in [0.2, 0.5, 1.0, 1.2]:
    pred_sec = ''
    word = sentence[0]
    for t, char in enumerate(sentence[1:]):
        # Characters read so far (the first t + 1), limited to the most
        # recent seq_len, encoded as one row of character indices.
        context = sentence[:t + 1][-seq_len:]
        seed = np.full((1, seq_len), pad_index, dtype=np.int32)
        seed[0, seq_len - len(context):] = [char_to_indices[c] for c in context]

        # get predictions
        # verbose = 0, no logging
        predictions = inference_model.predict(seed, verbose=0)[0]
        next_index = inf.sample(predictions, T)
        next_char = indices_to_chars[next_index]
        if next_char == ' ':
            # A word boundary is predicted before `char`: close the current
            # word and start the next one WITH `char`, so no character of
            # the input is lost.
            pred_sec += word + ' '
            word = char
        else:
            word = word + char
    # Flush the last word, which is never followed by a predicted space.
    pred_sec += word
    print(pred_sec)
            