# Import

In [1]:
import numpy as np
import pandas as pd

In [33]:
# Text Preprocessing
import string #
import re # Regular Expression
import pprint # Pretty Print for long texts
from random import randint
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud

In [3]:
# Feature Engineering
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [44]:
# Text Classification
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import tree # Decision Tree
from sklearn import ensemble # Random Forest
from sklearn import linear_model, preprocessing # Linear Regression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix

from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import Bidirectional

In [5]:
# Visual
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# File system
import os
from os import listdir
from os.path import isfile, join
# Warning
import warnings
warnings.filterwarnings('ignore')

# Read Data

In [9]:
def loading(path):
    inputfile = os.path.join(path)
    with open(inputfile) as f:
        book = f.read()
        return book

path = 'CSV_file/book/'
bookfiles = [f for f in listdir(path) if isfile(join(path, f))]
bookfiles = bookfiles[1:]

books = []
for i in bookfiles:
    books.append(loading(path+i))

In [11]:
books[0][:200]

'{\\rtf1\\ansi\\ansicpg1252\\cocoartf1404\\cocoasubrtf470\n{\\fonttbl\\f0\\fmodern\\fcharset0 Courier;}\n{\\colortbl;\\red255\\green255\\blue255;\\red0\\green0\\blue0;}\n\\margl1440\\margr1440\\vieww10800\\viewh8400\\viewkind'

# Exploratory Data Analysis

# Text Data Preprocessing

## Clean text

In [12]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = re.sub(r'\n', ' ', text) # removing the /n from the above text
    text = re.sub(r'[{}@_*>()\\#%+=\[\]]', '', text) # removing teh above seen \\ from the text
    text = re.sub('a0', '', text)
    text = re.sub('\'92t', '\'t', text)
    text = re.sub('\'92s', '\'s', text)
    text = re.sub('\'92m', '\'m', text)
    text = re.sub('\'92ll', '\'ll', text)
    text = re.sub('\'91', '', text)
    text = re.sub('\'92', '', text)
    text = re.sub('\'93', '', text)
    text = re.sub('\'94', '', text)
    text = re.sub('\.', '. ', text)
    text = re.sub('\!', '! ', text)
    text = re.sub('\?', '? ', text)
    text = re.sub(' +', ' ', text)
    return text

book_clean = []
for i in books:
    book_clean.append(clean_text(i))

In [13]:
book_clean[0][:200]

'rtf1ansiansicpg1252cocoartf1404cocoasubrtf470 fonttblf0fmodernfcharset0 Courier; colortbl;red255green255blue255;red0green0blue0; margl1440margr1440vieww10800viewh8400viewkind0 deftab720 pardpardeftab7'

## Extract Vocabulary

In [14]:
vocab = {}
count = 0
for i in book_clean:
    for char in i:
        if char not in vocab:
            vocab[char] = count
            count += 1
            
vocab            

{'r': 0,
 't': 1,
 'f': 2,
 '1': 3,
 'a': 4,
 'n': 5,
 's': 6,
 'i': 7,
 'c': 8,
 'p': 9,
 'g': 10,
 '2': 11,
 '5': 12,
 'o': 13,
 '4': 14,
 '0': 15,
 'u': 16,
 'b': 17,
 '7': 18,
 ' ': 19,
 'l': 20,
 'm': 21,
 'd': 22,
 'e': 23,
 'h': 24,
 'C': 25,
 ';': 26,
 'v': 27,
 'w': 28,
 '8': 29,
 'k': 30,
 'x': 31,
 'T': 32,
 'P': 33,
 'j': 34,
 'G': 35,
 'E': 36,
 'B': 37,
 'A': 38,
 'K': 39,
 ',': 40,
 'y': 41,
 'L': 42,
 '.': 43,
 'Y': 44,
 '-': 45,
 ':': 46,
 '/': 47,
 'R': 48,
 'D': 49,
 'J': 50,
 '9': 51,
 '3': 52,
 'W': 53,
 'H': 54,
 'M': 55,
 'F': 56,
 'X': 57,
 'S': 58,
 'U': 59,
 'O': 60,
 'I': 61,
 'N': 62,
 'q': 63,
 "'": 64,
 '"': 65,
 '?': 66,
 '!': 67,
 'z': 68,
 'V': 69,
 'Z': 70,
 '6': 71,
 'Q': 72,
 '&': 73,
 '$': 74}

In [15]:
codes = ['<PAD>', '<EOS>', '<GO>'] # add occurance of these codes
for code in codes:
    vocab[code] = count
    count += 1

## Inverse Vocabulary

In [16]:
# Reversing the dictionary as
int_vo = {}
for i, j in vocab.items():
    int_vo[j] = i
    
int_vo

{0: 'r',
 1: 't',
 2: 'f',
 3: '1',
 4: 'a',
 5: 'n',
 6: 's',
 7: 'i',
 8: 'c',
 9: 'p',
 10: 'g',
 11: '2',
 12: '5',
 13: 'o',
 14: '4',
 15: '0',
 16: 'u',
 17: 'b',
 18: '7',
 19: ' ',
 20: 'l',
 21: 'm',
 22: 'd',
 23: 'e',
 24: 'h',
 25: 'C',
 26: ';',
 27: 'v',
 28: 'w',
 29: '8',
 30: 'k',
 31: 'x',
 32: 'T',
 33: 'P',
 34: 'j',
 35: 'G',
 36: 'E',
 37: 'B',
 38: 'A',
 39: 'K',
 40: ',',
 41: 'y',
 42: 'L',
 43: '.',
 44: 'Y',
 45: '-',
 46: ':',
 47: '/',
 48: 'R',
 49: 'D',
 50: 'J',
 51: '9',
 52: '3',
 53: 'W',
 54: 'H',
 55: 'M',
 56: 'F',
 57: 'X',
 58: 'S',
 59: 'U',
 60: 'O',
 61: 'I',
 62: 'N',
 63: 'q',
 64: "'",
 65: '"',
 66: '?',
 67: '!',
 68: 'z',
 69: 'V',
 70: 'Z',
 71: '6',
 72: 'Q',
 73: '&',
 74: '$',
 75: '<PAD>',
 76: '<EOS>',
 77: '<GO>'}

## Extract Sentences

In [18]:
sentences = []
for i in book_clean:
    for sen in i.split('.'):
        sentences.append(sen + '.')

sentences[0]

'rtf1ansiansicpg1252cocoartf1404cocoasubrtf470 fonttblf0fmodernfcharset0 Courier; colortbl;red255green255blue255;red0green0blue0; margl1440margr1440vieww10800viewh8400viewkind0 deftab720 pardpardeftab720sl280partightenfactor0 f0fs24 cf2 expnd0expndtw0kerning0 outl0strokewidth0 strokec2 The Project Gutenberg EBook of Anna Karenina, by Leo Tolstoy This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever.'

## Convert Sentence to Integer

In [19]:
# integer numbers are from vocab
final_sentence = []
for i in sentences:
    b = []
    for char in i:
        b.append(vocab[char])
    final_sentence.append(b)

In [26]:
# as in this assinging the values to the character in sentences
final_sentence[0][:10]

[0, 1, 2, 3, 4, 5, 6, 7, 4, 5]

# Text Classification

## Model Selection

In [24]:
train, test = train_test_split(final_sentence, test_size=0.25, random_state=2)
print(len(train))
print(len(test))

99215
33072


In [27]:
maxt = max([len(sentence) for sentence in train])
print(maxt)

8907


In [28]:
# Selecting the sentence between length of 10 to 300 as to make training faster
train_sort = []
min_length = 10
max_length = 300
for i in range(min_length, max_length+2):
    for j in train:
        if(len(j) == i):
            train_sort.append(j)

In [29]:
maxt = max([len(sentence) for sentence in train_sort])
print(maxt)

301


In [34]:
# Now we generate noise in sentence (word by word) to make some random changes into training dataset
# so model can justify the changes and work accordingly
letter = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
def noise(sentence, threshold):
    noisy = []
    i = 0
    while i < len(sentence):
        rand = np.random.uniform(0, 0.9, 1)
        if rand < threshold:
            noisy.append(sentence[i])
        else:
            new_rand = np.random.uniform(0, 0.9, 1)
            if new_rand > 0.67:
                if i == (len(sentence) - 1):
                    continue
                else:
                    i += 1
            elif new_rand < 0.33:
                random_letter = np.random.choice(letter, 1)[0]
                noisy.insert(vocab[random_letter])
                noisy.insert(sentence[i])
            else:
                pass
        i += 1
    return noisy

In [35]:
# create noise sentence by sentence

letter = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
def nois(sentence, threshold):
    noisy = []
    i = 0
    while i < len(sentence):
        rand = np.random.uniform(0, 0.9, 1)
        if rand < threshold:
            noisy.append(sentence[i])
        else:
            random_letter = np.random.choice(letter, 1)[0]
            a = vocab[random_letter]
            c = len(sentence)
            random_index = randint(0, c)
            noisy.insert(random_index, a)
        i += 1
    return noisy

In [36]:
f = nois(train_sort[0], 0.8) # sample generate noise in first sentence with threshold of 0.8
print(f)
print(train_sort[0])

[19, 33, 23, 2, 7, 13, 1, 1, 41, 43]
[19, 33, 23, 10, 10, 13, 1, 1, 41, 43]


In [37]:
ef = int_vo[5]
print(ef)

n


In [38]:
for i in train_sort[0:10]: # length of first 10 sentence of train_sort
    print(len(i))
print(len(train_sort))

10
10
10
10
10
10
10
10
10
10
87923


In [39]:
noisy_train = [] #noised train sorted data
b = 0.9 # threshold
for sentence in train_sort:
    f = nois(sentence, b)
    noisy_train.append(f)

In [40]:
for i in noisy_train[:10]:
    print(len(i))

10
10
10
10
10
10
10
10
10
10


In [42]:
maxt = max([len(sentence) for sentence in noisy_train])
print(maxt)
print(noisy_train[0])
print(train_sort[0])

301
[19, 33, 23, 10, 10, 13, 1, 1, 41, 43]
[19, 33, 23, 10, 10, 13, 1, 1, 41, 43]


## Pad sentence

In [45]:
def pad_sentence(batch):
    '''Pad sentences with <PAD> so that each sentence has the same length for both train_sort & noisy_train'''
    max_sentence = max([len(sentence) for sentence in batch])
    return [sentence + [vocab['<PAD>']] * (max_sentence - len(sentence)) for sentence in batch]

In [46]:
pad_train = np.array(pad_sentence(train_sort))
pad_noisy_train = np.array(pad_sentence(noisy_train))

In [47]:
print(pad_train[2].shape)
print(pad_noisy_train[2].shape)

(301,)
(301,)


# Model

In [48]:
model = Sequential()
model.add(LSTM(40, return_sequences=True, input_shape=(None, 1)))
model.add(LSTM(20, return_sequences=True))
model.add(TimeDistributed(Dense(1, activation='sigmoid')))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc']) #loss functionality
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, None, 40)          6720      
                                                                 
 lstm_1 (LSTM)               (None, None, 20)          4880      
                                                                 
 time_distributed (TimeDistr  (None, None, 1)          21        
 ibuted)                                                         
                                                                 
Total params: 11,621
Trainable params: 11,621
Non-trainable params: 0
_________________________________________________________________
None


In [52]:
# normalizing the pad_train & pad_noisy_train
pd_train = preprocessing.normalize(pad_train)
pd_noisy_train = preprocessing.normalize(pad_noisy_train)

In [53]:
y = pd_train.shape
a = pd_train.reshape(y[0], y[1], 1)
print(a[1][:10])

[[0.01482393]
 [0.02886765]
 [0.        ]
 [0.01014269]
 [0.02184579]
 [0.00390103]
 [0.01560413]
 [0.01014269]
 [0.02184579]
 [0.03354889]]


In [54]:
# similarly with noise data
z = pd_noisy_train.shape
b = pd_noisy_train.reshape(z[0], z[1], 1)
print(b[1][:10])

[[0.01482393]
 [0.02886765]
 [0.        ]
 [0.01014269]
 [0.02184579]
 [0.00390103]
 [0.01560413]
 [0.01014269]
 [0.02184579]
 [0.03354889]]


In [55]:
model.fit(a, b, batch_size=10, epochs=40, verbose=0)

In [None]:
# same procedure for Testing data
test_sort = []
min_length = 10
max_length = 300
for i in range(min_length, max_length+2):
    for j in train:
        if (len(j) == i):
            test_sort.append(j)

In [None]:
noisy_test = [] #noised test_sorted data
b = 0.9 # threshold
for sentence in train_sort:
    f = nois(sentence, b)
    noisy_test.append(f)

In [None]:
pad_test = np.array(pad_sentence(test_sort))
pad_noisy_test = np.array(pad_sentence(noisy_test))

In [None]:
# normalizing the pad_train & pad_noisy_train
pd_test = preprocessing.normalize(pad_test)
pd_noisy_test = preprocessing.normalize(pad_noisy_test)

In [None]:
y = pd_test.shape
a1 = pd_test.reshape(y[0], y[1], 1)
print(a1[1][:10])

In [None]:
# similarly with noise data
z = pd_noisy_test.shape
b1 = pd_noisy_test.reshape(z[0], z[1], 1)
print(b1[1][:10])

In [None]:
loss, acc = model.evaluate(a1, b1, verbose=0)
print('Loss: %f, Accuracy: %f' %(loss, acc*100))

In [None]:
for _ in range(10):
    yhat = model.predict_classes(X, verbose=0)