# 2WP Detection

In [9]:
import Corpus.gutenberg as corpus
from TextPreprocess.Tokenizer.Stanford import tokenize
from Utils.visual import hist, tally
from Utils.debug import dump
from Utils.generator import sliding_window, random_window, transform
from Utils.FS.file import save, load
from Utils.keras import compact_embedding
from Utils.misc import batch

In [10]:
from DataLoader import GloVe

WORD_DIM = 50
glove = GloVe.load2('./data/GloVe/glove.6B.{}d.txt'.format(WORD_DIM), WORD_DIM)

Start: Loading Glove Model
End: Loaded 400000 rows.


In [38]:
data = tokenize(corpus.raw().lower())
dump(data)

<class 'list'>
['-LSB-', 'emma', 'by', 'jane', 'austen', '1816', '-RSB-', 'volume', 'i', 'chapter', 'i', 'emma', 'woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', ',', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of', 'existence', ';', 'and', 'had', 'lived', 'nearly', 'twenty-one', 'years', 'in', 'the', 'world', 'with', 'very', 'little', 'to', 'distress', 'or', 'vex', 'her', '.', 'she', 'was', 'the', 'youngest', 'of', 'the', 'two', 'daughters', 'of', 'a', 'most', 'affectionate', ',', 'indulgent', 'father', ';', 'and', 'had', ',', 'in', 'consequence', 'of', 'her', 'sister', "'s", 'marriage', ',', 'been', 'mistress', 'of', 'his', 'house', 'from', 'a', 'very', 'early', 'period', '.', 'her', 'mother', 'had', 'died', 'too', 'long', 'ago', 'for', 'her', 'to', 'have', 'more', 'than', 'an', 'indistinct', 'remembrance', 'of', 'her', 'caresses', ';', 'and', 'her', 'place', 'had', 'been',

In [12]:
emb, s2i, i2s, i2v = compact_embedding(glove, data)

Size of compact embedding: 32540
Embedding coverage: 97.81%


In [13]:
from collections import Counter
gen = transform(sliding_window(2)(data), lambda x: x[0] + '_' + x[1])
detail = Counter(gen)

sorted(detail.items(), key=lambda item: item[1], reverse=True)

[(',_and', 42711),
 ('of_the', 19104),
 ('in_the', 10216),
 ('and_the', 8869),
 (';_and', 7752),
 (',_the', 7451),
 ('the_lord', 7080),
 ("._''", 5867),
 (',_that', 5741),
 (',_i', 5684),
 ('to_the', 5401),
 ('._``', 5233),
 ("''_``", 4887),
 (",_''", 4759),
 (',_as', 3704),
 ('and_he', 3588),
 (',_but', 3586),
 ('all_the', 3549),
 ('to_be', 3468),
 (',_to', 3317),
 (',_which', 3277),
 (':_and', 3275),
 ("''_said", 3175),
 ('him_,', 3172),
 (',_he', 3122),
 ('for_the', 3039),
 (',_``', 3027),
 ('shall_be', 2790),
 ('it_was', 2784),
 ('._i', 2778),
 ('on_the', 2740),
 (';_but', 2730),
 (',_in', 2706),
 ('._the', 2697),
 ('of_his', 2687),
 ('from_the', 2681),
 ('it_is', 2637),
 ('out_of', 2619),
 ('i_will', 2534),
 (',_or', 2492),
 ('i_am', 2440),
 ('and_i', 2424),
 ('with_the', 2394),
 ('i_have', 2358),
 ('and_they', 2352),
 (',_with', 2330),
 ("?_''", 2324),
 ('them_,', 2318),
 ('me_,', 2314),
 ('said_,', 2309),
 ('by_the', 2226),
 ('at_the', 2210),
 ('``_i', 2151),
 ('._he', 2115),
 (

In [14]:


#gen = random_window(2)(data)

#dump(gen, 10)

In [15]:
"""
X=[]
y=[]
for i in range(50):
    question = next(gen)
    X.append(question)
    y.append([float(input(question))])
"""

'\nX=[]\ny=[]\nfor i in range(50):\n    question = next(gen)\n    X.append(question)\n    y.append([float(input(question))])\n'

In [16]:
"""
save('./data/2WP_Seed_X.pkl', X)
save('./data/2WP_Seed_y.pkl', y)
"""

"\nsave('./data/2WP_Seed_X.pkl', X)\nsave('./data/2WP_Seed_y.pkl', y)\n"

In [17]:
X = load('./data/2WP_Seed_X.pkl')
y = load('./data/2WP_Seed_y.pkl')

In [18]:
for i in [
    ['be', 'yourself'],
    ['move', 'on'],
    ['free', 'yourself'],
    ['come', 'back'],
    ['look', 'up'],
    ['dream', 'big'],
    ['start', 'living'],
    ['i', 'am'],
    ['define', 'yourself'],
    ['be', 'happy'],
    ['be', 'fearless'],
    ['accept', 'yourself'],
    ['i', 'can'],
    ['beat', 'me'],
    ['stay', 'positive'],
    ['trust', 'yourself'],
    ['work', 'hard'],
    ['be', 'honest'],
    ['game', 'on'],
    ['stay', 'strong'],
    ['try', 'again'],
    ['lets', 'go'],
    ['enjoy', 'life'],
    ['thank', 'you'],
]:
    X.append(i)
    y.append([1.])


In [22]:
X_train = batch(s2i, X)

In [62]:
from keras.layers import Input, Convolution1D, MaxPooling1D, Dense, Flatten, Dropout, Embedding,BatchNormalization
from keras.models import Model, Sequential
from keras.regularizers import l2

def create_baseline(dropout=0, branching=5):
    
    embedding_layer = Embedding(emb.shape[0],
                            emb.shape[1],
                            weights=[emb],
                            input_length=2,
                            trainable=False)

    sequence_input = Input(shape=(2,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    
    #print(embedded_sequences)
    #x = Convolution1D(300, 2)(embedded_sequences)
    #x = Convolution1D(600, branching)(x)
    #x = Convolution1D(300, branching)(x)
    #x = Convolution1D(300, branching)(x)
    #x = Convolution1D(300, branching)(x)
    #x = Convolution1D(300, branching)(x)
    #x = Convolution1D(300, branching)(x)
    #x = Convolution1D(300, branching)(x)
    
    #x = MaxPooling1D()(x)
    #x = Dropout(dropout)(x)
    
    """
    x = Convolution1D(800, 2, activation='relu')(x)
    x = MaxPooling1D()(x)
    x = Dropout(dropout)(x)
    """
    
    x = Flatten()(embedded_sequences)
    
    """
    x = Dense(3000, activation='tanh')(x)
    x = BatchNormalization()(x)
    x = Dropout(dropout)(x)
    
    x = Dense(1500, activation='tanh')(x)
    x = BatchNormalization()(x)
    x = Dropout(dropout)(x)
    """
    
    x = Dense(10, activation='selu')(x)
    #x = BatchNormalization()(x)
    #x = Dropout(dropout)(x)
    
    #x = Dense(10, activation='selu')(x)
    #x = BatchNormalization()(x)
    #x = Dropout(dropout)(x)
    
    preds = Dense(1, activation='sigmoid')(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['binary_accuracy'])
    return model

In [63]:
from keras_tqdm import TQDMNotebookCallback

BATCH_SIZE = 1024
DROPOUT = 0.5
BRANCHING = 2

model = create_baseline(DROPOUT, BRANCHING)
model.fit(
    x = X_train,
    y = y,
    batch_size=1, 
    epochs=200,
    #validation_split=0.2,
    shuffle=True
    #class_weight=class_weight
    #verbose=0, callbacks=[TQDMNotebookCallback()]
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/2

Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x2a9c40cf8>

In [65]:
data

['-LSB-',
 'emma',
 'by',
 'jane',
 'austen',
 '1816',
 '-RSB-',
 'volume',
 'i',
 'chapter',
 'i',
 'emma',
 'woodhouse',
 ',',
 'handsome',
 ',',
 'clever',
 ',',
 'and',
 'rich',
 ',',
 'with',
 'a',
 'comfortable',
 'home',
 'and',
 'happy',
 'disposition',
 ',',
 'seemed',
 'to',
 'unite',
 'some',
 'of',
 'the',
 'best',
 'blessings',
 'of',
 'existence',
 ';',
 'and',
 'had',
 'lived',
 'nearly',
 'twenty-one',
 'years',
 'in',
 'the',
 'world',
 'with',
 'very',
 'little',
 'to',
 'distress',
 'or',
 'vex',
 'her',
 '.',
 'she',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'s",
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.',
 'her',
 'mother',
 'had',
 'died',
 'too',
 'long',
 'ago',
 'for',
 'her',
 'to',
 'have',
 'more',
 'than',
 'an',
 '

In [66]:
gen = sliding_window(2)(data)

In [67]:
test = [i for i in gen]
test

[['-LSB-', 'emma'],
 ['emma', 'by'],
 ['by', 'jane'],
 ['jane', 'austen'],
 ['austen', '1816'],
 ['1816', '-RSB-'],
 ['-RSB-', 'volume'],
 ['volume', 'i'],
 ['i', 'chapter'],
 ['chapter', 'i'],
 ['i', 'emma'],
 ['emma', 'woodhouse'],
 ['woodhouse', ','],
 [',', 'handsome'],
 ['handsome', ','],
 [',', 'clever'],
 ['clever', ','],
 [',', 'and'],
 ['and', 'rich'],
 ['rich', ','],
 [',', 'with'],
 ['with', 'a'],
 ['a', 'comfortable'],
 ['comfortable', 'home'],
 ['home', 'and'],
 ['and', 'happy'],
 ['happy', 'disposition'],
 ['disposition', ','],
 [',', 'seemed'],
 ['seemed', 'to'],
 ['to', 'unite'],
 ['unite', 'some'],
 ['some', 'of'],
 ['of', 'the'],
 ['the', 'best'],
 ['best', 'blessings'],
 ['blessings', 'of'],
 ['of', 'existence'],
 ['existence', ';'],
 [';', 'and'],
 ['and', 'had'],
 ['had', 'lived'],
 ['lived', 'nearly'],
 ['nearly', 'twenty-one'],
 ['twenty-one', 'years'],
 ['years', 'in'],
 ['in', 'the'],
 ['the', 'world'],
 ['world', 'with'],
 ['with', 'very'],
 ['very', 'little']

In [68]:
X_test = batch(s2i, test)

In [69]:
y_predict = model.predict(X_test)

In [70]:
import numpy as np
X_test = np.array(X_test)

In [71]:
y_test = np.array(y_predict)

In [73]:
X_test

array([[   0,    1],
       [   1,    2],
       [   2,    3],
       ..., 
       [ 454,  470],
       [ 470, 1486],
       [1486,   44]])

In [75]:
X_test = batch(i2s, list(X_test))

In [76]:
X_test = np.array(X_test)

In [77]:
result = np.hstack((X_test,y_test))

In [80]:
sorted(result, key=lambda x: x[2], reverse=False)

[array(['managing', 'a', '0.00010001745274789225'],
       dtype='<U32'), array(['and', 'isabella', '0.00010001781697999227'],
       dtype='<U32'), array(['and', 'isabella', '0.00010001781697999227'],
       dtype='<U32'), array(['and', 'isabella', '0.00010001781697999227'],
       dtype='<U32'), array(['and', 'isabella', '0.00010001781697999227'],
       dtype='<U32'), array(['and', 'isabella', '0.00010001781697999227'],
       dtype='<U32'), array(['and', 'isabella', '0.00010001781697999227'],
       dtype='<U32'), array(['and', 'isabella', '0.00010001781697999227'],
       dtype='<U32'), array(['and', 'isabella', '0.00010001781697999227'],
       dtype='<U32'), array(['<UNK>', 'pursued', '0.00010002351519562837'],
       dtype='<U32'), array(['grow', 'up', '0.00010002539197986258'],
       dtype='<U32'), array(['grow', 'up', '0.00010002539197986258'],
       dtype='<U32'), array(['grow', 'up', '0.00010002539197986258'],
       dtype='<U32'), array(['grow', 'up', '0.0001000253919798

In [82]:
model.predict(X_test[:100])

ValueError: invalid literal for int() with base 10: '<UNK>'