# Test Sum-base
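Draft version. This notebook rebuilds the vocabulary used during training, restores the saved summarization checkpoint, and prints generated summaries for sample WikiHow articles.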

In [32]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from nltk.corpus import stopwords
import time
import nltk
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
import logging
import os
from tensorboardX import SummaryWriter

In [2]:
# Load the WikiHow summary/text pairs; the path is relative to the notebook's working directory.
wikihow = pd.read_csv("data/clean_wikihow.csv")

In [3]:
# Preview the first rows to check that the `summary` and `text` columns used below are present.
wikihow.head()

Unnamed: 0,summary,text
0,sell yourself first,anything else stop sum artist think translate ...
1,read the classics before 1600,reading classics first thing well read want bu...
2,join online artist communities,depending scale intend sell art pieces may wan...
3,make yourself public,get best advertising publish example pieces ar...
4,blog about your artwork,given hundreds free blogging websites lot choi...


## Make sure the settings are the same as your training script

In [47]:
# Pull both columns out as plain Python lists; later cells index them by row position.
summaries = wikihow.loc[:, 'summary'].tolist()
texts = wikihow.loc[:, 'text'].tolist()
print(len(summaries), wikihow.shape)

# ------------------------------ CONFIG ------------------------------
# Row range of the dataset (alternative noted by the author: 0, 25000)
si = 0
ei = 120000
# Subset of the data used for training (alternative noted by the author: 0 .. 20000)
start = 0
end = 100000 + start

# Hyperparameters. The inference cell tiles each input `batch_size` times to fit
# the restored model's input, so these need to match the values used in training.
epochs = 100
batch_size = 128  # alternative noted by the author: 64
rnn_size = 256
num_layers = 2
learning_rate = 0.005
keep_probability = 0.75
# --------------------------------------------------------------------


1212030 (1212030, 2)


In [48]:
def count_words(count_dict, text):
    '''Tally word occurrences over an iterable of texts, updating count_dict in place.

    count_dict -- dict mapping word -> running count; new words are appended in
                  first-seen order.
    text       -- iterable of entries; each entry is passed through str() and
                  split on whitespace, so non-string entries are counted by
                  their string form.

    Returns None; the result is the mutated count_dict.
    '''
    for entry in text:
        for token in str(entry).split():
            count_dict[token] = count_dict.get(token, 0) + 1

In [49]:
# Build a single frequency table over the summaries and the article texts;
# its number of keys is the size of the raw vocabulary.
word_counts = {}

# Summaries are counted before texts: the key order of word_counts determines
# the ids assigned when the vocabulary is built from it.
for corpus in (summaries, texts):
    count_words(word_counts, corpus)

print("Size of Vocabulary:", len(word_counts))

Size of Vocabulary: 336845


In [7]:
# Read the ConceptNet Numberbatch (CN) word vectors into a {word: vector} lookup.
# CN is similar to GloVe, but probably better
# (https://github.com/commonsense/conceptnet-numberbatch)
# NOTE(review): the .txt release appears to begin with a "<count> <dim>" header line,
# which this loop would store as if it were a word - TODO confirm. Left unchanged
# because membership in this index feeds the vocabulary ids, which must match training.
embeddings_index = {}
with open('numberbatch-en-19.08.txt', encoding='utf-8') as vector_file:
    for line in vector_file:
        # First space-separated field is the word; the remaining fields are the vector components
        word, *components = line.split(' ')
        embeddings_index[word] = np.asarray(components, dtype='float32')

print('Word embeddings:', len(embeddings_index))

Word embeddings: 516783


In [50]:
# Count the words that are used more than `threshold` times but have no CN embedding.
# Note the comparison is strict (>), whereas the vocabulary cell keeps words with
# count >= threshold.
threshold = 10

missing_words = sum(
    1
    for word, count in word_counts.items()
    if count > threshold and word not in embeddings_index
)

# Scale to a percentage *before* rounding: round(x, 4) * 100 reintroduces binary
# floating-point noise (it printed 3.1199999999999997 instead of 3.12).
# An empty vocabulary reports 0.0 instead of raising ZeroDivisionError.
missing_ratio = round(100 * missing_words / len(word_counts), 2) if word_counts else 0.0

print("Number of words missing from CN:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

Number of words missing from CN: 10507
Percent of words that are missing from vocabulary: 3.1199999999999997%


In [51]:
# Limit the vocab that we will use to words that appear >= threshold times
# or that have a CN (ConceptNet Numberbatch) embedding.

# Dictionary to convert words to integers. Ids are handed out in the iteration
# order of word_counts, so they only match the trained model if word_counts was
# built the same way as in the training script.
vocab_to_int = {}
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = len(vocab_to_int)

# Special tokens that will be added to our vocab
codes = ["<UNK>","<PAD>","<EOS>","<GO>"]

# Add codes to vocab (they receive the ids after the last regular word)
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Dictionary to convert integers to words
int_to_vocab = {index: token for token, index in vocab_to_int.items()}

# Scale to a percentage before rounding; round(x, 4) * 100 can print
# floating-point noise such as 3.1199999999999997.
usage_ratio = round(100 * len(vocab_to_int) / len(word_counts), 2) if word_counts else 0.0

print("Total number of unique words:", len(word_counts))
print("Number of words we will use:", len(vocab_to_int))
print("Percent of words we will use: {}%".format(usage_ratio))

Total number of unique words: 336845
Number of words we will use: 102501
Percent of words we will use: 30.43%


## Test

In [52]:
# Lookup table mapping an English contraction to its expanded form, taken from
# http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# Keys are all lowercase; clean_text lowercases its input before consulting this table.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
}

def clean_text(text, remove_stopwords = True):
    '''Normalise raw text so that more of its words map onto known embeddings.

    Steps, in order: lowercase; expand whitespace-delimited tokens found in
    `contractions`; strip URLs, a few HTML fragments and punctuation; optionally
    drop NLTK English stopwords.

    text             -- a str (``.lower()`` is called on it directly).
    remove_stopwords -- when true, the result is re-tokenised on whitespace,
                        stopwords are removed and tokens are joined with single
                        spaces; when false, the spaces left by the substitutions
                        are kept as they are.

    Returns the cleaned string.
    '''
    # Lowercase, then swap each token that is a known contraction for its long form
    expanded = [contractions.get(token, token) for token in text.lower().split()]
    text = " ".join(expanded)

    # (pattern, replacement, flags), applied strictly in this order.
    # NOTE(review): by the time the '<br />' rule runs, the punctuation rule before
    # it has already turned '/' into a space, so '<br />' cannot match. The order is
    # kept as-is so preprocessing stays identical to what the model was trained on -
    # confirm against the training script before changing it.
    substitutions = (
        (r'https?:\/\/.*[\r\n]*', '', re.MULTILINE),   # URL and the rest of its line
        (r'\<a href', ' ', 0),
        (r'&amp;', '', 0),
        (r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', 0),   # punctuation -> space
        (r'<br />', ' ', 0),
        (r'\'', ' ', 0),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)

    if not remove_stopwords:
        return text

    # Optionally, remove stop words
    stops = set(stopwords.words("english"))
    return " ".join(token for token in text.split() if token not in stops)

# Fetch the NLTK stopword corpus that clean_text() reads through stopwords.words("english").
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jbu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [53]:
def text_to_seq(text):
    '''Turn raw text into the list of vocabulary ids the model consumes.

    The text is cleaned with clean_text() (default settings) and split on
    whitespace; each token is looked up in vocab_to_int, and tokens that are
    not in the vocabulary get the id of '<UNK>'.
    '''
    token_ids = []
    for token in clean_text(text).split():
        token_ids.append(vocab_to_int.get(token, vocab_to_int['<UNK>']))
    return token_ids

In [59]:
# Row indices of the examples to summarise
tests = list(range(30))

checkpoint = "checkpoints/best_model.ckpt"

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Rebuild the saved graph from its .meta file and restore the trained weights
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    # Tensors to feed / fetch, looked up by the names they carry in the saved graph
    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('text_length:0')
    summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')

    # Id of the padding token; it is filtered out of the generated summary below
    pad = vocab_to_int["<PAD>"]

    for t in tests:
        try:
            text = text_to_seq(texts[t])
            # Multiply by batch_size to match the model's input parameters.
            # The requested summary length is drawn from [5, 8) without a fixed seed,
            # so the output can differ between runs.
            answer_logits = sess.run(logits, {input_data: [text]*batch_size,
                                              summary_length: [np.random.randint(5,8)],
                                              text_length: [len(text)]*batch_size,
                                              keep_prob: 1.0})[0]

            # Remove the padding from the generated summary
            summary_ids = [i for i in answer_logits if i != pad]

            print('\n==================== Example {} ===================='.format(t))
            print('Original Text:')
            print(texts[t])
            print('\nOriginal Summary:')
            print(summaries[t])
            print('\nInput:')
            print('Word Ids: {}'.format(list(text)))
            print('{}'.format(" ".join([int_to_vocab[i] for i in text])))
            print('\nOutput Summary:')
            print('Word Ids: {}'.format(summary_ids))
            print('{}'.format(" ".join([int_to_vocab[i] for i in summary_ids])))
        except Exception as exc:
            # Stay best-effort (move on to the next example), but report the failure
            # instead of hiding it; KeyboardInterrupt is no longer swallowed.
            print('\nExample {} failed: {}: {}'.format(t, type(exc).__name__, exc))

INFO:tensorflow:Restoring parameters from checkpoints/best_model.ckpt

Original Text:
get best advertising publish example pieces around web show demonstrate style sure add watermark digital version protect artwork art thieves spend little time researching online ways artists promoting works like promotional strategies want avoid type research give lot ideas also alert potential pitfalls ways promote artwork bookmark sites artists selling online really inspire come back regularly see evolving succeeding seize power twitter facebook increase people knowledge tweet updates new paintings thoughts art news items art general facebook place photos artwork digitally watermarked photos receiving awards information art artwork general perhaps even critiques artwork

Original Summary:
make yourself public 

Input:
Word Ids: [109, 127, 17443, 184, 1004, 3852, 534, 9342, 1530, 7002, 183, 26, 268, 21254, 2937, 6020, 806, 17, 30, 23979, 549, 520, 322, 8606, 9, 539, 14962, 10398, 758, 115, 13920, 163


Original Text:
online art business needs built little little much putting together company indeed treat online art sales business – try make name known least give people hint artist develop services grow gain reputation time goes may feel like long time beginning solid patient foundations set good future well maintaining positive attitude attend many relevant art shows show work shows juried display awards art part profiles website backgrounds

Original Summary:
expect this to be a gradual process and do not expect to sell a lot right away 

Input:
Word Ids: [9, 30, 122, 514, 4089, 520, 520, 301, 2799, 843, 2842, 654, 1038, 9, 30, 10515, 122, 4972, 430, 12, 50, 2322, 314, 523, 113, 17459, 10, 180, 3427, 95, 1915, 15030, 322, 5410, 533, 475, 115, 1287, 322, 2184, 2969, 713, 17838, 481, 23, 722, 232, 1667, 498, 853, 1922, 839, 5041, 30, 107, 1530, 165, 107, 28932, 3483, 18650, 30, 1244, 12142, 463, 14686]
online art business needs built little little much putting together company indeed